## Cleaning Checklist 
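Note: the argument "inplace=True" modifies the original DataFrame in place (and the call returns None). inplace=False [default] leaves the original untouched and returns a separate, modified DataFrame that you must assign to a variable.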

1. Profiling
2. Missing Data
3. Duplicate Data
4. Correct Data Types
5. Numeric Data
6. Text Data
7. Date and Time Data
8. Categorical Data
9. Verify

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset; a relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="df.csv")

Profiling

In [None]:
# Quick profile of the raw frame: head, random sample, schema, summary
# statistics, and value counts for every text/categorical column.

# Display the first few rows
print("First 5 rows:\n", df.head())

# Random sample (capped at the frame length so tiny frames do not raise).
# Rows differ on every run; pass random_state=<int> for a repeatable sample.
print("\nSample of Data:\n", df.sample(n=min(5, len(df))))

# Summary of the DataFrame, including data types and non-null counts.
# df.info() writes its report straight to stdout and returns None, so it is
# called on its own line; wrapping it in print() would append a stray "None".
print("\nDataFrame Info:")
df.info()

# Descriptive statistics for numerical columns
print("\nNumerical Statistics:\n", df.describe())

# Value counts for text/categorical columns to understand distributions.
# dropna=False keeps missing values as their own row in the counts.
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"\nValue Counts for {col}:\n", df[col].value_counts(dropna=False))

First 5 rows:
    age   sex   trx  week  wbc  rbc adverse_effects  num_effects
0   62  male  Drug     0  7.3  5.1              No            0
1   62  male  Drug     1  NaN  NaN              No            0
2   62  male  Drug    12  5.6  5.0              No            0
3   62  male  Drug    16  NaN  NaN              No            0
4   62  male  Drug     2  6.6  5.1              No            0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16103 entries, 0 to 16102
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              16103 non-null  int64  
 1   sex              16103 non-null  object 
 2   trx              16103 non-null  object 
 3   week             16103 non-null  int64  
 4   wbc              9128 non-null   float64
 5   rbc              9127 non-null   float64
 6   adverse_effects  16103 non-null  object 
 7   num_effects      16103 non-null  int64  
dtypes: float64(2), int64(3), obj

Missing Values

In [None]:
# Identify missing values (count of NaN per column)
print("Missing Values:\n", df.isna().sum())

# NOTE: Options 1-5 each RETURN a new object and leave df unchanged.
# Pick the option you need and assign its result, e.g.
#   df['col'] = df['col'].fillna(...)
# They are left unassigned here so that the alternatives do not stack on top
# of each other (and so Option 7 still sees the original missing values).

# Option 1: Impute numerical missing values with the mean or median
df['numerical_column'].fillna(df['numerical_column'].mean())

# Option 2: Impute string missing values with a custom string
df['hair_color'].fillna('baldy')

# Option 3: Forward fill missing values (use with caution, especially for time series)
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
df['another_column'].ffill()

# Option 4: Backward fill missing values (use with caution)
# .bfill() replaces the deprecated fillna(method='bfill') spelling.
df['yet_another_column'].bfill()

# Option 5: Remove entire rows. A: with any missing values or
df.dropna()
# B: ONLY where ALL cells in the row are NaN
df.dropna(how='all')

# Option 6: Remove columns with too many missing values (e.g., > 70% missing)
missing_percentage = (df.isna().sum() / len(df)) * 100
cols_to_drop = missing_percentage[missing_percentage > 70].index
# df.drop(cols_to_drop, axis=1)

# Option 7: Flag missing values (create a new binary column: 1 = was missing)
# Create the flag BEFORE imputing, otherwise every value will be 0.
df['numerical_column_missing'] = df['numerical_column'].isna().astype(int)

Duplicate Data

In [None]:
# How many rows are exact repeats of an earlier row?
repeat_count = df.duplicated().sum()
print("Number of duplicate rows:", repeat_count)

# Show every member of each duplicate group
# (keep=False also marks the first occurrence, not just the later copies)
every_copy = df.loc[df.duplicated(keep=False)]
print("\nDuplicate Rows:\n", every_copy)

# The three calls below return new frames; assign the one you want to keep.

# De-duplicate, retaining the first row of each group (keep='first' is the default)
df.drop_duplicates(keep='first')

# De-duplicate, retaining the last row of each group
df.drop_duplicates(keep='last')

# De-duplicate using only a subset of columns to decide what counts as a repeat
df.drop_duplicates(subset=['column1', 'column2'])

# Keep only rows that never repeat anywhere in the frame:
# keep=False drops ALL occurrences of every duplicate group.
df_truly_unique = df.drop_duplicates(keep=False)

Correct Data Types

In [None]:
# Check current data types
print("\nCurrent Data Types:\n", df.dtypes)

# Convert a column to a different data type.
# errors='coerce' turns values that cannot be parsed into NaN instead of raising.
df['numerical_column'] = pd.to_numeric(df['numerical_column'], errors='coerce')

# Numbers stored as text with thousands separators (e.g. "1,234"):
# strip the commas, then convert to numeric. The result is assigned back;
# without the assignment the cleaned values are simply thrown away.
df['thousands_seperator'] = pd.to_numeric(
    df['thousands_seperator'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Dates: values that do not match the given format become NaT
df['date_column'] = pd.to_datetime(df['date_column'], format='%Y-%m-%d', errors='coerce')
df['categorical_column'] = df['categorical_column'].astype('category')
df['binary_column'] = df['binary_column'].astype('boolean') # Assuming 0/1 or True/False

# Yes/no text -> nullable boolean. Values outside the mapping become <NA>.
df['adverse_effects'] = df['adverse_effects'].str.lower().str.strip().map({'no': False, 'yes':True}).astype('boolean') # or similar

# Convert a column to string.
# The nullable 'string' dtype keeps missing values as <NA>, whereas
# astype(str) would turn NaN into the literal text 'nan'.
df['id_column'] = df['id_column'].astype('string')

# Verify the changes
print("\nUpdated Data Types:\n", df.dtypes)

# why boolean?
# Memory Efficiency: one byte per value plus a one-byte-per-value missing-data mask
#   (not a single bit), which is still far smaller than object-dtype 'yes'/'no' strings.
# Clarity and Correctness: It accurately represents the true/false nature of the data, which is what 'yes'/'no' logically imply.
# Handles Missing Values: Unlike NumPy's bool dtype, pandas' boolean dtype handles missing values (pd.NA).
#   If your data might have blanks or unknown responses, it won't coerce them to True or False.
# Direct Logical Operations: You can perform logical operations (&, |, ~) directly on these columns.

# why categorical?
# Memory Efficiency
# Performance Improvement
# Statistical Signaling
# Optional Defined Order (Ordinal Data): Categorical data can be ordered or unordered.


Numeric Data

In [None]:
# Negative data that shouldn't be negative: flip the sign.
# NOTE(review): abs() assumes the magnitude is correct and only the sign is
# wrong; confirm that holds for your data before using it.
df['numerical_column'] = df['numerical_column'].abs()

# Outlier fences for a numerical column using the IQR (interquartile range) method
Q1 = df['numerical_column'].quantile(0.25)
Q3 = df['numerical_column'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of outlying rows, computed once and reused below.
# Comparisons with NaN are False, so rows with a missing value are not flagged.
is_outlier = (df['numerical_column'] < lower_bound) | (df['numerical_column'] > upper_bound)

# Identify outliers
outliers = df[is_outlier]
print("\nOutliers in numerical_column:\n", outliers)

# Box plot for a visual check of the spread
sns.catplot(kind='box', data=df, y='numerical_column')
plt.title('numerical_column')
plt.show()

# Option 1: Remove outliers / data that falls outside of a desired min & max
df_no_outliers = df[~is_outlier]

# Option 2: Cap or floor outliers, replacing them with the lower/upper bound value
df['numerical_column_capped'] = df['numerical_column'].clip(lower=lower_bound, upper=upper_bound)

# Option 3: Transform the data (e.g., log transformation for skewed data)
# numpy is already imported as np in the notebook's import cell.
# df['numerical_column_log'] = np.log(df['numerical_column']) # Be mindful of zero or negative values


Text Data

In [None]:
# Convert case
# The three lines below are alternatives: choose ONE. Run as written, each
# overwrites the previous result, so the column ends up in title case.
df['text_column'] = df['text_column'].str.lower()
df['text_column'] = df['text_column'].str.upper()
df['text_column'] = df['text_column'].str.title()

# Remove leading/trailing whitespace (whitespace inside the text is kept)
df['text_column'] = df['text_column'].str.strip()

# Remove punctuation
import string

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, since the function is applied per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : object
        Value from a DataFrame cell. Only ``str`` values are modified.

    Returns
    -------
    object
        The cleaned string when ``text`` is a ``str``; otherwise ``text``
        itself, unchanged (e.g. NaN, None, or numbers pass straight through).
    """
    if isinstance(text, str):
        return text.translate(_PUNCTUATION_TABLE)
    return text
# Apply the punctuation remover to every cell (non-string cells pass through unchanged)
df['text_column'] = df['text_column'].apply(remove_punctuation)

# Replace entire matching cell value, regardless of data type
# (Series.replace matches whole values, not substrings)
df['column'] = df['column'].replace('a', 'b')

# Remove special characters: drop anything that is not a letter, digit, or whitespace
df['text_column'] = df['text_column'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Correct spelling errors (can be complex, often involves libraries like `fuzzywuzzy` or manual mapping)
# Example of simple literal substring replacement (regex=False: the pattern is plain text):
df['text_column'] = df['text_column'].str.replace('misspelled', 'correct', regex=False)

# Handle inconsistent formatting (e.g., different ways of writing the same thing)
# regex=False matters here: as a regex, the '.' in 'St.' would match any character.
df['city_column'] = df['city_column'].str.replace('St.', 'Saint', regex=False).str.strip()

# Split the 'Name' column into 'First Name' and 'Last Name'.
# n=1 splits on the first comma only, so a value containing several commas
# still yields at most two columns; without it the two-column assignment
# raises as soon as one name has an extra comma.
# Note: if NO value contains a comma the result has a single column and the
# assignment still fails, so check the data first.
df[['First Name', 'Last Name']] = df['Name'].str.split(',', n=1, expand=True)

Date & Time Data

In [None]:

# Parse the column as dates written like 2024-01-31 (format '%Y-%m-%d').
# Any value that does not match that format becomes NaT instead of raising.
df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce', format='%Y-%m-%d')

# Datetime accessor for the parsed column, reused for every component below
stamp = df['date_column'].dt

# Calendar components
df['year'] = stamp.year
df['month'] = stamp.month
df['day'] = stamp.day
df['day_of_week'] = stamp.day_name()

# Clock components (if present)
df['hour'] = stamp.hour
df['minute'] = stamp.minute
df['second'] = stamp.second

# Time zones (if applicable): localize naive timestamps, then convert
# df['datetime_utc'] = df['datetime_column'].dt.tz_localize('UTC')
# df['datetime_local'] = df['datetime_utc'].dt.tz_convert('US/Pacific')

Categorical Data

In [None]:
# Check distinct values for inconsistencies, use .value_counts for how often they appear
print("\nDistinct values in categorical_column:\n", df['categorical_column'].unique())

# Defining categories (stand-alone toy example).
# The example lives in its own variable: binding it to `df` would replace the
# dataset being cleaned, so every later line in this notebook (including the
# Verify section) would run against the fruit table instead.
fruit_df = pd.DataFrame({
    'fruit': ['apple', 'banana', 'orange', 'apple', 'banana'],
    'size': ['small', 'large', 'medium', 'small', 'large']})
fruit_df['fruit'] = fruit_df['fruit'].astype('category')
fruit_df['size'] = fruit_df['size'].astype('category')

# You can also specify it during Series creation
s_cat = pd.Series(['A', 'B', 'A', 'C'], dtype='category')

# Standardize categories (e.g., 'USA', 'U.S.A.', 'United States' to 'USA')
df['country_column'] = df['country_column'].replace(['U.S.A.', 'United States'], 'USA')

# Group less frequent categories into an 'Other' category (to reduce dimensionality)
value_counts = df['category_column'].value_counts()
# Same counts expressed as a share of all rows, handy when choosing a threshold
value_counts_proportion = df['category_column'].value_counts(normalize=True)
infrequent_categories = value_counts[value_counts < 10].index # Example threshold
df['category_cleaned'] = df['category_column'].apply(lambda x: 'Other' if x in infrequent_categories else x)

# Create categorical data type column
df['categorical_column'] = df['categorical_column'].astype('category')

# why categorical?
# Memory Efficiency
# Performance Improvement
# Statistical Signaling
# Optional Defined Order (Ordinal Data): Categorical data can be ordered or unordered.

Verify

In [None]:
# Re-run descriptive statistics and info.
# df.info() prints its own report and returns None, so it is called on its own
# line; passing it to print() would append a stray "None" after the header.
print("\nCleaned DataFrame Info:")
df.info()
print("\nCleaned Numerical Statistics:\n", df.describe())

# Check for remaining missing values or duplicates
print("\nMissing Values After Cleaning:\n", df.isna().sum())
print("\nNumber of Duplicate Rows After Cleaning:", df.duplicated().sum())

# Sample the cleaned data to visually inspect
# (capped at the frame length so a heavily filtered frame does not raise)
print("\nSample of Cleaned Data:\n", df.sample(min(5, len(df))))

# Uniqueness check for categorical columns
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"\nValue Counts for {col}:\n", df[col].value_counts(dropna=False))