# Automating Data Cleaning with a 5 steps pipeline

**Defining a pipeline to automate the data cleaning.**


In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder 
import numpy as np 
import os


## STEP 1 - READING THE DATA

In [2]:
# Function to read data based on file extension
def read_data(file_path):
    _ , file_ext = os.path.splitext(file_path)
    if file_ext == '.csv':
        return pd.read_csv(file_path)
    elif file_ext == '.json':
        return pd.read_json(file_path)
    elif file_ext in ['.xls', '.xlsx']:
        return pd.read_excel(file_path)
    else:
        raise ValueError("Unknown file format")

## STEP 2 - DEALING WITH DUPLICATES

In [3]:
# 2. Check if there are duplicates
def drop_duplicates(df, columns=None): 
	if columns == None: 
		df.drop_duplicates(inplace=True) 
	else: 
		df.drop_duplicates(subset = columns, inplace=False)
	return df 

## STEP 3 - DEALING WITH MISSING VALUES

In [4]:
def check_missing_data(df):
    # Check for missing values
    proportion_null_rows = 100*(round(df.isnull().any(axis=1).sum()/df.any(axis=1).count(),2))
    if proportion_null_rows <= 5:
        print(f"There are {df.isnull().any(axis=1).sum()} rows with a null value. All of them are erased!")
        df.dropna()
    else:
        print("Too many null values, we need to check columns by columns further.")
        if df.isnull().sum().sum() > 0:
            print("\nProportion of missing values by column")
            values = 100*(round(df.isnull().sum()/df.count(),2))
            print(values)
            dealing_missing_data(df)
        else:
            print("No missing values detected!")
            

def dealing_missing_data(df):
    values = 100*(round(df.isnull().sum()/df.count(),2))
    to_delete = []
    to_impute = []
    to_check = []
    for name, proportion in values.items():
        if int(proportion) == 0:
            continue
        elif int(proportion) <= 10:
            to_impute.append(name)
            df.fillna(df[name].median()) 
        else: 
            to_check.append(name)
    print(f"\nThe missing values in {to_impute} have been replaced by the median.")
    print(f"The columns {to_check} should be further understood")
    


## STEP 4 - DETECTING DATA TYPES MISMATCHES

In [5]:
# define the expected types
expected_types = {'recipe': 'int64', 
                  'calories': 'float64', 
                  'carbohydrate': 'float64',
                  'sugar': 'float64', 
                  'protein': 'float64',
                  'category': 'str', 
                  'servings': 'int64',
                  'high_traffic': 'bool'               
                  }

# detect type mismatches
def check_data_types(df, expected_types):
    """
    Check the data types of a DataFrame against expected types.

    Parameters:
    - df (pd.DataFrame): The DataFrame to check.
    - expected_types (dict): A dictionary mapping column names to expected data types (e.g., 'int', 'float', 'datetime').

    Returns:
    - dict: A report of mismatches and suggested corrections.
    """
    for column, expected_type in expected_types.items():
        actual_type = df[column].dtype

        # Create a readable version of numpy dtype for reporting
        readable_type = np.dtype(actual_type).name
        if not np.issubdtype(actual_type, np.dtype(expected_type).type):
            message = f"Column '{column}' has type '{readable_type}' instead of '{expected_type}'."
            suggestion = f"Convert '{column}' to '{expected_type}'."
            print(f"{message}", f"{suggestion}")

    print("No data types mismatch detected")

## STEP 5 - DETECTING OUTLIERS

In [6]:
# Function to find outliers using IQR
def find_outliers_IQR(df):
    outlier_indices = []
    df = df.select_dtypes(include=['number'])
    for column in df.columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Get the indices of outliers for feature column
        outlier_list_col = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
        outlier_indices.extend(outlier_list_col)
    
    outlier_indices = list(set(outlier_indices))  # Get unique indices
    return df.iloc[outlier_indices]


# FINAL PIPELINE

In [7]:
file_path = "data/my_data.csv"

df = read_data("data/my_data.csv")

df = drop_duplicates(df, columns=["recipe","calories"])

check_missing_data(df)

check_data_types(df, expected_types)

outliers = find_outliers_IQR(df)
print("Outliers detected using IQR:")
print(outliers)

Too many null values, we need to check columns by columns further.

Proportion of missing values by column
recipe           0.0
calories         6.0
carbohydrate     6.0
sugar            6.0
protein          6.0
category         0.0
servings         0.0
high_traffic    65.0
dtype: float64

The missing values in ['calories', 'carbohydrate', 'sugar', 'protein'] have been replaced by the median.
The columns ['high_traffic'] should be further understood
Column 'category' has type 'object' instead of 'str'. Convert 'category' to 'str'.
Column 'servings' has type 'object' instead of 'int64'. Convert 'servings' to 'int64'.
Column 'high_traffic' has type 'object' instead of 'bool'. Convert 'high_traffic' to 'bool'.
No data types mismatch detected
Outliers detected using IQR:
     recipe  calories  carbohydrate  sugar  protein
513     514      2.98          9.81  28.58    19.11
3         4     97.03         30.56  38.63     0.02
518     519    161.81         16.80  30.56     0.14
520     521   