Experiment - 6 

Implement missing-value handling and outlier handling (data preprocessing techniques) on the dataset imported in Lab 4 or on any other dataset.

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Function to handle missing values
def handle_missing_values(data, strategy='mean'):
    """
    Return a new frame with missing values filled according to ``strategy``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is not modified; a new object is returned.
    strategy : {'mean', 'median', 'mode'}, default 'mean'
        - 'mean': fill each numeric column with its mean.
        - 'median': fill each numeric column with its median.
        - 'mode': fill each column with its most frequent value
          (the first row of ``data.mode()``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the fillable missing values replaced.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names.

    Notes
    -----
    'mean' and 'median' are computed with ``numeric_only=True``, so missing
    values in non-numeric columns are left as they are. A column without any
    observed value has no statistic to fill with and also stays missing.
    """
    if strategy == 'mean':
        return data.fillna(data.mean(numeric_only=True))

    if strategy == 'median':
        return data.fillna(data.median(numeric_only=True))

    if strategy == 'mode':
        modes = data.mode()
        # mode() has zero rows when no column contains an observed value
        # (empty frame or all-NaN data); indexing row 0 would then raise
        # IndexError, so there is simply nothing to fill.
        if modes.empty:
            return data.copy()
        return data.fillna(modes.iloc[0])

    raise ValueError("Invalid strategy! Use 'mean', 'median', or 'mode'.")

In [5]:
def handle_outliers(data, multiplier=1.5):
    """
    Cap outliers in the numeric columns using the IQR (interquartile range) rule.

    For every numeric column the bounds are
    ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``; values outside
    that interval are clipped to the nearest bound. Non-numeric columns are
    passed through untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is NOT modified; the result is a new frame.
    multiplier : float, default 1.5
        Width of the accepted band expressed in IQR units.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose numeric columns have been capped.
    """
    # Work on a copy so the caller's frame is not silently changed.
    result = data.copy()

    # Select numeric columns only
    numeric_cols = data.select_dtypes(include=[np.number])
    if numeric_cols.shape[1] == 0:
        # Nothing to cap.
        return result

    q1 = numeric_cols.quantile(0.25)
    q3 = numeric_cols.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # axis=1 aligns the per-column bounds (Series indexed by column name)
    # with the frame's columns.
    result[numeric_cols.columns] = numeric_cols.clip(
        lower=lower_bound, upper=upper_bound, axis=1
    )

    return result

In [6]:
# Demo: load the CSV, impute missing values, then cap outliers.
if __name__ == "__main__":
    # Location of the input data (relative to the notebook's working directory)
    file_path = 'sample.csv'

    # Read the raw data
    dataset = pd.read_csv(file_path)

    # Per-column count of missing cells before any cleaning
    print("Missing values before handling:", dataset.isnull().sum(), sep="\n")

    # Impute with the column mean (affects numeric columns only)
    dataset_cleaned = handle_missing_values(dataset, strategy='mean')

    # Same count again, to show what the imputation changed
    print("\nMissing values after handling:", dataset_cleaned.isnull().sum(), sep="\n")

    # Cap extreme values in the numeric columns
    dataset_no_outliers = handle_outliers(dataset_cleaned)

    # Summary statistics of the final frame
    print("\nDataset statistics after handling outliers:", dataset_no_outliers.describe(), sep="\n")

Missing values before handling:
Series_reference        0
Period                  0
Data_value             41
Suppressed          44004
STATUS                  0
UNITS                   0
Magnitude               0
Subject                 0
Group                   0
Series_title_1          0
Series_title_2      22434
Series_title_3      44004
Series_title_4      44004
Series_title_5      44004
dtype: int64

Missing values after handling:
Series_reference        0
Period                  0
Data_value              0
Suppressed          44004
STATUS                  0
UNITS                   0
Magnitude               0
Subject                 0
Group                   0
Series_title_1          0
Series_title_2      22434
Series_title_3      44004
Series_title_4      44004
Series_title_5      44004
dtype: int64

Dataset statistics after handling outliers:
             Period    Data_value  Suppressed  Magnitude  Series_title_3  \
count  44004.000000  44004.000000         0.0    44004.0     