# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Data Preprocessing Pipeline

In [2]:
def data_preprocessing_pipeline(data):
    """Clean and standardize a DataFrame without modifying the caller's frame.

    Steps, in order:

    1. Numeric columns (``float``/``int`` dtypes): fill missing values with
       the column mean, replace IQR outliers (values outside
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``) with the column mean, then
       standardize the columns with ``StandardScaler``.
    2. Categorical columns (``object`` dtype): fill missing values with the
       column mode, or with ``'Unknown'`` when the whole column is missing.

    The detected numeric and categorical column names are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input. It is copied up front, so the caller's frame is left as-is.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed DataFrame with the same columns and index.
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # the DataFrame the caller passed in (and any earlier view of it).
    data = data.copy()

    # Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    print("Numeric Features:", numeric_features)
    print("Categorical Features:", categorical_features)

    # Skip every numeric step when there is nothing numeric to process.
    if len(numeric_features) > 0:
        # Fill missing numeric values with the mean of each column
        data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

        # Detect and handle outliers in numeric features using IQR
        for feature in numeric_features:
            Q1 = data[feature].quantile(0.25)  # 25th percentile
            Q3 = data[feature].quantile(0.75)  # 75th percentile
            IQR = Q3 - Q1  # Interquartile range
            lower_bound = Q1 - (1.5 * IQR)  # Lower bound for outliers
            upper_bound = Q3 + (1.5 * IQR)  # Upper bound for outliers

            # Replace outliers with the column mean. The mean is taken over the
            # imputed column as it stands, i.e. it still includes the outliers.
            is_outlier = (data[feature] < lower_bound) | (data[feature] > upper_bound)
            data[feature] = np.where(is_outlier, data[feature].mean(), data[feature])

        # Standardize numeric features. StandardScaler rejects input with zero
        # rows, so an empty frame is returned unscaled instead of raising.
        if len(data) > 0:
            scaler = StandardScaler()
            data[numeric_features] = scaler.fit_transform(data[numeric_features])

    # Handle missing values in categorical features
    for column in categorical_features:
        if not data[column].isna().all():
            # At least one real value exists: fill gaps with the most frequent
            # one (mode() may return several values; the first is used).
            mode_value = data[column].mode()[0]
            data[column] = data[column].fillna(mode_value)
        else:
            # The entire column is missing, so there is no mode to use
            data[column] = data[column].fillna('Unknown')

    # Return the preprocessed copy
    return data

# Load the Dataset

In [3]:
# Load the raw dataset from "data.csv" (path is relative to the working directory)
data = pd.read_csv("data.csv")

In [4]:
# Preview the first rows of the raw dataset before any preprocessing
data.head()

Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,1.0,7,A
1,2.0,8,B
2,,9,
3,4.0,10,A
4,5.0,11,B


# Apply the Preprocessing Pipeline

In [5]:
# Perform data preprocessing.
# A copy is passed so the raw `data` frame stays available unchanged for other
# cells, and so re-running this cell always starts from the raw values.
cleaned_data = data_preprocessing_pipeline(data.copy())

Numeric Features: Index(['NumericFeature1', 'NumericFeature2'], dtype='object')
Categorical Features: Index(['CategoricalFeature'], dtype='object')


In [6]:
# Display the preprocessed DataFrame returned by the pipeline
cleaned_data

Unnamed: 0,NumericFeature1,NumericFeature2,CategoricalFeature
0,-1.535624,-1.09937,A
1,-0.944999,-0.749128,B
2,0.0,-0.398886,A
3,0.23625,-0.048645,A
4,0.826874,0.301597,B
5,1.417499,1.994431,C
