# **Data Preparation**

## Library Import

In [2]:
# Imports
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

## Dataset

In [3]:
# Dataset Paths
current_dir = os.path.dirname(os.path.abspath("__file__"))
train_path = os.path.join(current_dir, "../data/train.csv")
test_path = os.path.join(current_dir, "../data/test.csv")

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

## Data Cleaning

In [None]:
# Data Train Cleaning

def clean_train_data(df):
    # Remove duplicates
    df = df.drop_duplicates()

    # Handle missing values
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    numeric_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
    df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

    # Deal with outliers (e.g., capping outliers to the 1st and 99th percentiles)
    for col in numeric_cols:
        lower_bound = df[col].quantile(0.01)
        upper_bound = df[col].quantile(0.99)
        df[col] = np.clip(df[col], lower_bound, upper_bound)

    return df

cleaned_train_df = clean_train_data(train_df)

In [13]:
# Data Test Cleaning

def clean_test_data(df):
    # Remove duplicates
    df = df.drop_duplicates()

    # Handle missing values
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    numeric_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
    df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

    # Deal with outliers (e.g., capping outliers to the 1st and 99th percentiles)
    for col in numeric_cols:
        lower_bound = df[col].quantile(0.01)
        upper_bound = df[col].quantile(0.99)
        df[col] = np.clip(df[col], lower_bound, upper_bound)

    return df

cleaned_test_df = clean_test_data(test_df)

## Data Preprocessing

In [14]:
# Data Train Preprocessing 
def preprocess_train_data(df):
    # Separate features and target
    y = df['label']
    X = df.drop(columns=['label', 'id'])

    # Scale numeric features
    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()
    X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    # Encode categorical features
    categorical_cols = X.select_dtypes(include=['object']).columns
    encoders = {}
    for col in categorical_cols:
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col])
        encoders[col] = encoder

    # Handle imbalanced dataset
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    return X_resampled, y_resampled, scaler, encoders

X_train, y_train, scaler, encoders = preprocess_train_data(cleaned_train_df)

In [15]:
# Data Test Preprocessing 
def preprocess_test_data(df, scaler, encoders):
    # Separate features
    X = df.drop(columns=['label', 'id'])

    # Scale numeric features
    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    X[numeric_cols] = scaler.transform(X[numeric_cols])

    # Encode categorical features
    categorical_cols = X.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col in encoders:
            encoder = encoders[col]
            X[col] = encoder.transform(X[col])

    return X

X_test = preprocess_test_data(cleaned_test_df, scaler, encoders)

KeyError: "['label'] not found in axis"