# Data Preprocessing

## Initialization

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded automatically before each cell executes, so edits to local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder

import src.utils as utils

# Lift every pandas display cap so printed frames are shown in full:
# no row limit, no column limit, and no truncation of cell contents.
pd.set_option(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.max_colwidth", None,
)

In [4]:
from typing import cast

# Each split is stored as a pair of pickles under data/interim/: a feature
# table (X_*) and its target (y_*). `cast` only informs the type checker
# about what the deserializer is expected to return; it does no runtime check.

# Train
X_train, y_train = (
    cast(pd.DataFrame, utils.deserialize_data("data/interim/X_train.pkl")),
    cast(pd.Series, utils.deserialize_data("data/interim/y_train.pkl")),
)

# Valid
X_valid, y_valid = (
    cast(pd.DataFrame, utils.deserialize_data("data/interim/X_valid.pkl")),
    cast(pd.Series, utils.deserialize_data("data/interim/y_valid.pkl")),
)

# Test
X_test, y_test = (
    cast(pd.DataFrame, utils.deserialize_data("data/interim/X_test.pkl")),
    cast(pd.Series, utils.deserialize_data("data/interim/y_test.pkl")),
)

## Handling Data Duplication

In [None]:
def drop_duplicate_data(X: pd.DataFrame, y: pd.Series) -> tuple[pd.DataFrame, pd.Series]:
    """
    Removes duplicate rows from the feature set and synchronizes the target series.

    Duplicates are detected on the columns of X only (the first occurrence of
    each row is kept); the values of y are never compared. Neither input is
    modified: new objects are returned. Progress messages with the shapes
    before and after the operation are printed to stdout.

    :param X: The input features containing potential duplicates.
    :type X: pd.DataFrame
    :param y: The target values associated with X.
    :type y: pd.Series
    :raises TypeError: If X is not a pd.DataFrame or y is not a pd.Series
    :return: A tuple of (X, y) with duplicates removed.
    :rtype: tuple[pd.DataFrame, pd.Series]
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError(f"Expected pd.DataFrame, got {type(X).__name__}")
    if not isinstance(y, pd.Series):
        raise TypeError(f"Expected pd.Series, got {type(y).__name__}")

    print("Fungsi drop_duplicate_data: parameter telah divalidasi")
    print(f"Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah {X.shape}")

    # True for every row that repeats an earlier row; counting the mask avoids
    # materialising a frame of duplicates just to read its shape.
    duplicate_mask = X.duplicated()
    n_duplicates = int(duplicate_mask.sum())
    n_columns = X.shape[1]
    print(f"Fungsi drop_duplicate_data: shape dari data yang duplicate adalah {(n_duplicates, n_columns)}")

    expected_shape = (X.shape[0] - n_duplicates, n_columns)
    print(f"Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah {expected_shape}")

    # drop_duplicates returns a new frame, so the caller's X is left untouched.
    X_clean = X.drop_duplicates()

    if X.index.equals(y.index):
        # Same index on both sides: filter y by position with the same mask.
        # A label lookup here would return extra rows whenever the index
        # contains repeated labels, leaving y longer than X.
        y_clean = y.loc[~duplicate_mask.to_numpy()]
    else:
        # Indices differ (e.g. y is ordered differently): align by label.
        y_clean = y.loc[X_clean.index]

    print(f"Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah {X_clean.shape}")
    return X_clean, y_clean

# Dry run on the training split: both returned objects are discarded, so
# X_train / y_train are not reassigned here; only the printed shape report is used.
_, _ = drop_duplicate_data(X_train, y_train)

Fungsi drop_duplicate_data: parameter telah divalidasi
Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah (26064, 11)
Fungsi drop_duplicate_data: shape dari data yang duplicate adalah (96, 11)
Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah (25968, 11)
Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah (25968, 11)
