# Import Packages

In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
import numpy as np

# Load Dataset

In [55]:
# Read the training set from `train.csv` (path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,7.65
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,6.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.9


# Data Preprocessing Pipeline
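* **KNN Imputer**: Replace missing values with values taken from the nearest neighbours (k-nearest-neighbours imputation). This step runs last in the pipeline, once every remaining feature is numeric.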
* **Ordinal Feature Encoder**: Convert categorical variables into numerical variables while keeping the order of categories.
* **Label Feature Encoder**: Convert categorical labels into numerical labels.
* **Feature Dropper**: Drops unnecessary features from the dataset.

In [56]:
class NullsImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with scikit-learn's ``KNNImputer``.

    Parameters
    ----------
    n_neighbors : int, default=1
        Forwarded to ``KNNImputer(n_neighbors=...)``.

    Attributes
    ----------
    imputer_ : KNNImputer
        The fitted imputer; only available after ``fit`` has been called.
    """

    def __init__(self, n_neighbors=1):
        # Keep the hyper-parameter under its own name and do nothing else here:
        # BaseEstimator.get_params() (used by clone, repr and GridSearchCV)
        # looks up `self.n_neighbors` and raises AttributeError if it is absent.
        self.n_neighbors = n_neighbors

    def fit(self, X, y=None):
        """Fit a fresh ``KNNImputer`` on ``X``; ``y`` is ignored. Returns ``self``."""
        # Built here rather than in __init__ so that set_params(n_neighbors=...)
        # takes effect on the next fit.
        self.imputer_ = KNNImputer(n_neighbors=self.n_neighbors)
        self.imputer_.fit(X)
        return self

    def transform(self, X):
        """Impute missing values in ``X``.

        Returns a DataFrame carrying ``X``'s columns and index when ``X`` is a
        DataFrame, otherwise the array returned by the underlying imputer.
        """
        X_imputed = self.imputer_.transform(X)
        if isinstance(X, pd.DataFrame):
            # The imputer returns a bare array; restore the labels.
            return pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
        return X_imputed

In [57]:
class OrdinalFeatureEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode ``X9``, ``X10`` and ``X11`` into 1-based ``*_encoded`` columns.

    The category order for each column is fixed in ``__init__``. Values the
    encoder does not recognise end up as ``NaN`` in the encoded columns.
    The source columns are left in place.
    """

    # Columns read by the encoder and the columns written by `transform`,
    # in matching order.
    _SOURCE_COLUMNS = ['X9', 'X10', 'X11']
    _ENCODED_COLUMNS = ['X9_encoded', 'X10_encoded', 'X11_encoded']

    def __init__(self):
        self.ordinal_encoder = OrdinalEncoder(
            categories=[
                ['Small', 'Medium', 'High'],
                ['Tier 1', 'Tier 2', 'Tier 3'],
                ['Grocery Store', 'Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3']
            ],
            handle_unknown='use_encoded_value',
            # Sentinel for unrecognised values; `transform` shifts it to -1
            # and then maps it to NaN.
            unknown_value=-2
        )

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the three source columns; ``y`` is ignored."""
        self.ordinal_encoder.fit(X[self._SOURCE_COLUMNS])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the three ``*_encoded`` columns added.

        The input frame is not modified.
        """
        # Shift the 0-based codes to 1-based; the -2 sentinel becomes -1.
        encoded_values = self.ordinal_encoder.transform(X[self._SOURCE_COLUMNS]) + 1
        # Map the shifted sentinel to NaN on the encoded values only, so a
        # genuine -1 in any other column of X is never touched.
        encoded_values[encoded_values == -1] = np.nan

        # Work on a copy so the caller's DataFrame keeps its original columns.
        X_out = X.copy()
        X_out[self._ENCODED_COLUMNS] = encoded_values
        return X_out

In [58]:
class LabelFeatureEncoder(BaseEstimator, TransformerMixin):
    """Label-encode ``X3``, ``X5`` and ``X7`` into ``*_encoded`` columns.

    ``X3`` is lower-cased and its abbreviations ``'lf'`` / ``'reg'`` are
    expanded to ``'low fat'`` / ``'regular'`` before encoding, both when
    fitting and when transforming.
    """

    def __init__(self):
        self.X3_label_encoder = LabelEncoder()
        self.X5_label_encoder = LabelEncoder()
        self.X7_label_encoder = LabelEncoder()

    @staticmethod
    def _normalize_x3(column):
        """Lower-case an ``X3`` Series and expand the 'lf' / 'reg' abbreviations."""
        return column.str.lower().replace({'lf': 'low fat', 'reg': 'regular'})

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per column; ``y`` is ignored. Returns ``self``."""
        self.X3_label_encoder.fit(self._normalize_x3(X['X3']))
        self.X5_label_encoder.fit(X['X5'])
        self.X7_label_encoder.fit(X['X7'])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``X3`` normalised and ``*_encoded`` columns added.

        The input frame is not modified.

        NOTE(review): per scikit-learn's documentation ``LabelEncoder.transform``
        raises ``ValueError`` for labels not seen during ``fit``; confirm that
        is acceptable for unseen data.
        """
        # Copy first so the caller's `X3` column is not overwritten.
        X_out = X.copy()
        X_out['X3'] = self._normalize_x3(X_out['X3'])
        X_out['X3_encoded'] = self.X3_label_encoder.transform(X_out['X3'])
        X_out['X5_encoded'] = self.X5_label_encoder.transform(X_out['X5'])
        X_out['X7_encoded'] = self.X7_label_encoder.transform(X_out['X7'])
        return X_out

In [59]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a DataFrame."""

    # Columns removed by `transform`; any that are absent are skipped silently.
    _DROPPED_COLUMNS = ('X1', 'X7', 'X10', 'X11', 'X3', 'X5', 'X9')

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` without the listed columns (missing ones are ignored)."""
        unwanted = list(self._DROPPED_COLUMNS)
        return X.drop(columns=unwanted, errors="ignore")

In [61]:
# Steps run in list order: the two encoders add numeric `*_encoded` columns,
# the dropper then removes the listed raw columns, and the imputer runs last.
pipeline = Pipeline(
    steps=[
        ("OrdinalFeatureEncoder", OrdinalFeatureEncoder()),
        ("LabelFeatureEncoder", LabelFeatureEncoder()),
        ("FeatureDropper", FeatureDropper()),
        ("NullsImputer", NullsImputer()),
    ]
)

# NOTE(review): `data` is rebound to the processed frame here, so re-running
# this cell without first re-running the load cell feeds already-processed
# data back into the pipeline.
# NOTE(review): the target column `Y` is part of `data` and therefore passes
# through every step, including the imputer - confirm this is intended.
data = pipeline.fit_transform(data)
data.head()

Unnamed: 0,X2,X4,X6,X8,Y,X9_encoded,X10_encoded,X11_encoded,X3_encoded,X5_encoded,X7_encoded
0,9.3,0.016047,249.8092,1999.0,8.23,2.0,1.0,2.0,0.0,4.0,9.0
1,5.92,0.019278,48.2692,2009.0,6.09,2.0,3.0,3.0,1.0,14.0,3.0
2,17.5,0.01676,141.618,1999.0,7.65,2.0,1.0,2.0,0.0,10.0,9.0
3,19.2,0.0,182.095,1998.0,6.6,1.0,3.0,1.0,1.0,6.0,0.0
4,8.93,0.0,53.8614,1987.0,6.9,3.0,3.0,2.0,0.0,9.0,1.0


In [62]:
# Summarise the dtypes and non-null counts of `data`, which at this point is
# the pipeline output assigned in the previous cell (e.g. to check that no
# missing values remain).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   X2           6000 non-null   float64
 1   X4           6000 non-null   float64
 2   X6           6000 non-null   float64
 3   X8           6000 non-null   float64
 4   Y            6000 non-null   float64
 5   X9_encoded   6000 non-null   float64
 6   X10_encoded  6000 non-null   float64
 7   X11_encoded  6000 non-null   float64
 8   X3_encoded   6000 non-null   float64
 9   X5_encoded   6000 non-null   float64
 10  X7_encoded   6000 non-null   float64
dtypes: float64(11)
memory usage: 515.8 KB
