# Manual Label Encoding Experiments

In [1]:
# Importing Libraries 
import os, sys
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 

import warnings
# NOTE: each filterwarnings() call inserts its rule at the front of the filter list,
# so the later 'ignore' rule takes precedence and every warning is suppressed for the
# rest of the notebook; the 'always' rule therefore has no visible effect.
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Raw splits, read before any label encoding is applied
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

target_column_name = 'Item_Outlet_Sales'

# For each split: keep the target as its own series and everything else as inputs
target_feature_train_df = train_df[target_column_name]
input_feature_train_df = train_df.drop(columns=target_column_name)

target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df.drop(columns=target_column_name)


# function to get numerical and categorical columns 
def Numerical_categorical_column(dataframe):
    '''
    Split the column names of `dataframe` by dtype.

    A column whose dtype equals object ('O') is reported as categorical;
    every other column is reported as numerical. Column order is preserved.

    return numerical_columns, categorical_columns (two lists of column names)
    '''
    numerical_columns, categorical_columns = [], []
    for column in dataframe.columns:
        holds_objects = dataframe[column].dtype == 'O'
        bucket = categorical_columns if holds_objects else numerical_columns
        bucket.append(column)
    return numerical_columns, categorical_columns



# Reference year from which the outlet age is computed in this notebook
REFERENCE_YEAR = 2013

# Different spellings of the same fat-content label -> canonical label
FAT_CONTENT_ALIASES = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}


def _clean_features(features, reference_year=REFERENCE_YEAR):
    """
    Return a cleaned copy of a raw input-feature frame (the input is not modified).

    Steps, applied identically to the train and the test split:
      1. Normalise the alternative spellings in 'Item_Fat_Content'.
      2. Keep only the first two characters of 'Item_Identifier'.
      3. Add 'Outlet_age' = reference_year - 'Outlet_Establishment_Year'.
      4. Set 'Item_Fat_Content' to 'Non Edible' for rows whose identifier prefix is 'NC'.
      5. Drop 'Outlet_Establishment_Year', 'Item_Type' and 'Outlet_Identifier'.
    """
    cleaned = features.copy()

    cleaned['Item_Fat_Content'] = cleaned['Item_Fat_Content'].replace(FAT_CONTENT_ALIASES)

    # .str slicing leaves missing identifiers as NaN instead of raising like x[:2] would
    cleaned['Item_Identifier'] = cleaned['Item_Identifier'].str[:2]

    # Always derive the age from the frame being cleaned, never from another frame
    cleaned['Outlet_age'] = reference_year - cleaned['Outlet_Establishment_Year']

    # 'NC' items get their own fat-content category
    cleaned.loc[cleaned['Item_Identifier'] == 'NC', 'Item_Fat_Content'] = 'Non Edible'

    # Item_Type / Outlet_Identifier are dropped for now; they may be revisited later
    return cleaned.drop(columns=['Outlet_Establishment_Year', 'Item_Type', 'Outlet_Identifier'])


input_feature_train_df = _clean_features(input_feature_train_df)
input_feature_test_df = _clean_features(input_feature_test_df)

# Column groups are derived from the training split and reused for the test split
numerical_features, categorical_features = Numerical_categorical_column(input_feature_train_df)


# Preprocessing: numerical and categorical columns each get their own copy of the
# same two-step pipeline (KNN imputation, then standard scaling)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


def _impute_then_scale():
    """Build a fresh, unfitted pipeline: 3-neighbour KNN imputation followed by StandardScaler."""
    knn_imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
    return Pipeline(steps=[('imputer', knn_imputer), ('scaler', StandardScaler())])


num_pipeline = _impute_then_scale()
cat_pipeline = _impute_then_scale()

# Output columns follow the transformer order: numerical features first, then categorical
preprocessing = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_features),
    ('cat_pipeline', cat_pipeline, categorical_features),
])


In [3]:
# Take explicit copies: the categorical frame is label-encoded in place afterwards, and
# mutating a plain column selection of input_feature_train_df triggers
# SettingWithCopyWarning / ambiguous view-vs-copy behaviour.
train_num_df = input_feature_train_df[numerical_features].copy()
train_cat_df = input_feature_train_df[categorical_features].copy()


In [4]:
import numpy as np


def encoding(dataframe):
    """
    Label-encode the known categorical columns of `dataframe` in place.

    Every column that has a lookup table below is replaced by its integer codes
    via ``Series.map``. For any other column the message "Not Able to Encode"
    is printed and the column is left unchanged. The same (mutated) dataframe
    object is returned.
    """
    code_tables = {
        "Item_Identifier": {'DR': 0, 'FD': 1, 'NC': 2},
        "Item_Fat_Content": {'Low Fat': 0, 'Non Edible': 1, 'Regular': 2},
        # missing sizes get the placeholder code 3
        "Outlet_Size": {'High': 0, 'Medium': 1, 'Small': 2, np.nan: 3},
        "Outlet_Location_Type": {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2},
        "Outlet_Type": {
            'Grocery Store': 0,
            'Supermarket Type1': 1,
            'Supermarket Type2': 2,
            'Supermarket Type3': 3,
        },
    }

    for column in dataframe.columns:
        codes = code_tables.get(column)
        if codes is None:
            print("Not Able to Encode")
            continue
        dataframe[column] = dataframe[column].map(codes)

    return dataframe
# encoding() maps a missing Outlet_Size to the placeholder code 3; turn that code back
# into NaN so the KNN imputer in the preprocessing pipeline can fill it.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# train_cat_df['Outlet_Size'] is a chained in-place update, which does not modify the
# parent frame when pandas Copy-on-Write is active.
train_cat_df = encoding(train_cat_df)
train_cat_df['Outlet_Size'] = train_cat_df['Outlet_Size'].replace(3, np.nan)

In [5]:
# Display the label-encoded categorical columns of the training split
train_cat_df

Unnamed: 0,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1,0,1.0,0,1
1,0,2,1.0,2,2
2,1,0,1.0,0,1
3,1,2,,2,0
4,2,1,0.0,2,1
...,...,...,...,...,...
8518,1,0,0.0,2,1
8519,1,2,,1,1
8520,2,1,2.0,1,1
8521,1,2,1.0,2,2


In [6]:
# Label Encoding on test 


# Work on explicit copies so the in-place encoding never touches (or warns about)
# a column selection of input_feature_test_df.
test_num_df = input_feature_test_df[numerical_features].copy()

# Use the frame returned by encoding() rather than relying on its in-place side effect.
test_cat_df = encoding(input_feature_test_df[categorical_features].copy())

# Placeholder code 3 (missing Outlet_Size) -> NaN. Assigned back to the column because
# a chained replace(..., inplace=True) does not update the parent frame under pandas
# Copy-on-Write.
test_cat_df['Outlet_Size'] = test_cat_df['Outlet_Size'].replace(3, np.nan)

In [7]:
# After Label Encoding 


# Re-assemble each split: numerical columns first, then the encoded categorical ones
train_encoded_df = pd.concat([train_num_df, train_cat_df], axis=1)
test_encoded_df = pd.concat([test_num_df, test_cat_df], axis=1)

# Fit the preprocessing on the training split only, then reuse it for the test split
train_array = preprocessing.fit_transform(train_encoded_df)
test_array = preprocessing.transform(test_encoded_df)

# Wrap the transformed arrays back into frames carrying the original column labels
input_feature_train_df = pd.DataFrame(train_array, columns=train_encoded_df.columns)
input_feature_test_df = pd.DataFrame(test_array, columns=test_encoded_df.columns)

input_feature_test_df


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1.798914,-1.135138,-0.532035,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
1,-1.047131,-0.536960,-0.861920,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658
2,0.393036,0.648183,1.618094,-0.020085,1.735696,0.119565,-1.756324,1.091569,-1.508289
3,-1.272300,-0.983503,0.225484,-1.095190,-0.179795,-0.997813,0.898016,-0.138882,-0.252658
4,0.186917,1.016910,1.497272,1.532846,-0.179795,1.236942,-0.429154,1.091569,2.258603
...,...,...,...,...,...,...,...,...,...
5676,-0.544215,-1.020172,0.005181,0.099372,-0.179795,1.236942,0.898016,-1.369334,-0.252658
5677,-1.207149,1.489663,0.452086,-1.334103,-0.179795,1.236942,-0.429154,1.091569,1.002972
5678,-0.658514,0.143358,-0.357287,-0.497909,1.735696,0.119565,0.898016,-0.138882,-0.252658
5679,0.553055,-1.281758,1.182389,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658
