# Preprocessing

Imports

In [15]:
import pandas as pd
from math import floor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as mse

File import

In [16]:
filename = 'sales_data.csv'
df_original = pd.read_csv(filename, index_col=0)
df_original.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [17]:
df = df_original.copy()

### Scaling Continuous Variables
Scaling should be done on non-categorical features. Non-categorical features should be isolated first, scaled, then merged back into the dataframe.

Including non-categorical features in the scaling process harms model performance

Isolating and scaling non-categorical featres.

In [18]:
categorical_columns = ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 
                       'Outlet_Type', 'Item_Type', 'Outlet_Identifier', 'Item_Identifier']
df_categorical = df[categorical_columns]

# Separating the target vector.
target_vector = df['Item_Outlet_Sales'].values
drop_columns = ['Item_Outlet_Sales']

# Dropping categorical features and the target vector.
df.drop(columns=drop_columns, inplace=True)
df.drop(columns=categorical_columns, inplace=True)

# Scaling
scaler = StandardScaler()
scaler.fit(df)
features = scaler.transform(df)

# Merging scaled features and categorical features.
df = pd.DataFrame(features)
df = pd.concat([df, df_categorical], axis=1)

In [19]:
df.head()

Unnamed: 0,0,1,2,3,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type,Outlet_Identifier,Item_Identifier
0,-0.841872,-0.970732,1.747454,0.139541,Low Fat,Medium,Tier 1,Supermarket Type1,Dairy,OUT049,FDA15
1,-1.641706,-0.908111,-1.489023,1.334103,Regular,Medium,Tier 3,Supermarket Type2,Soft Drinks,OUT018,DRC01
2,1.098554,-0.956917,0.01004,0.139541,Low Fat,Medium,Tier 1,Supermarket Type1,Meat,OUT049,FDN15
3,1.500838,-1.281758,0.66005,0.020085,Regular,Medium,Tier 3,Grocery Store,Fruits and Vegetables,OUT010,FDX07
4,-0.929428,-1.281758,-1.39922,-1.293934,Low Fat,High,Tier 3,Supermarket Type1,Household,OUT013,NCD19


### Nominal and Ordinal Variables

Ordinal Categorical Variables

In [20]:
ordinal_dicts = [
    {'Low Fat': 0, 'Regular': 1},
    {'Small': 0, 'Medium': 1, 'High': 2},
    {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2},
    {'Grocery Store': 0, 'Supermarket Type1': 1, 'Supermarket Type2': 2, 'Supermarket Type3': 3}
]
ordinal_dict_labels = ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for index, ordinal_dict in enumerate(ordinal_dicts):
    df[ordinal_dict_labels[index]].replace(to_replace=ordinal_dict, inplace=True)

In [21]:
df[ordinal_dict_labels].head()

Unnamed: 0,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,0,1,0,1
1,1,1,2,2
2,0,1,0,1
3,1,1,2,0
4,0,2,2,1


Nominal Categorical Variables

In [22]:
nominal_columns = ['Item_Type', 'Outlet_Identifier', 'Item_Identifier']
df = pd.get_dummies(data=df, columns=nominal_columns, drop_first=True)

In [23]:
df.head()

Unnamed: 0,0,1,2,3,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Breads,Item_Type_Breakfast,...,Item_Identifier_NCZ05,Item_Identifier_NCZ06,Item_Identifier_NCZ17,Item_Identifier_NCZ18,Item_Identifier_NCZ29,Item_Identifier_NCZ30,Item_Identifier_NCZ41,Item_Identifier_NCZ42,Item_Identifier_NCZ53,Item_Identifier_NCZ54
0,-0.841872,-0.970732,1.747454,0.139541,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.641706,-0.908111,-1.489023,1.334103,1,1,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.098554,-0.956917,0.01004,0.139541,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.500838,-1.281758,0.66005,0.020085,1,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.929428,-1.281758,-1.39922,-1.293934,0,2,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Models

In [24]:
X = df.to_numpy()
y = target_vector

### KNN Regressor Model

In [25]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X, y)
knn_score = knn.score(X, y)
knn_predictions = knn.predict(X)
print(knn_score)

0.6689971478695613


### Linear Regression Model

In [26]:
reg = LinearRegression()
reg.fit(X, y)
reg_score = reg.score(X, y)
reg_predictions = reg.predict(X)
print(reg_score)

0.6370297306968447


In [27]:
print('True values: ', y)
print('Linear Regression: ', reg_predictions)
print('KNN: ', knn_predictions)
print(f'Linear Regression RMSE: {mse(y_true=y, y_pred=reg_predictions, squared=False)}')
print(f'KNN RMSE: {mse(y_true=y, y_pred=knn_predictions, squared=False)}')

True values:  [3735.138   443.4228 2097.27   ... 1193.1136 1845.5976  765.67  ]
Linear Regression:  [5600.  246. 1812. ... 1102. 1732. 1106.]
KNN:  [3954.71884  819.99928 1946.93236 ... 1632.00896 1492.59044  975.92964]
Linear Regression RMSE: 1028.0547460697242
KNN RMSE: 981.7402198674749


KNN Regression performs better than linear regression. Additionally, linear regression can sometimes output negative values.

In [28]:
print(f'Number of negative values in predictions: {len(reg_predictions[reg_predictions < 0])}')

Number of negative values in predictions: 397
