In [129]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,classification_report,confusion_matrix
import warnings
from joblib import dump,load

# Silence every warning raised for the rest of the notebook.
# NOTE(review): this also hides pandas FutureWarning / SettingWithCopyWarning
# messages; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [130]:
# Location of the raw training data, relative to this notebook.
TRAIN_CSV_PATH = '../../data/raw/Train.csv'

df = pd.read_csv(TRAIN_CSV_PATH)
# Uncomment to inspect the per-column count of missing values:
# df.isna().sum()

In [131]:
# Collapse inconsistent spellings of the fat-content labels and merge some
# item types into broader groups.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form is a chained in-place update
# on a temporary Series, which does not modify `df` once pandas
# Copy-on-Write is active.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF':'Low Fat','low fat':'Low Fat','reg':'Regular'})
df['Item_Type'] = df['Item_Type'].replace({'Soft Drinks':'Drinks','Hard Drinks':'Drinks','Breads':'Baking Goods','Seafood':'Meat'})

In [132]:
#Handling Missing Values
# Both columns are reassigned rather than updated via `inplace=True` on
# `df[col]`; the chained in-place form does not write through to `df` under
# pandas Copy-on-Write.

# Missing item weights are filled with the column mean.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

# A visibility of exactly 0 is replaced by the column median. The median is
# computed on the column as it stands, i.e. with the zeros still included.
df['Item_Visibility'] = df['Item_Visibility'].replace({0:df['Item_Visibility'].median()})

In [133]:
#Feature Encoding and generation

#Binary Encoding
# 'Low Fat' -> 0, 'Regular' -> 1. Any other value is left unchanged by replace().
# Assigned back to the column (a chained `inplace=True` call on `df[col]` does
# not update `df` under pandas Copy-on-Write). infer_objects() converts the
# resulting column to a numeric dtype explicitly instead of relying on the
# implicit downcasting that replace() used to perform.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'Low Fat':0,'Regular':1}).infer_objects()

In [134]:
#One_hot encoding nominal variables

def one_hot(df,columns,prefixes):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each entry of ``columns`` is paired positionally with an entry of
    ``prefixes``. For every pair, indicator columns are built with
    ``pd.get_dummies(..., drop_first=True)`` (the first category level is
    omitted), appended to the right of the frame, and the source column is
    removed. The input frame is not modified.
    """
    encoded = df.copy()
    for source_col, dummy_prefix in zip(columns, prefixes):
        indicators = pd.get_dummies(
            encoded[source_col],
            prefix=dummy_prefix,
            drop_first=True,
        )
        # Append the indicators first, then drop the source column.
        encoded = pd.concat([encoded, indicators], axis=1).drop(source_col, axis=1)
    return encoded

In [135]:
# Nominal columns to one-hot encode; prefixes[i] names the indicator
# columns generated from nominal_features[i].
nominal_features = [
    'Outlet_Type',
    'Item_Type',
    'Outlet_Identifier',
]
prefixes = [
    'out_type',
    'item_type',
    'out_id',
]

df = one_hot(df, columns=nominal_features, prefixes=prefixes)

In [136]:
#Ordinal Encoding

def ord_enc(df,col,ord_var):
    """Return a copy of ``df`` with column ``col`` ordinal-encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    col : str
        Name of the column to encode.
    ord_var : dict
        Mapping from category label to its ordinal code. Values that are not
        keys of the mapping (including missing values) are left unchanged.

    Returns
    -------
    pandas.DataFrame
        The copy with ``col`` replaced by its encoded values.
    """
    df = df.copy()
    # Assign the result back instead of `df[col].replace(..., inplace=True)`:
    # the chained in-place call acts on a temporary Series and does not
    # update the frame under pandas Copy-on-Write.
    # infer_objects() makes the conversion to a numeric dtype explicit rather
    # than depending on replace()'s deprecated implicit downcasting.
    df[col] = df[col].replace(ord_var).infer_objects()
    return df

In [137]:
#Encoding Ordinal columns

# Label -> ordinal code for each ordinal column.
outlet_size_ord = {
    'Small':0,
    'Medium':1,
    'High':2,
}
out_loc_ord = {
    'Tier 1':2,
    'Tier 2':1,
    'Tier 3':0,
}

# Encode Outlet_Size first, then Outlet_Location_Type.
for ordinal_col, ordinal_map in (
    ('Outlet_Size', outlet_size_ord),
    ('Outlet_Location_Type', out_loc_ord),
):
    df = ord_enc(df, ordinal_col, ordinal_map)

In [138]:
#Dropping unwanted/irrelevant column
# Item_Identifier is removed so it is not passed to the models as a feature.
df = df.drop(columns=['Item_Identifier'])

In [139]:
# Deriving new column called Years_Since_Established from Establishment Year

# Hard-coded reference year against which the outlet age is measured.
REFERENCE_YEAR = 2021

df['Years_Since_Established'] = REFERENCE_YEAR - df['Outlet_Establishment_Year']

In [140]:
# Missing value for OutletSize
#
# Rows with a known Outlet_Size are used to fit a classifier, whose
# predictions then fill Outlet_Size on the rows where it is missing.
# (An earlier draft of this cell also carried commented-out code applying the
# same model to a separate `test_df`; that dead code has been removed.)

size_is_missing = df['Outlet_Size'].isna()

# .copy() gives an independent frame, so writing the predictions into it below
# is a plain column assignment and not a write to a slice of `df`.
out_train_pred_df = df[size_is_missing].copy()
out_train_df = df[~size_is_missing] #for training

# Features exclude the column being imputed and the sales target.
X = out_train_df.drop(columns=['Outlet_Size','Item_Outlet_Sales'])
y = out_train_df['Outlet_Size']
trainX,testX,trainY,testY = train_test_split(X,y,random_state=22,test_size=0.2)
cat_model = RandomForestClassifier(random_state=2)
cat_model.fit(trainX,trainY)

# Predictions on the held-out split; nothing in this cell consumes them, they
# are kept so the imputer can be inspected (e.g. compared against testY).
pred = cat_model.predict(testX)

out_train_pred = cat_model.predict(out_train_pred_df.drop(columns=['Outlet_Size','Item_Outlet_Sales']))
out_train_pred_df['Outlet_Size'] = out_train_pred

# Recombine: rows with an observed Outlet_Size followed by the imputed rows.
# The two partitions are concatenated directly rather than calling
# df.dropna(), which would drop rows for a missing value in *any* column.
df = pd.concat([out_train_df,out_train_pred_df])

In [141]:
#Model Training

# Target is the sales column; features are every remaining column.
y = df['Item_Outlet_Sales']
X = df.drop(columns='Item_Outlet_Sales')

#Split Dataset
# 80/20 split with a fixed seed.
trainX,testX,trainY,testY = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)



In [142]:
# CatBoost regressor wrapped in a single-step pipeline named 'cat_reg'.
cat_reg = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.002,
    depth=6,
    random_state=2,
    silent=True,
)
clf = Pipeline(steps=[('cat_reg', cat_reg)])

In [143]:
# Fit the pipeline on the training split produced by train_test_split above.
clf.fit(trainX,trainY)

Pipeline(steps=[('cat_reg',
                 <catboost.core.CatBoostRegressor object at 0x00000260269C0F70>)])

In [144]:
# Score the fitted pipeline on the data it was trained on.
# NOTE(review): Pipeline.score delegates to the final estimator; for a
# regressor this is presumably R^2 -- confirm against the CatBoost docs.
clf.score(trainX,trainY)

0.6185545982080469

In [145]:
# Persist the fitted pipeline to disk with joblib.
dump(clf,'../../models/model.pkl')

['../../models/model.pkl']

In [146]:
# Reload the pipeline from the file written above, rebinding `clf` to the
# deserialized object. joblib.load unpickles its input, so only load files
# from a trusted source.
clf = load('../../models/model.pkl')

In [147]:
# Score the reloaded pipeline on the held-out test split.
# NOTE(review): same metric as the training score above (presumably R^2).
clf.score(testX,testY)

0.6217498826705037