In [12]:
import numpy as np
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,PowerTransformer,LabelEncoder
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from scipy.stats import mode
from sklearn.model_selection import GridSearchCV,cross_val_score
import tensorflow as tf
from category_encoders import TargetEncoder

In [31]:
train=pd.read_csv("../input/bigmart-sales-data/Train.csv")
test=pd.read_csv("../input/bigmart-sales-data/Test.csv")
train['file']='train'
test['file']='test'
all_data=pd.concat([train,test],ignore_index=True)

# Missing Value Imputation

In [32]:
item_avg_weight=all_data.pivot_table(values="Item_Weight",index="Item_Identifier")
all_data.loc[all_data['Item_Weight'].isnull(),"Item_Weight"]=all_data.loc[all_data['Item_Weight'].isnull(),"Item_Identifier"].apply(lambda x: item_avg_weight.loc[item_avg_weight.index==x,"Item_Weight"][0])

In [33]:
a=all_data.pivot_table(values="Outlet_Size",index= 'Outlet_Type',aggfunc=lambda x: mode(x).mode[0] )
all_data.loc[all_data['Outlet_Size'].isnull(),"Outlet_Size"]=all_data.loc[all_data['Outlet_Size'].isnull(),"Outlet_Type"].apply(lambda x: a.loc[a.index==x,"Outlet_Size"][0])

In [34]:
item_avg_visibility=all_data.pivot_table(values="Item_Visibility",index="Item_Identifier")
all_data.loc[all_data['Item_Visibility']==0,"Item_Visibility"]=all_data.loc[all_data['Item_Visibility']==0,"Item_Identifier"].apply(lambda x: item_avg_visibility.loc[item_avg_visibility.index==x,"Item_Visibility"][0])

In [35]:
all_data.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
file                            0
dtype: int64

In [36]:
def Feature_Engineering(data):
    dataframe=data.copy()
    dataframe['New_Item_Type']=dataframe['Item_Identifier'].apply(lambda x: x[:2])
    dataframe['New_Item_Type']=dataframe['New_Item_Type'].map({"FD":"Food","NC":"Non Consumable","DR":"Drinks"})
    dataframe['Years_Established']=2021-dataframe['Outlet_Establishment_Year']
    dataframe.loc[dataframe['New_Item_Type']=="Non Consumable","Item_Fat_Content"]="Non-Edible"
    return dataframe
def make_submission(predictions):
    df=pd.read_csv("../input/bigmart-sales-data/Test.csv")
    df['Item_Outlet_Sales']=predictions
    df[['Item_Identifier',"Outlet_Identifier","Item_Outlet_Sales"]].to_csv("submission.csv",index=False)
def cross_validate(model,X,y):
    scores=cross_val_score(model,X,y,scoring='neg_mean_squared_error')
    print(np.sqrt(np.mean(scores)))
def gridsearch(model,param_grid,X,y):
    grid=GridSearchCV(model,param_grid,scoring='neg_mean_squared_error')
    grid.fit(X,y)
    print(np.sqrt(-grid.best_score_))
    print(grid.best_params_)
    

In [37]:
train=all_data[all_data['file']=='train']
test=all_data[all_data['file']=='test']
train=Feature_Engineering(train)
test=Feature_Engineering(test)

In [38]:
y=train['Item_Outlet_Sales'].copy()
train.drop(['Item_Outlet_Sales','file'],axis=1,inplace=True)
test.drop(['Item_Outlet_Sales','file'],axis=1,inplace=True)

In [39]:
num_cols=[col for col in train.columns if train[col].dtype!=object]
cat_cols=[col for col in train.columns if train[col].dtype==object]
preprocessing_pipeline=ColumnTransformer([("Numerical",RobustScaler(),num_cols),("Categorical",OneHotEncoder(handle_unknown="ignore"),cat_cols)])

In [40]:
X=preprocessing_pipeline.fit_transform(train)

In [41]:
gridsearch(LGBMRegressor(max_depth=3,n_estimators=50,learning_rate=0.1,subsample=0.8,min_child_samples=60),{"num_leaves":[31,10,50]},X,y)

1078.8290980592599
{'num_leaves': 31}


In [42]:
gridsearch(RandomForestRegressor(max_depth=5,n_estimators=80),{"max_features":["auto", "sqrt", "log2"]},X,y)

1084.5919929741622
{'max_features': 'auto'}


In [44]:
lgm_predictions=np.zeros((len(test),1))
lgm_val_predictions=np.zeros((len(train),1))
kfold=KFold(n_splits=5,shuffle=True,random_state=42)
for train_index,val_index in kfold.split(train,y):
    train_data=preprocessing_pipeline.fit_transform(train.loc[train_index])
    val_data=preprocessing_pipeline.transform(train.loc[val_index])
    test_data=preprocessing_pipeline.transform(test)
    y_train=y[train_index]
    lgm=LGBMRegressor(max_depth=3,n_estimators=50,learning_rate=0.1,subsample=0.8
                          )
    lgm.fit(train_data,y_train)
    p=lgm.predict(val_data)
    lgm_val_predictions[val_index]+=np.reshape(p,(len(p),1))
    p=lgm.predict(test_data)
    lgm_predictions+=np.reshape(p,(len(p),1))/5
print(np.sqrt(mean_squared_error(y,lgm_val_predictions)))
print(r2_score(y,lgm_val_predictions))

1079.0815840228224
0.6001039351576026


In [45]:
rf_predictions=np.zeros((len(test),1))
rf_val_predictions=np.zeros((len(train),1))
kfold=KFold(n_splits=5,shuffle=True,random_state=42)
for train_index,val_index in kfold.split(train,y):
    train_data=preprocessing_pipeline.fit_transform(train.loc[train_index])
    val_data=preprocessing_pipeline.transform(train.loc[val_index])
    test_data=preprocessing_pipeline.transform(test)
    y_train=y[train_index]
    rf=RandomForestRegressor(max_depth=5,n_estimators=80
                          )
    rf.fit(train_data,y_train)
    p=rf.predict(val_data)
    rf_val_predictions[val_index]+=np.reshape(p,(len(p),1))
    p=rf.predict(test_data)
    rf_predictions+=np.reshape(p,(len(p),1))/5
print(np.sqrt(mean_squared_error(y,rf_val_predictions)))

1082.3716118999216


In [46]:
weights=np.arange(0.01,1,0.01)
all=[]
min_error=10000
for w in weights:
    p=w*rf_val_predictions+(1-w)*lgm_val_predictions
    e=np.sqrt(mean_squared_error(y,p))
    if e<min_error:
        min_error=e
        best_weight=w
print("Best Weight =",best_weight)
print(min_error)
    

Best Weight = 0.29000000000000004
1078.4008095223408


In [47]:
make_submission(0.29*rf_predictions+0.71*lgm_predictions)