### Importing some important Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
import os
import datetime as dt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
#from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

#### Import Data

In [None]:
# Read the BigMart training and test CSV files from ../input/bigmart-sales-data
# into the `train` and `test` DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/bigmart-sales-data/Train.csv',
        '../input/bigmart-sales-data/Test.csv',
    )
)

#### Data type and check for null values

In [None]:
# Column dtypes of the training frame.
train.dtypes

In [None]:
# Column dtypes of the test frame.
test.dtypes

In [None]:
# Summary of the training frame as printed by DataFrame.info().
train.info()

In [None]:
# Same summary for the test frame.
test.info()

In [None]:
# Number of missing values per training column.
train.isnull().sum()

In [None]:
# Number of missing values per test column.
test.isnull().sum()

#### Data Cleaning and Preprocessing

In [None]:
train['Item_Weight'].mean()

In [None]:
train['Item_Weight']=train['Item_Weight'].fillna(train['Item_Weight'].mean())

In [None]:
train.isnull().sum()

In [None]:
test['Item_Weight']=test['Item_Weight'].fillna(test['Item_Weight'].mean())

In [None]:
test.isnull().sum()

In [None]:
# Look at the raw Outlet_Size column before imputing it.
train['Outlet_Size']

In [None]:
train.head()

In [None]:
# Every row whose Outlet_Type is 'Grocery Store' gets Outlet_Size 'Small'
# (this overwrites existing values as well as missing ones).
train.loc[train['Outlet_Type']=='Grocery Store','Outlet_Size'] = 'Small'

In [None]:
train.isnull().sum()

In [None]:
# Most frequent Outlet_Size after the grocery-store rule; mode() can return
# several values, [0] takes the first one.
moda = train['Outlet_Size'].mode()[0]

In [None]:
# Fill the remaining missing sizes with that most frequent value.
train['Outlet_Size'] = train['Outlet_Size'].fillna(moda)

In [None]:
train.isnull().sum()

In [None]:
# Distribution of Outlet_Size after imputation.
train['Outlet_Size'].value_counts()

In [None]:
test.loc[test['Outlet_Type']=='Grocery Store','Outlet_Size'] = 'Small'

In [None]:
moda = test['Outlet_Size'].mode()[0]
test['Outlet_Size'] = test['Outlet_Size'].fillna(moda)

In [None]:
test['Outlet_Size'].isnull().sum()

In [None]:
test.isna().sum()

In [None]:
train.head()

In [None]:
for col in train.columns:
    print('Value Count is: ',train[col].value_counts())

In [None]:
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({'LOW FAT':'Low Fat','LF':'Low Fat','Regular':'Regular','reg':'Regular','low fat':'Low Fat'})

In [None]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace({'LOW FAT':'Low Fat','LF':'Low Fat','Regular':'Regular','reg':'Regular','low fat':'Low Fat'})

In [None]:
train.head()

In [None]:
train['Item_Identifier'] = train['Item_Identifier'].astype(str).str[:2]

In [None]:
train.head()

In [None]:
train.loc[train['Item_Identifier']=='NC','Item_Fat_Content'] = 'Non-Edible'

In [None]:
test['Item_Identifier'] = test['Item_Identifier'].astype(str).str[:2]

In [None]:
test.head()

In [None]:
print(test.shape)
train.shape

In [None]:
test.loc[test['Item_Identifier']=='NC','Item_Fat_Content'] = 'Non-Edible'

In [None]:
train['Item_Fat_Content'].value_counts()

In [None]:
test['Item_Fat_Content'].value_counts()

#### EDA

In [None]:
test.groupby(['Item_Type'])['Item_Fat_Content'].count().sort_values().plot.bar()

In [None]:
train.groupby(['Item_Type'])['Item_Fat_Content'].count().sort_values().plot.bar()

In [None]:
train.groupby(['Item_Type'])['Item_Visibility'].count().sort_values().plot.bar()

In [None]:
test.groupby(['Item_Type'])['Item_Visibility'].count().sort_values().plot.bar()

In [None]:
# Distribution of Item_MRP: 30-bin histogram on a density scale with a KDE
# overlay. `sns.distplot` is deprecated in seaborn; this `histplot` call is
# its documented replacement (`cut=3` keeps the KDE tails that distplot drew).
sns.histplot(train['Item_MRP'], bins=30, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
sns.catplot(x='Outlet_Size',hue='Item_Fat_Content',data=train,kind='count')

In [None]:
sns.catplot(x='Outlet_Establishment_Year',hue='Outlet_Size',data=train,kind='count')

#### Converting categorical values to numeric values

In [None]:
train['Outlet_Establishment_Year'].max()

In [None]:
test['Outlet_Establishment_Year'].max()

In [None]:
train['Outlet_Establishment_Year'] = 2010-train['Outlet_Establishment_Year']

In [None]:
test['Outlet_Establishment_Year'] = 2010-test['Outlet_Establishment_Year']

In [None]:
train.head()

In [None]:
train=train.drop(['Item_Identifier','Outlet_Identifier'],axis=1)

In [None]:
train.shape

In [None]:
test=test.drop(['Item_Identifier','Outlet_Identifier'],axis=1)

In [None]:
test.shape

In [None]:
# One-hot encode the categorical columns of the test frame.
# NOTE(review): train and test are encoded independently, so the resulting
# dummy columns only line up if both frames contain the same category values
# — confirm by comparing dtest.columns with dtrain.columns.
dtest = pd.get_dummies(test)

In [None]:
dtest.head()

In [None]:
# One-hot encode the training frame (the target column Item_Outlet_Sales is
# still part of it at this point).
dtrain = pd.get_dummies(train)

In [None]:
dtrain.head()

In [None]:
y=dtrain['Item_Outlet_Sales']

In [None]:
x=dtrain.drop(['Item_Outlet_Sales'],axis=1)
xx=x.copy()

In [None]:
from sklearn.preprocessing import StandardScaler
stds=StandardScaler()
x = stds.fit_transform(x)
#x['Item_Weight']=stds.fit_transform(np.array(x['Item_Weight']).reshape(-1,1))

In [None]:
x=pd.DataFrame(x,columns=xx.columns)

In [None]:
dtestt=dtest.copy()
dtest = stds.fit_transform(dtest)
dtest = pd.DataFrame(dtest,columns=dtestt.columns)

In [None]:
dtest.head()

### Preparing Data for Machine Learning Model (Train, Test, Split)

In [None]:
# Two successive 80/20 splits with the same seed: first hold out 20% of all
# rows as the test split, then hold out 20% of the remainder as the cv split.
split_options = {'test_size': 0.2, 'random_state': 42}
XX_train, X_test, yy_train, y_test = train_test_split(x, y, **split_options)
X_train, X_cv, y_train, y_cv = train_test_split(XX_train, yy_train, **split_options)

### Random-Forest Regressor

In [None]:
RF = RandomForestRegressor()
RF_est = {'n_estimators':range(10,1500,100),'max_depth':range(1,50)}
RF_model = RandomizedSearchCV(RF,RF_est,scoring='neg_mean_squared_error',cv=5,n_jobs=-1)
RF_model.fit(X_cv,y_cv)

In [None]:
print(RF_model.best_score_)
print(RF_model.best_estimator_)
print(RF_model.best_params_)


In [None]:
# Train the final random forest with the hyper-parameters chosen by the
# randomised search (`RF_model`). The search is not seeded, so hard-coded
# values copied from an earlier run would not match what the search selects
# when the notebook is executed again.
RF_Result = RandomForestRegressor(**RF_model.best_params_)
RF_Result.fit(X_train, y_train)
pred1 = RF_Result.predict(X_test)
# RMSE on the held-out test split.
np.sqrt(mean_squared_error(y_test, pred1))

### Decision-Tree Regressor

In [None]:
DTR = DecisionTreeRegressor(random_state=42)
DTR_cv = GridSearchCV(DTR,param_grid={'min_samples_split':range(2,20)},scoring='neg_mean_squared_error',cv=5)
DTR_cv.fit(X_cv,y_cv)

In [None]:
print(DTR_cv.best_score_)
print(DTR_cv.best_estimator_)
print(DTR_cv.best_params_)

In [None]:
# Train the final decision tree with the min_samples_split selected by the
# grid search (`DTR_cv`) instead of a value copied by hand from its output.
DTR_result = DecisionTreeRegressor(random_state=42, **DTR_cv.best_params_)
DTR_result.fit(X_train, y_train)
pred2 = DTR_result.predict(X_test)
# RMSE on the held-out test split; arguments are (y_true, y_pred), the same
# order the other model cells use.
np.sqrt(mean_squared_error(y_test, pred2))

### XG-Boost Regressor

In [None]:
XG = XGBRegressor()
XG.fit(X_train,y_train)
pred3 = XG.predict(X_test)
np.sqrt(mean_squared_error(y_test,pred3))

In [None]:
X_train.shape

### Stacking 

In [None]:
from mlxtend.regressor import StackingCVRegressor

In [None]:
# Stack the XGBoost, random-forest and decision-tree regressors, with a
# random forest as the meta-regressor and 5-fold out-of-fold predictions.
comb = StackingCVRegressor(regressors=(XG, RF, DTR), cv=5, meta_regressor=RF, random_state=42)
# Fit and predict on plain NumPy arrays so that training and prediction see
# the same unlabeled input. This replaces relabelling X_test's columns in
# place with 33 hard-coded names ('f0'..'f32'), which changed X_test for every
# later cell and fails for any other number of features.
# NOTE(review): assumes the stacked regressors are fitted on unlabeled arrays
# (which the 'f0'..'f32' relabelling suggests) — confirm with the installed
# mlxtend / xgboost versions.
comb.fit(X_train.to_numpy(), y_train.to_numpy())
comb_pred = comb.predict(X_test.to_numpy())
# RMSE on the held-out test split; arguments are (y_true, y_pred).
np.sqrt(mean_squared_error(y_test, comb_pred))

In [None]:
# Shape of the held-out test split.
X_test.shape

In [None]:
# Shape of the encoded competition test frame; its column count has to match
# X_test's for the fitted models to accept it.
dtest.shape

### Predicting the dtest values

In [None]:
dtest.columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32']

final_pred = comb.predict(dtest)

In [None]:
final_pred