## 1. Importing packages

In [None]:
# Importing the libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

## 2. Reading data

In [None]:
# Load the calendar, unit-sales and weekly-price tables.
# The working directory is switched first, so the bare file names below resolve inside it.
os.chdir('/kaggle/input/wallmart/')
cal, sales, sell_prices = (
    pd.read_csv(csv_name)
    for csv_name in ('calendar.csv', 'sales_train_validation.csv', 'sell_prices.csv')
)

## 3. Preprocessing data

### 3.1 Calculating quantities

In [None]:
# Parse the calendar dates, rename the day key `d` to `id`, and keep the first 1913 rows.
cal = cal.assign(date=pd.to_datetime(cal['date'])).rename(columns={'d': 'id'})
cal_1913 = cal.iloc[:1913]
cal_1913.head()

In [None]:
# Reshape the sales table to one row per day and one column per item id (columns sorted by id).
sales_new = (
    sales
    .drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])
    .sort_values(['id'])
    .set_index(['id'])
    .transpose()
    .reset_index()
)
sales_new.head()

In [None]:
cal_sales = pd.concat([cal_1913,sales_new],axis=1)
cal_sales.head()

In [None]:
cal_sales.tail()

### 3.2 Calculating prices

In [None]:
sell_prices['state_id'] = sell_prices.item_id.map(str) \
                          + '_'  + sell_prices.store_id.map(str) + '_validation' \

sell_prices.head()            

In [None]:
sell_prices[(sell_prices['state_id'] == 'HOBBIES_1_001_CA_1_validation') & (sell_prices['wm_yr_wk']>11613)]

In [None]:
# Build a (days x items) price matrix whose columns line up with the item columns of cal_sales.
item_cols = cal_sales.columns[15:]             # item ids, in the same order as the quantity columns
main_set = set(item_cols)                      # Set of all items

# One row per week, one column per item; an item with no price tag in a week comes out as NaN.
weekly_prices = (
    sell_prices
    .pivot(index='wm_yr_wk', columns='state_id', values='sell_price')
    .reindex(columns=item_cols)
)

# Look up each day's week directly. This replaces stacking every week 7 times inside a loop
# (quadratic copying) and then trimming the first row and the last 5 rows by hand.
prices_arr = weekly_prices.reindex(index=cal_sales['wm_yr_wk']).values
prices_arr

### 3.3 Multiplying quantities and prices

In [None]:
# Daily sales value per item: units sold times the price in force that day (element-wise).
total_value = cal_sales.iloc[:, 15:].mul(prices_arr)

## 4. EDA

In [None]:
df2 = total_value
df2['date'] = cal['date'].iloc[:1913]
df2 = df2.fillna(0)
df2.head()

In [None]:
df2.shape

### 4.1 Adding new features

#### a. Total sales of 10 stores per day

In [None]:
# Total daily sales over all items: one vectorised row sum over the first 30490 columns
# (the per-item series) instead of 30490 separate column additions.
df2['Total'] = df2.iloc[:, :30490].sum(axis=1)
df2['Total'].head()

In [None]:
# Line chart of total daily sales with a range slider under the x axis.
fig = px.line(
    df2,
    x='date',
    y='Total',
    title='Wallmart Sales 2011-2016/10 stores',
    width=1200,
).update_xaxes(rangeslider_visible=True)
fig.show()

The downward spikes in the chart above correspond to days with no sales on the eve of the Christmas holiday.

#### b. Total sales per day per state

In [None]:
# Per-state daily totals. Field 3 of an item column name (e.g. HOBBIES_1_001_CA_1_validation)
# is the state code.
item_cols = df2.columns[:30490]          # the first 30490 columns are the per-item series
cols_by_state = {}
for col in item_cols:
    cols_by_state.setdefault(col.split('_')[3], []).append(col)

# One vectorised row sum per state, added in order of first appearance. Plain assignment
# (rather than `+=`) keeps the cell safe to re-run.
for state, cols in cols_by_state.items():
    df2[state] = df2[cols].sum(axis=1)
df2.head()

In [None]:
# One line per state on a shared figure.
fig = go.Figure()
for state_code in ('CA', 'TX', 'WI'):
    fig.add_trace(
        go.Scatter(x=df2['date'], y=df2[state_code].values, mode='lines', name=state_code)
    )

layout_margin = dict(l=50, r=50, b=100, t=100, pad=4)
layout_font = dict(family="Courier New, monospace", size=18, color="#042a30")
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    margin=layout_margin,
    paper_bgcolor="LightSteelBlue",
    title="Walmart statewise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font=layout_font,
)

fig.update_xaxes(rangeslider_visible=True)
fig.show()

As there are 4 stores in California compared to 3 stores each in Texas and Wisconsin, California is at the top of the chart.

#### c. Total sales per day per category

In [None]:
# Per-category daily totals. Field 0 of an item column name (e.g. HOBBIES_1_001_CA_1_validation)
# is the category.
item_cols = df2.columns[:30490]          # the first 30490 columns are the per-item series
cols_by_category = {}
for col in item_cols:
    cols_by_category.setdefault(col.split('_')[0], []).append(col)

# One vectorised row sum per category, added in order of first appearance. Plain assignment
# (rather than `+=`) keeps the cell safe to re-run.
for category, cols in cols_by_category.items():
    df2[category] = df2[cols].sum(axis=1)
df2.head()

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(x=df2['date'], y=df2['FOODS'].values,
                    mode='lines',
                    name='FOODS'))
fig.add_trace(go.Scatter(x=df2['date'], y=df2['HOBBIES'].values,
                    mode='lines',
                    name='HOBBIES'))
fig.add_trace(go.Scatter(x=df2['date'], y=df2['HOUSEHOLD'].values,
                    mode='lines',
                    name='HOUSEHOLD'))
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
    title="Walmart category wise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#042a30"
    )
)


fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
df2.columns

#### d. Total sales per day per state per store

In [None]:
# Per-store daily totals. Fields 3 and 4 of an item column name give the store key, e.g. 'CA_1'.
item_cols = df2.columns[:30490]          # the first 30490 columns are the per-item series
cols_by_store = {}
for col in item_cols:
    parts = col.split('_')
    cols_by_store.setdefault(parts[3] + '_' + parts[4], []).append(col)

# One vectorised row sum per store, added in order of first appearance. Plain assignment
# (rather than `+=`) keeps the cell safe to re-run.
for store, cols in cols_by_store.items():
    df2[store] = df2[cols].sum(axis=1)
df2.head()

In [None]:
# Per-store, per-category daily totals, keyed '<state>_<store>_<category>' (e.g. 'CA_3_FOODS').
item_cols = df2.columns[:30490]          # the first 30490 columns are the per-item series
cols_by_store_category = {}
for col in item_cols:
    parts = col.split('_')
    key = parts[3] + '_' + parts[4] + '_' + parts[0]
    cols_by_store_category.setdefault(key, []).append(col)

# One vectorised row sum per group, added in order of first appearance. Plain assignment
# (rather than `+=`) keeps the cell safe to re-run.
for item, cols in cols_by_store_category.items():
    df2[item] = df2[cols].sum(axis=1)
df2.head()

### 4.2 Analysis of California stores

In [None]:
# One line per Californian store. Columns are selected by name rather than by position,
# so the plot does not depend on how many columns were added to df2 beforehand.
fig = go.Figure()

for store in ('CA_1', 'CA_2', 'CA_3', 'CA_4'):
    state_code, store_no = store.split('_')
    fig.add_trace(go.Scatter(x=df2['date'], y=df2[store].values,
                        mode='lines',
                        name=state_code + ' store ' + store_no))

fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    paper_bgcolor="LightSteelBlue",
    title="Walmart California store wise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font=dict(family="Courier New, monospace", size=18, color="#042a30"),
)

fig.update_xaxes(rangeslider_visible=True)
fig.show()

#### 4.2.1 Analysis of California Store 3

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

for i in range(30510,30531,10): 
    i = df2.columns[i]
    fig.add_trace(go.Scatter(x=df2['date'], y=df2[i].values,
                        mode='lines',
                        name=i.split('_')[2]))
    
fig.update_layout(
    autosize=False,
    width=1000,
    height=700,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
    title="Walmart California store 3 category wise sales",
    xaxis_title="Date",
    yaxis_title="Sales",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#042a30"
    )
)


fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
df2.columns

### 4.3 Events analysis

In [None]:
# Re-read the raw calendar (dates stay as strings here) and keep only days that carry
# at least one named event.
cal = pd.read_csv('calendar.csv')
event_cols = ['date', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
events = cal[event_cols].fillna(0)
has_event = events['event_name_1'].ne(0) | events['event_name_2'].ne(0)
events = events[has_event]
events.shape

In [None]:
# One dash-dotted red vertical line per event date, overlaid on the CA_3 sales curve.
l = [
    dict(
        type="line",
        yref='paper', y0=0, y1=1,
        xref='x1', x0=x, x1=x,
        line=dict(color="Red", width=2, dash="dashdot"),
    )
    for x in events['date'].values
]
c = len(l)
print(c)
fig = px.line(df2, x='date', y='CA_3').update_layout(shapes=l)
fig.show()

In [None]:

# `events['date']` comes from the re-read calendar.csv and holds date strings, so comparing it
# with a datetime object never matches. Parse once and filter on the parsed values.
event_dates = pd.to_datetime(events['date'])

# Events on 2015-04-12 (shown explicitly; only a cell's last expression is displayed otherwise).
display(events[event_dates == pd.Timestamp(2015, 4, 12)])

# Events falling in May or June of any year.
events[event_dates.dt.month.isin([5, 6])]

In [None]:
# Copy the event and SNAP indicator columns from the calendar onto df2 (aligned on the row index).
calendar_flags = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
                  'snap_CA', 'snap_TX', 'snap_WI']
df2[calendar_flags] = cal[calendar_flags]
df2[['date'] + calendar_flags]


### 4.4 Disaster analysis

In [None]:
dis = pd.read_csv('us_disasters_m5.csv')
dis.head()

In [None]:
dis_ca = dis[dis['state'] == 'CA']
print(dis_ca.shape)
dis_ca.head()

In [None]:
dis_ca['declaration_date'] = pd.to_datetime(dis_ca['declaration_date'].apply(lambda x : x[:10]))
dis_ca['declaration_date'].head()

In [None]:
dis_ca_timeline = dis_ca[['incident_type','declaration_date']]
dis_ca_timeline['declaration_date'] = pd.to_datetime(dis_ca_timeline['declaration_date'])
dis_ca_timeline['declaration_date'] = dis_ca_timeline['declaration_date'].apply(lambda x : x.strftime("%Y-%m-%d"))

In [None]:

dis_ca_timeline = dis_ca_timeline.reset_index()
dis_ca_timeline = dis_ca_timeline.drop(columns='index') 
dis_ca_timeline.head()

In [None]:
dis_ca_timeline.head()

In [None]:
# Preview of total sales indexed by date.
df2[['date', 'Total']].head().set_index('date')

In [None]:
# One dash-dotted red vertical line per Californian disaster declaration, over total daily sales.
# Iterates over every row of the timeline instead of a hard-coded count of 54.
l = [
    dict(
        type="line",
        yref='paper', y0=0, y1=1,
        xref='x1', x0=x, x1=x,
        line=dict(color="Red", width=2, dash="dashdot"),
    )
    for x in dis_ca_timeline['declaration_date']
]
fig = px.line(df2, x='date', y='Total')
fig.update_layout(shapes=l)

In [None]:
dis_ca_timeline['declaration_date'].unique()

In [None]:
dis_ca_timeline['Threat level'] = 'Minor'

d = dis_ca_timeline['incident_type'] == 'Tsunami'

dis_ca_timeline.loc[d.values,'Threat level'] = 'Major'

In [None]:
dis_ca_timeline[dis_ca_timeline['incident_type'] != 'Fire']

In [None]:
dis[dis['state'] == 'CA']
dis.iloc[327:329,:]

In [None]:
dis_ca_timeline1 = dis_ca[['incident_type','incident_begin_date']]
dis_ca_timeline1['incident_begin_date'] = pd.to_datetime(dis_ca_timeline1['incident_begin_date'])
dis_ca_timeline1['incident_begin_date'] = dis_ca_timeline1['incident_begin_date'].apply(lambda x : x.strftime("%Y-%m-%d"))
dis_ca_timeline1 = dis_ca_timeline1.reset_index()
dis_ca_timeline1 = dis_ca_timeline1.drop(columns='index') 
dis_ca_timeline1.set_index('incident_begin_date')
#dis_ca_timeline1 = dis_ca_timeline1[dis_ca_timeline1['incident_type'] != 'Fire']
dis_ca_timeline1.head()

In [None]:
dis_ca_timeline2 = pd.read_excel('dis_ca_timeline1_modified.xlsx')
dis_ca_timeline2[dis_ca_timeline2['Threat level'] == 'Medium']['incident_begin_date'].values
dis_ca_timeline2.head()

#### 4.4.1 Disaster is Medium

In [None]:
# Vertical markers at the start dates of 'Medium' threat disasters, over the CA_3 sales curve.
medium_dates = dis_ca_timeline2.loc[
    dis_ca_timeline2['Threat level'] == 'Medium', 'incident_begin_date'
].values
l = [
    dict(
        type="line",
        yref='paper', y0=0, y1=1,
        xref='x1', x0=x, x1=x,
        line=dict(color="Red", width=2, dash="dashdot"),
    )
    for x in medium_dates
]

fig = px.line(df2, x='date', y='CA_3').update_layout(shapes=l)
fig.show()

## 5. Extracting single item 

In [None]:
dataset = pd.concat([cal,total_value['HOBBIES_1_001_CA_1_validation']],1)

In [None]:
# Head of the dataset
dataset.head()

In [None]:
# Shape of the dataset
dataset.shape

In [None]:
# Info of the dataset
dataset.info()

In [None]:
# Summary statistics
dataset.describe()

**As we can see, the max and min are almost the same for all the columns, so there is no need for scaling.**

In [None]:
# Count of missing values in each column
dataset.isna().sum()

### 5.1 Data preprocessing

In [None]:
dataset.drop(['wm_yr_wk', 'weekday','d', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_TX', 'snap_WI'],1,inplace=True)

In [None]:
dataset['event_name_1'] = dataset['event_name_1'].fillna(0)
dataset['event_name_1'] = np.where(dataset['event_name_1'] != 0,1,0)

In [None]:
dataset.info()

In [None]:
dataset.head()

In [None]:
dataset['date'] = pd.to_datetime(dataset['date'])
dataset['dayofmonth'] = dataset['date'].dt.day
dom = pd.get_dummies(dataset['dayofmonth'],prefix='dayofmonth_',drop_first=True)
month = pd.get_dummies(dataset['month'],prefix='month_',drop_first=True)
year = pd.get_dummies(dataset['year'],prefix='year_',drop_first=True)
wday = pd.get_dummies(dataset['wday'],prefix='wday_',drop_first=True)
dataset.drop(['month','year','dayofmonth','wday'],1,inplace=True)
dataset = pd.concat([dataset,month,year,dom,wday],axis=1)

In [None]:
dataset = dataset.iloc[896:-28,:]
dataset.head()

In [None]:
split_date = '2016-04-24'
Train = dataset.loc[dataset['date'] <= split_date].copy()
Test = dataset.loc[dataset['date'] > split_date].copy()

In [None]:
Train.drop(['date'],1,inplace=True)
Test.drop(['date'],1,inplace=True)

In [None]:
Train.tail()

In [None]:
x_train = Train.drop(['HOBBIES_1_001_CA_1_validation'],1)
y_train = Train['HOBBIES_1_001_CA_1_validation']
x_test = Test.drop(['HOBBIES_1_001_CA_1_validation'],1)

In [None]:
os.chdir('/kaggle/input/wallmart-sales/')
evaluation_df = pd.read_csv('sales_train_evaluation.csv')

In [None]:
y_test = pd.Series(evaluation_df.iloc[0,-28:].values)

In [None]:
x = pd.concat([x_train,x_test],0)
y = pd.concat([y_train,y_test],0)

In [None]:
x.shape

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=54)
pca.fit_transform(x)

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative variance explained')
plt.show()

In [None]:
pca = PCA(n_components=20)
pca.fit(x)
x = pca.transform(x)

In [None]:
x = pd.DataFrame(x)

In [None]:
x_train = x.iloc[:-28,:]
x_test = x.iloc[-28:,:]

In [None]:
# Convert test-period units into sales value. y_test is rebuilt from the evaluation file
# rather than scaled in place, so re-running this cell does not multiply twice.
ITEM_PRICE = 8.38  # presumably the item's sell price over the test window -- TODO confirm
y_test = pd.Series(evaluation_df.iloc[0, -28:].values, dtype=float) * ITEM_PRICE

### 5.2 Building models

### 5.2.1 Linear Regression

In [None]:
# Importing Linear Regression and fitting the model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train,y_train)
lr_pred = lr.predict(x_test)

In [None]:
# Importing metrics and evaluating the model
from sklearn import metrics

In [None]:
y_test

In [None]:
# RMSE score 
lr_rmse = np.sqrt(metrics.mean_squared_error(lr_pred,y_test))
lr_rmse

In [None]:
# R2 score
lr_r2score = metrics.r2_score(lr_pred,y_test)
lr_r2score

In [None]:
# Train score
lr_train = lr.score(x_train,y_train)
lr_train

In [None]:
# Test score
lr_test = lr.score(x_test,y_test)
lr_test

### 5.2.2 Decision Tree  <a id='dt'></a>

In [None]:
# Importing Decision Tree and performing decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(x_train,y_train)
dt_pred = dt.predict(x_test)

In [None]:
# RMSE score for Decision Tree
dt_rmse = np.sqrt(metrics.mean_squared_error(dt_pred,y_test))
dt_rmse

In [None]:
# R2 score for Decision Tree
dt_r2score = metrics.r2_score(dt_pred,y_test)
dt_r2score

In [None]:
# Train score for Decision Tree
dt_train = dt.score(x_train,y_train)
dt_train

In [None]:
# Test score for Decision Tree
dt_test = dt.score(x_test,y_test)
dt_test

**Parameter Tuning**

In [None]:
# Importing Randomizedsearchcv and finding out optimal parameters for Decision Tree
from sklearn.model_selection import RandomizedSearchCV
params = {'max_depth': np.arange(1,20),'criterion':['mse','mae']}
dt = DecisionTreeRegressor()
tree = RandomizedSearchCV(dt, params, cv=3 , return_train_score = True) # RandomizedSearchCV
tree.fit(x,y)# Fit

In [None]:
# optimal parameters
tree.best_params_

In [None]:
# Fitting the model and training and testing after parameter tuning
dtr = DecisionTreeRegressor(criterion='mse',max_depth=1)
dtr.fit(x_train,y_train)
dtr_pred = dtr.predict(x_test)

In [None]:
# RMSE score for DT after parameter tuning
dt_tune_rmse = np.sqrt(mean_squared_error(dtr_pred,y_test))
dt_tune_rmse

In [None]:
# R2 score for DT after parameter tuning
dt_tune_r2score = r2_score(dtr_pred,y_test)
dt_tune_r2score

In [None]:
# Train score for DT after parameter tuning
dt_tune_train = dtr.score(x_train,y_train)
dt_tune_train

In [None]:
# Test score for DT after parameter tuning
dt_tune_test = dtr.score(x_test,y_test)
dt_tune_test

### 5.2.3 Random Forest  <a id='rf'></a>

In [None]:
# Importing Random Forest Regressor and fitting the model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
rf_pred = rf.predict(x_test)

In [None]:
# RMSE score for Random Forest
rf_rmse = np.sqrt(mean_squared_error(rf_pred,y_test))
rf_rmse

In [None]:
# R2 Score for Random Forest
rf_r2score = r2_score(rf_pred,y_test)
rf_r2score

In [None]:
# Train score for Random Forest
rf_train = rf.score(x_train,y_train)
rf_train

In [None]:
# Test score for Random Forest
rf_test = rf.score(x_test,y_test)
rf_test

**Parameter Tuning**

In [None]:
# Using Randomized SearchCV and finding optimal parameters
rf = RandomForestRegressor()
params1 = {'n_estimators': np.arange(1,20),'criterion':['mse','mae']}
forest = RandomizedSearchCV(rf, params, cv=3 , return_train_score = True) # GridSearchCV
forest.fit(x,y)# Fit

In [None]:
# optimal parameters
forest.best_params_

In [None]:
# Random Forest after parameter tuning
rfr = RandomForestRegressor(criterion='mse',max_depth=2)
rfr.fit(x_train,y_train)
rfr_pred = rfr.predict(x_test)

In [None]:
# RMSE score  for Random Forest after parameter tuning
rf_tune_rmse = np.sqrt(metrics.mean_squared_error(rfr_pred,y_test))
rf_tune_rmse

In [None]:
# R2 score for Random Forest after parameter tuning
rf_tune_r2score = metrics.r2_score(rfr_pred,y_test)
rf_tune_r2score

In [None]:
# Train score for Random Forest after parameter tuning
rf_tune_train = rfr.score(x_train,y_train)
rf_tune_train

In [None]:
# Test score for Random Forest after parameter tuning
rf_tune_test = rfr.score(x_test,y_test)
rf_tune_test

### 5.2.4 Support Vector Machine <a id = 'svm'></a>

In [None]:
# Importing Support Vector Regressor and fitting the model
from sklearn.svm import SVR
svm = SVR()
svm.fit(x_train,y_train)
svm_pred = svm.predict(x_test)

In [None]:
# RMSE score for SVM
svm_rmse = np.sqrt(metrics.mean_squared_error(svm_pred,y_test))
svm_rmse

In [None]:
# R2 score for SVM
svm_r2score = metrics.r2_score(svm_pred,y_test)
svm_r2score

In [None]:
# Train score for SVM
svm_train = svm.score(x_train,y_train)
svm_train

In [None]:
# Test score for SVM
svm_test = svm.score(x_test,y_test)
svm_test

**Parameter Tuning**

In [None]:
# Using Randomized Search cv to find the optimal parameters
params2 = {'kernel':['linear','rbf'],'C': [0.01, 0.1, 1, 10],'gamma': [0.01,0.1,1,10]}
svr = SVR()
support = RandomizedSearchCV(svr, params2, cv=3 , return_train_score = True) # RandomizedSearchCV
support.fit(x,y)# Fit

In [None]:
# optimal parameters
support.best_params_

In [None]:
# Fitting the model and training and testing
svrr = SVR(C = 10,gamma = 10,kernel = 'rbf')
svrr.fit(x_train,y_train)
svrr_pred = svrr.predict(x_test)

In [None]:
# RMSE score for SVM after parameter tuning
svm_tune_rmse = np.sqrt(metrics.mean_squared_error(svrr_pred,y_test))

In [None]:
svm_tune_rmse

In [None]:
# R2 score for SVM after parameter tuning
svm_tune_r2score = metrics.r2_score(svrr_pred,y_test)
svm_tune_r2score

In [None]:
# Train score for SVM after parameter tuning
svm_tune_train = svrr.score(x_train,y_train)
svm_tune_train

In [None]:
# Test score for SVM after parameter tuning
svm_tune_test = svrr.score(x_test,y_test)
svm_tune_test

### 5.2.5 KNearest Neighbors <a id = 'knn'></a>

In [None]:
# Importing KNearest Neighbors and fitting the model
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(x_train,y_train)
knn_pred = knn.predict(x_test)

In [None]:
# RMSE score for KNN
knn_rmse = np.sqrt(metrics.mean_squared_error(knn_pred,y_test))
knn_rmse

In [None]:
# R2 score for KNN
knn_r2score = metrics.r2_score(knn_pred,y_test)
knn_r2score

In [None]:
# Train score for KNN
knn_train = knn.score(x_train,y_train)
knn_train

In [None]:
# Test score for KNN
knn_test = knn.score(x_test,y_test)
knn_test

**Parameter Tuning**

In [None]:
# Finding optimal parameters using Randomized Search CV
params4 = {'leaf_size':np.arange(1,50),'n_neighbors':np.arange(1,30),'p':[1,2]}
knn = KNeighborsRegressor()
neighbor = RandomizedSearchCV(knn, params4, cv=3 , return_train_score = True) # RandomizedSearchCV
neighbor.fit(x,y)

In [None]:
# Optimal parameters
neighbor.best_params_

In [None]:
# Fitting the model and training and testing
knn = KNeighborsRegressor(n_neighbors=21,p=1,leaf_size=38)
knn.fit(x_train,y_train)
knnr_pred = knn.predict(x_test)

In [None]:
# RMSE score for KNN after parameter tuning
knn_tune_rmse = np.sqrt(metrics.mean_squared_error(knnr_pred,y_test))
knn_tune_rmse

In [None]:
# R2 score for KNN after parameter tuning
knn_tune_r2score = metrics.r2_score(knnr_pred,y_test)
knn_tune_r2score

In [None]:
# Train score for KNN after parameter tuning
knn_tune_train = knn.score(x_train,y_train)
knn_tune_train

In [None]:
# Test score for KNN after parameter tuning
knn_tune_test = knn.score(x_test,y_test)
knn_tune_test

### 5.2.6 Ada Boost Regressor <a id = 'ada'></a>

In [None]:
# Fitting the Ada Boost model
from sklearn.ensemble import AdaBoostRegressor
ab = AdaBoostRegressor()
ab.fit(x_train,y_train)
ab_pred = ab.predict(x_test)

In [None]:
# RMSE score for Ada Boost
ab_rmse = np.sqrt(metrics.mean_squared_error(ab_pred,y_test))
ab_rmse

In [None]:
# R2 score for Ada Boost
ab_r2score = metrics.r2_score(ab_pred,y_test)
ab_r2score

In [None]:
# Train score for Ada Boost
ab_train = ab.score(x_train,y_train)
ab_train

In [None]:
# Test score for Ada Boost
ab_test = ab.score(x_test,y_test)
ab_test

**Parameter Tuning**

In [None]:
# Finding the optimal parameters for Ada Boost Regressor using Randomized Search CV

param_grid1 = {"n_estimators": range(5,20,2) ,  
              "learning_rate": [0.01,0.05,0.1,0.5,1],'loss':['linear','square','exponential']}
 

AB = RandomizedSearchCV(ab,param_distributions=param_grid1,
                           cv = 5,
                           n_jobs=-1,
                           verbose=2)
AB.fit(x,y)

In [None]:
# optimal parameters
AB.best_params_

In [None]:
# Fitting the model after parameter tuning
abr = AdaBoostRegressor(n_estimators=9,learning_rate=0.05,loss='linear')
abr.fit(x_train,y_train)
abr_pred = abr.predict(x_test)

In [None]:
# RMSE score for Ada Boost after parameter tuning
ab_tune_rmse = np.sqrt(metrics.mean_squared_error(abr_pred,y_test))
ab_tune_rmse

In [None]:
# R2 score for Ada Boost after parameter tuning
ab_tune_r2score = metrics.r2_score(abr_pred,y_test)
ab_tune_r2score

In [None]:
# Train score for Ada Boost after parameter tuning
ab_tune_train = abr.score(x_train,y_train)
ab_tune_train

In [None]:
# Test score for Ada Boost after parameter tuning
ab_tune_test = abr.score(x_test,y_test)
ab_tune_test

### 5.2.7 Gradient Boosting <a id = 'grad'></a>

In [None]:
# Fitting the Gradient Boost model
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(x_train,y_train)
gb_pred = gb.predict(x_test)

In [None]:
# RMSE score for Gradient Boosting
gb_rmse = np.sqrt(metrics.mean_squared_error(gb_pred,y_test))
gb_rmse

In [None]:
# R2 score for Gradient Boosting
gb_r2score = metrics.r2_score(gb_pred,y_test)
gb_r2score

In [None]:
# Train score for Gradient Boosting
gb_train = gb.score(x_train,y_train)
gb_train

In [None]:
# Test score for Gradient Boosting
gb_test = gb.score(x_test,y_test)
gb_test

**Parameter tuning**

In [None]:
# Finding the optimal parameters For Gradient Boosting Regressor using Randomized Search CV

param_grid1 = {"n_estimators": range(5,20,2) ,  
              "learning_rate": [0.01,0.05,0.1,0.5,1]}
 

GB = RandomizedSearchCV(gb,param_distributions=param_grid1,
                           cv = 5,
                           n_jobs=-1,
                           verbose=2)
GB.fit(x,y)

In [None]:
# optimal parameters
GB.best_params_

In [None]:
# Fitting the Gradient model after parameter tuning
gbr = GradientBoostingRegressor(n_estimators=9,learning_rate=0.01)
gbr.fit(x_train,y_train)
gbr_pred = gbr.predict(x_test)

In [None]:
# RMSE score for Gradient Boosting after parameter tuning
gb_tune_rmse = np.sqrt(metrics.mean_squared_error(gbr_pred,y_test))
gb_tune_rmse

In [None]:
# R2 score for Gradient Boosting after parameter tuning
gb_tune_r2score = metrics.r2_score(gbr_pred,y_test)
gb_tune_r2score

In [None]:
# Train score for Gradient Boosting after parameter tuning
gb_tune_train = gbr.score(x_train,y_train)
gb_tune_train

In [None]:
# Test score for parameter tuning
gb_tune_test = gbr.score(x_test,y_test)
gb_tune_test

### 5.2.8 XG Boost

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# Fitting the model
xgb = XGBRegressor()

In [None]:
# Training the model
xgb.fit(x_train, y_train)

In [None]:
# Testing the model
xgb_pred=xgb.predict(x_test)

In [None]:
# RMSE score for XG Boost
xgb_rmse = np.sqrt(metrics.mean_squared_error(xgb_pred,y_test))
xgb_rmse

In [None]:
# R2 score for XG Boost
xgb_r2score = metrics.r2_score(xgb_pred,y_test)
xgb_r2score

In [None]:
# Train score for XG Boost
xgb_train = xgb.score(x_train,y_train)
xgb_train

In [None]:
# Test score for XG Boost
xgb_test = xgb.score(x_test,y_test)
xgb_test

**Parameter Tuning**

In [None]:
# Finding optimal parameters for XG Boost Regressor using Randomized Search CV
param_grid1 = {"max_depth": [10,15,20,30],
              "n_estimators": range(5,20,2) , 
              "gamma": [0.03,0.05], 
              "learning_rate": [0.01,0.05]}
 

XGB = RandomizedSearchCV(xgb,param_distributions=param_grid1,
                           cv = 5)
XGB.fit(x,y)

In [None]:
# optimal parameters
XGB.best_params_

In [None]:
# Fitting the model after parameter tuning
xgbr = XGBRegressor(n_estimators=9,max_depth=10,learning_rate=0.05,gamma=0.03)

In [None]:
# Training the model after parameter tuning
xgbr.fit(x_train,y_train)

In [None]:
# Testing the model after parameter tuning
xgbr_pred = xgbr.predict(x_test)

In [None]:
# RMSE score for XG Boost after parameter tuning
xgb_tune_rmse = np.sqrt(metrics.mean_squared_error(xgbr_pred,y_test))
xgb_tune_rmse

In [None]:
# R2 score for XG Boost after parameter tuning
xgb_tune_r2score = metrics.r2_score(xgbr_pred,y_test)
xgb_tune_r2score

In [None]:
# Train score for XG Boost after parameter tuning
xgb_tune_train = xgbr.score(x_train,y_train)
xgb_tune_train

In [None]:
# Test score for XG Boost after parameter tuning
xgb_tune_test = xgbr.score(x_test,y_test)
xgb_tune_test

### 5.2.9 Arima models

In [None]:
df_arima_train = Train['HOBBIES_1_001_CA_1_validation']

In [None]:
!python3.7 -m pip install --upgrade pip

In [None]:
!pip install pmdarima

In [None]:
from pmdarima.arima import auto_arima
stepwise_model = auto_arima(df_arima_train,start_p=1,start_q=1,max_p=3,max_q=3,m=7,start_P=0,seasonal=True,d=1,D=1,trace=True,error_action='ignore',suppress_warnings=True,stepwise=True)

In [None]:
ar_day_pred = stepwise_model.predict(n_periods=28)
ar_day_rmse = np.sqrt(metrics.mean_squared_error(ar_day_pred, y_test))
ar_day_rmse

In [None]:
stepwise_model1 = auto_arima(df_arima_train,start_p=1,start_q=1,max_p=3,max_q=3,m=12,start_P=0,seasonal=True,d=1,D=1,trace=True,error_action='ignore',suppress_warnings=True,stepwise=True)
ar_month_pred = stepwise_model1.predict(n_periods=28) 

In [None]:
ar_month_rmse = np.sqrt(metrics.mean_squared_error(ar_month_pred,y_test))
ar_month_rmse

### 5.2.10 Comparison Table

In [None]:
# Creating dictionary for all the metrics and models
metrics_dict = {'Metrics': ['Before Parameter Tune Train Score','Before Parameter Tune Test Score','After Parameter Tune Train Score','After Parameter Tune Test Score','Before Parameter Tune RMSE Score','After Parameter Tune RMSE Score','Before Parameter Tune R2 Score','After Parameter Tune R2 Score'],'Linear Regression':[lr_train,lr_test,'NA','NA',lr_rmse,'NA',lr_r2score,'NA'],
          'Decision Tree Regressor':[dt_train,dt_test,dt_tune_train,dt_tune_train,dt_rmse,dt_tune_rmse,dt_r2score,dt_tune_r2score],'Ramdom Forest Regressor':[rf_train,rf_test,rf_tune_train,rf_tune_test,rf_rmse,rf_tune_rmse,rf_r2score,rf_tune_r2score],'Support Vector Regressor':[svm_train,svm_test,'NA','NA',svm_rmse,'NA',svm_r2score,'NA'],
          'KNearestNeighbor Regressor':[knn_train,knn_test,knn_tune_train,knn_tune_test,knn_rmse,knn_tune_rmse,knn_r2score,knn_tune_r2score],
          'XG Boost Regressor':[xgb_train,xgb_test,xgb_tune_train,xgb_tune_test,xgb_rmse,xgb_tune_rmse,xgb_r2score,xgb_tune_r2score],
          'Ada Boost Regressor':[ab_train,ab_test,ab_tune_train,ab_tune_test,ab_rmse,ab_tune_rmse,ab_r2score,ab_tune_r2score],
          'Gradient Boosting Regressor':[gb_train,gb_test,gb_tune_train,gb_tune_test,gb_rmse,gb_tune_rmse,gb_r2score,gb_tune_r2score]}

In [None]:
# Converting dictionary to dataframe
metrics_df = pd.DataFrame(metrics_dict)

In [None]:
# Dataframe of metrics
metrics_df

### 5.3 Perform the Stacking models Voting and Mlxtend and analyze the metrics

#### 5.3.1 Voting Regressor 

In [None]:
# Assigning estimator models for voting classifier
vote_est = [('lr',lr),('ab',ab),('dt',dt)]

In [None]:
# Importing Voting Regressor
from sklearn.ensemble import VotingRegressor
vote = VotingRegressor(estimators=vote_est)

In [None]:
# Fitting the model
vote.fit(x_train,y_train)

In [None]:
# Testing the model
vote_pred = vote.predict(x_test)

In [None]:
# Importing metrics
from sklearn import metrics

In [None]:
# RMSE score for Voting Regressor
vote_rmse = np.sqrt(metrics.mean_squared_error(vote_pred,y_test))
vote_rmse

In [None]:
# R2 score for Voting Regressor
vote_r2score = metrics.r2_score(vote_pred,y_test)
vote_r2score

In [None]:
# Train score for Voting Regressor
vote_train = vote.score(x_train,y_train)
vote_train

In [None]:
# Test score for Voting Regressor
vote_test = vote.score(x_test,y_test)
vote_test

#### 5.3.2 Mlxtend Stacking Regressor

In [None]:
# mlxtend regressor
from mlxtend.regressor import StackingRegressor

In [None]:
# Assigning individual models to variables
xgb = XGBRegressor()
ada = AdaBoostRegressor()
grad = GradientBoostingRegressor()

In [None]:
# Fitting the model
st = StackingRegressor(regressors=[dt,ab,rf,xgb,ada,grad],meta_regressor=lr)

In [None]:
# Training the model
st.fit(x_train,y_train)

In [None]:
# Testing the model
st_pred = st.predict(x_test)

In [None]:
# RMSE score for Stacking Regressor
st_rmse = np.sqrt(metrics.mean_squared_error(st_pred,y_test))
st_rmse

In [None]:
# R2 score for Stacking Regressor
st_r2score = metrics.r2_score(st_pred,y_test)
st_r2score

In [None]:
# Train score for Stacking Regressor
st_train = st.score(x_train,y_train)
st_train

In [None]:
# Test score for Stacking Regressor
st_test = st.score(x_test,y_test)
st_test

### 5.4 Create a dataframe with model Stacking model names and metric scores and compare along with the first dataframe and give inference

In [None]:
# Creating dictionary for all the metrics and converting it to dataframe
metrics_stack = {'Models': ['Voting Regressor','Stacking Regressor'],'RMSE score':[vote_rmse,st_rmse],'R2 Score':[vote_rmse,st_rmse],'Train score':[vote_train,st_train],'Test score':[vote_test,st_test]}

metrics_stack = pd.DataFrame(metrics_stack)

In [None]:
# Dataframe
metrics_stack

The best model is the Voting Regressor.

### 5.5 Performing Vecstack

In [None]:
# Importing Vecstack
from vecstack import stacking

In [None]:
#1st level model
models = [lr,ab,dt,svm]
S_train, S_test = stacking(models, x_train, y_train, x_test, 
    regression = True, metric = metrics.r2_score, n_folds = 4 , 
    shuffle = True, random_state = 0, verbose = 2)

In [None]:
#2nd level model
models = [knn,xgb,grad,ada]
S_train, S_test = stacking(models, x_train, y_train, x_test, 
    regression = True, metric = metrics.r2_score, n_folds = 4 , 
    shuffle = True, random_state = 0, verbose = 2)