In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)



In [None]:
# Single source of truth for the competition data folder, so the location can
# be changed in one place (e.g. when running the notebook outside Kaggle).
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

# Load every competition table into its own DataFrame.
items = pd.read_csv(f'{DATA_DIR}/items.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
items_cat = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first rows of the table loaded from sales_train.csv.
train.head()

# Merging data into one piece

In [None]:
df_items = pd.merge(left = items,right=items_cat,on = 'item_category_id')

In [None]:
df_shops = pd.merge(left = train,right=shops,on = 'shop_id')

In [None]:
df = pd.merge(left = df_shops , right = df_items,on = 'item_id')

In [None]:
df.head()

# Understanding the data

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics for the object (string) columns.
df.describe(include = 'object')

# Data cleaning and EDA

In [None]:
df.columns

In [None]:
df1=df.drop(['shop_id','item_id','item_category_id'],axis=1)

In [None]:
df1['date'] = pd.to_datetime(df1['date'],dayfirst=True ,format = '%d.%m.%Y')

In [None]:
df1['year'] = df1['date'].dt.year
df1['month'] = df1['date'].dt.month

In [None]:
df1.head()

In [None]:
df1['year'].value_counts().plot(kind = 'bar')

In [None]:
df1['month'].value_counts().plot(kind = 'bar')

In [None]:
#As we extracted year and month we don't need  date column any more
df_new = df1.drop('date',axis=1)

In [None]:
df_new['year'] = df_new['year'].replace({2013:1,2014:2,2015:3})

In [None]:
df_new['year'] = df_new['year'].astype('object')

In [None]:
df_new['month'] =  df_new['month'].astype('object')

In [None]:
df_new.columns

In [None]:
features = df_new[['date_block_num','item_price','item_cnt_day','year','month']]

In [None]:
for i in features.columns:
    for j in features.columns:
        if i != j:
            sns.scatterplot(x = df_new[i],y = df_new[j])
            plt.show()

In [None]:
df_new.isnull().sum()

In [None]:
# Box plot of every numeric column to eyeball outliers.
# The series is passed explicitly as `x=`: a bare positional argument is read
# as `x` (with a FutureWarning) by older seaborn releases but as `data` by
# newer ones, which changes the orientation of the plot between versions.
for col in df_new.select_dtypes(include = 'number').columns:
    sns.boxplot(x = df_new[col])
    plt.show()

In [None]:
# Treating outliers with the 1.5 * IQR rule (the target item_cnt_day is skipped).
# Values outside [q1 - 1.5*iqr, q3 + 1.5*iqr] are replaced by NaN.
# The fences are inclusive: with strict comparisons a column whose IQR is 0
# (so ll == ul) would be turned into NaN in its entirety.
for col in df_new.select_dtypes(include = 'number').columns:
    if col == 'item_cnt_day':
        continue
    q1 = df_new[col].quantile(0.25)
    q3 = df_new[col].quantile(0.75)
    iqr = q3 - q1
    ll = q1 - (1.5 * iqr)
    ul = q3 + (1.5 * iqr)
    # Series.where keeps the in-range values and masks everything else with NaN
    # (existing NaN stay NaN because between() is False for them).
    df_new[col] = df_new[col].where(df_new[col].between(ll, ul))


In [None]:
df_new.isnull().sum()

In [None]:
df_new['item_price'].skew()

In [None]:
df_new['item_price'] = df_new['item_price'].fillna(df_new['item_price'].median())

In [None]:
df_new['year'] = df_new['year'].astype('int64')

In [None]:
df_new['month'] =  df_new['month'].astype('int64')

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
for i in df_new.select_dtypes('object').columns:
    df_new[i] = le.fit_transform(df_new[i])

In [None]:
df_new.head()

# Converting the data into dependent and independent features

In [None]:
# First rows of the fully encoded frame.
df_new.head()

In [None]:
# Column dtypes and non-null counts before modelling.
df_new.info()

In [None]:
# Features: every column except the target.
x = df_new.drop('item_cnt_day',axis=1)
# Target: natural log of item_cnt_day. The log is undefined for counts <= 0,
# so those are masked to NaN *before* the transform. np.log would otherwise
# emit RuntimeWarnings and map 0 to -inf, which fillna() does not repair.
positive_cnt = df_new['item_cnt_day'].where(df_new['item_cnt_day'] > 0)
y = np.log(positive_cnt)
# Masked rows are imputed with the median log-count.
y = y.fillna(y.median())

# Model Building

In [None]:
import statsmodels.api as sm
import sklearn.metrics
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE

In [None]:
# Ordinary least squares on all features plus a constant column.
xc = sm.add_constant(x)
ols_spec = sm.OLS(y, xc)
model = ols_spec.fit()
print(model.summary())

In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=0.7,random_state=7)

In [None]:
lr = LinearRegression()
model_v1 = lr.fit(xtrain,ytrain)
ypred = model_v1.predict(xtest)

In [None]:
lr_scores = cross_val_score(estimator = lr,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
lr_rmse = np.mean(lr_scores)
print('RMSE = ',lr_rmse)
print(np.std(lr_scores))

In [None]:
rf = RFE(estimator = lr,n_features_to_select=5)
rf.fit(x,y)

In [None]:
rfe_df = pd.DataFrame({'features':x.columns,'ranking':rf.ranking_})

In [None]:
rfe_df

In [None]:
# as item_price and item_name is not important we are going to drop it
x = x.drop(['item_price','item_name'],axis=1)


In [None]:
# We can do OLS again
xc = sm.add_constant(x)
model = sm.OLS(y,xc).fit()
model.summary()

In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=0.7,random_state=7)

In [None]:
model_v2 = lr.fit(xtrain,ytrain)
ypred_lr = model_v2.predict(xtest)

In [None]:
lr_scores = cross_val_score(estimator = lr,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
lr_rmse = np.mean(lr_scores)
print('RMSE = ',lr_rmse)
print(np.std(lr_scores))

In [None]:
#RMSE and stability improved a little bit

In [None]:
# Residuals of the refitted linear regression (model_v2) on the hold-out set.
# `ypred_lr` is used here: `ypred` holds the predictions of the earlier
# model_v1 fit, made before item_price / item_name were dropped from x.
resid = pd.DataFrame({'Predicted' : ypred_lr , 'Actuals':ytest,'Residual':ypred_lr-ytest})

In [None]:
# Residuals plotted against the predicted values.
plt.scatter(resid.Predicted, resid.Residual)
plt.grid()

In [None]:
# most of the residuals (predicted - actual) lie between 0 and -6

In [None]:
rid = Ridge()
model_v3 = rid.fit(xtrain,ytrain)
ypred_rid = model_v3.predict(xtest)

In [None]:
rid_scores = cross_val_score(estimator = rid,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
rid_rmse = np.mean(rid_scores)
print('RMSE = ',rid_rmse)
print(np.std(rid_scores))

In [None]:
# RMSE is not much different compared to Linear Regression

In [None]:
resid = pd.DataFrame({'Predicted' : ypred_rid , 'Actuals':ytest,'Residual':ypred_rid-ytest})
plt.scatter(resid['Predicted'],resid['Residual'])
plt.grid()

In [None]:
# even with feature selection there is not much of an improvement

In [None]:
# Let's move to non-linear models
# x is rebuilt from df_new so it contains every feature column again.
x = df_new.drop('item_cnt_day',axis=1)
# Target: natural log of item_cnt_day. The log is undefined for counts <= 0,
# so those are masked to NaN *before* the transform. np.log would otherwise
# emit RuntimeWarnings and map 0 to -inf, which fillna() does not repair.
positive_cnt = df_new['item_cnt_day'].where(df_new['item_cnt_day'] > 0)
y = np.log(positive_cnt)
# Masked rows are imputed with the median log-count.
y = y.fillna(y.median())


In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=0.7,random_state=7)

In [None]:
knn = KNeighborsRegressor()
model_v4 = knn.fit(xtrain,ytrain)
ypred_knn = model_v4.predict(xtest)

In [None]:
knn_scores = cross_val_score(estimator = knn,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
knn_rmse = np.mean(knn_scores)
print('RMSE = ',knn_rmse)
print(np.std(knn_scores))

In [None]:
# error is increased and stability stability is reduced

In [None]:
resid = pd.DataFrame({'Predicted' : ypred_knn , 'Actuals':ytest,'Residual':ypred_knn-ytest})
plt.scatter(resid['Predicted'],resid['Residual'])
plt.grid()

In [None]:
dt = DecisionTreeRegressor()
model_v5 = dt.fit(xtrain,ytrain)
ypred_dt = model_v5.predict(xtest)

In [None]:
dt_scores = cross_val_score(estimator = dt,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
dt_rmse = np.mean(dt_scores)
print('RMSE = ',dt_rmse)
print(np.std(dt_scores))

In [None]:
# here both error rate and stability is depleted

In [None]:
resid = pd.DataFrame({'Predicted' : ypred_dt , 'Actuals':ytest,'Residual':ypred_dt-ytest})
plt.scatter(resid['Predicted'],resid['Residual'])
plt.grid()

In [None]:
gb = GradientBoostingRegressor()
model_v6 = gb.fit(xtrain,ytrain)
ypred_gb = model_v6.predict(xtest)

In [None]:
gb_scores = cross_val_score(estimator = gb,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
gb_rmse = np.mean(gb_scores)
print('RMSE = ',gb_rmse)
print(np.std(gb_scores))

In [None]:
resid = pd.DataFrame({'Predicted' : ypred_gb , 'Actuals':ytest,'Residual':ypred_gb-ytest})
plt.scatter(resid['Predicted'],resid['Residual'])
plt.grid()

In [None]:
xgb = XGBRegressor()
model_v7 = xgb.fit(xtrain,ytrain)
ypred_xgb = model_v7.predict(xtest)

In [None]:
xgb_scores = cross_val_score(estimator = xgb,X=x,y=y,scoring = 'neg_root_mean_squared_error',cv = 5)
xgb_rmse = np.mean(xgb_scores)
print('RMSE = ',xgb_rmse)
print(np.std(xgb_scores))

In [None]:
resid = pd.DataFrame({'Predicted' : ypred_xgb , 'Actuals':ytest,'Residual':ypred_xgb-ytest})
plt.scatter(resid['Predicted'],resid['Residual'])
plt.grid()

In [None]:
scores_df = pd.DataFrame({'Models':['Linear Regression','Ridge','KNN','Decision Tree','Gradient Boosting','XG Boosting'],
                         'Scores_RMSE' : [lr_rmse,rid_rmse,knn_rmse,dt_rmse,gb_rmse,xgb_rmse]})

In [None]:
scores_df.sort_values('Scores_RMSE',ascending=False)

In [None]:
# XGBoost is the best predictor of all the models compared