In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd# data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "darkgrid")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing machine learning packages

import statsmodels.api as sm
from sklearn.preprocessing import scale
from sklearn import linear_model, metrics
from sklearn import metrics
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor

### 1. Loading Data

In [None]:
# Load the five competition tables from the Kaggle input folder.

DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

sales_item = pd.read_csv(f'{DATA_DIR}/items.csv')
sales_item_category = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
sales_shops = pd.read_csv(f'{DATA_DIR}/shops.csv')

# The competition file writes dates day-first (dd.mm.yyyy). Without dayfirst=True an
# ambiguous value such as 02.01.2013 is read as 1 February instead of 2 January, which
# corrupts the day/month features derived from this column later in the notebook.
sales_train = pd.read_csv(
    f'{DATA_DIR}/sales_train.csv',
    parse_dates=['date'],
    dayfirst=True,
    dtype={'date': 'str', 'date_block_num': 'int32', 'shop_id': 'int32', 'item_id': 'int32',
           'item_price': 'float32', 'item_cnt_day': 'int32'},
)
sales_test = pd.read_csv(f'{DATA_DIR}/test.csv')


In [None]:
# Row/column counts of every loaded table, in load order

for frame in (sales_item, sales_item_category, sales_shops, sales_train, sales_test):
    print(frame.shape)

In [None]:
# First rows of every table, each preceded by its label
labelled_frames = (
    ("Items", sales_item),
    ("Item Category", sales_item_category),
    ("Shops", sales_shops),
    ("Train", sales_train),
    ("Test", sales_test),
)
for label, frame in labelled_frames:
    print(label + " : \n", frame.head())

In [None]:
# Number of missing values per column, table by table

tables_to_check = (
    ("Items", sales_item),
    ("Item Category", sales_item_category),
    ("Shops", sales_shops),
    ("Train", sales_train),
    ("Test", sales_test),
)
for label, frame in tables_to_check:
    print(label + " : \n", frame.isnull().sum())

In [None]:
# Number of fully duplicated rows in the train dataset

duplicate_rows = sales_train.duplicated()
print('Duplicates in Train : ', int(duplicate_rows.sum()))

In [None]:
sales_train[sales_train.duplicated()]

In [None]:
sales_train.info()

In [None]:
sales_train.describe()

We downcast the numeric columns of the DataFrame to smaller dtypes to save memory.

In [None]:
def datatype_downcast(df):
    """Shrink the numeric columns of ``df`` in place to reduce memory usage.

    * ``float64`` columns are converted to ``float32``.
    * ``int64`` / ``int32`` columns are converted to ``int16`` when every value
      fits in that range; otherwise to ``int32`` when that fits; otherwise they
      are left unchanged. (A blind cast to ``int16`` would silently wrap values
      outside [-32768, 32767].)
    * All other columns are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == 'float64':
            df[col] = df[col].astype(np.float32)
        elif dtype in ('int64', 'int32'):
            # Try the narrowest candidate first and stop at the first one that can hold the data.
            for target in (np.int16, np.int32):
                limits = np.iinfo(target)
                fits = df[col].empty or (df[col].min() >= limits.min and df[col].max() <= limits.max)
                if fits:
                    if dtype != target:
                        df[col] = df[col].astype(target)
                    break
    return df

sales_train = datatype_downcast(sales_train)

sales_train.info()

In [None]:
sales_dataset = sales_train.copy()

In [None]:
sales_dataset[sales_dataset['item_cnt_day'] == 2169.0]

In [None]:
# Per (month block, shop, item): mean and latest sale date, mean price, total units sold.
# The columns are selected with a list: selecting several groupby columns with a bare tuple
# (['a','b','c'] written without the inner brackets) is not accepted by current pandas.
sales_monthly = (
    sales_dataset
    .groupby(['date_block_num', 'shop_id', 'item_id'])[['date', 'item_price', 'item_cnt_day']]
    .agg({'date': ['mean', 'max'], 'item_price': 'mean', 'item_cnt_day': 'sum'})
)


In [None]:
sales_monthly.head()

In [None]:
sales_monthly.columns

### 2. EDA

In [None]:
# Total units sold in each month block, drawn as a line
units_by_block = sales_dataset.groupby('date_block_num')['item_cnt_day']
monthly_sales = units_by_block.sum()
monthly_sales.plot()

In [None]:
plt.figure(figsize=(10,10))

# Correlation is only meaningful for numeric columns. The frame also holds the parsed
# 'date' column, so restrict it explicitly instead of relying on corr() to drop it.
numeric_sales = sales_dataset.select_dtypes(include='number')
sns.heatmap(numeric_sales.corr(), cmap='viridis', annot=True)

In [None]:
plt.rcParams['figure.figsize'] = (25, 10)

# items.csv is the item catalogue, so the quantity available per category is the number of
# distinct items. (The previous bar plot showed the mean item_id per category, which has no
# meaning, and passed x/y positionally, which recent seaborn releases do not accept.)
items_per_category = sales_item.groupby('item_category_id')['item_id'].nunique()

sns.barplot(x=items_per_category.index, y=items_per_category.values, palette='magma')
plt.title('Number of Items Per Category', fontsize = 40)
plt.xlabel('Item Categories', fontsize = 20)
plt.ylabel('Items', fontsize = 20)
plt.show()

In [None]:
# Number of sales records per month block.
# The column is passed as x=: recent seaborn treats a first positional argument as `data`.
sns.countplot(x=sales_dataset['date_block_num'], palette='viridis')
plt.show()

In [None]:
# Number of sales records per shop.
# The column is passed as x=: recent seaborn treats a first positional argument as `data`.
sns.countplot(x=sales_dataset['shop_id'], palette='viridis')
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (30, 20)

# Count the catalogue items in each category and label the bars with the category name.
# (Previously category names from one table were plotted against item ids from another
# table of a different length, so the bars did not describe the categories at all.)
item_counts = sales_item.groupby('item_category_id')['item_id'].nunique()
category_names = sales_item_category.set_index('item_category_id')['item_category_name']

sns.barplot(x=category_names.reindex(item_counts.index).values, y=item_counts.values, palette='magma')
plt.title('Number of Items Per Category', fontsize = 40)
plt.xlabel('Item Categories', fontsize = 20)
plt.ylabel('Items', fontsize = 20)
plt.xticks(rotation=90)  # the category names are long; keep them readable
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
plt.rcParams['figure.figsize'] = (10, 10)
stopwords = set(STOPWORDS)

# Build the text from the names themselves. str(Series) would feed the Series *repr* to the
# word cloud, i.e. the row numbers and the "Name: ..., dtype: object" footer as well.
shop_text = ' '.join(sales_shops['shop_name'].astype(str))

wordcloud = WordCloud(background_color = 'white',
                      colormap='magma',
                      max_words = 100,
                      stopwords = stopwords ,
                      width = 1200,
                      height = 800,
                      random_state = 30).generate(shop_text)


plt.title('Wordcloud for Shop Names', fontsize = 25)
plt.axis('off')
plt.imshow(wordcloud, interpolation = 'bilinear')

In [None]:
plt.rcParams['figure.figsize'] = (10, 10)
stopwords = set(STOPWORDS)

# Build the text from the names themselves. str(Series) would feed the Series *repr* to the
# word cloud: row numbers, the dtype footer, and (for long Series) only the first and last
# few rows, because the repr is truncated.
category_text = ' '.join(sales_item_category['item_category_name'].astype(str))

wordcloud = WordCloud(background_color = 'white',
                      colormap='viridis',
                      max_words = 100,
                      stopwords = stopwords ,
                      width = 1200,
                      height = 800,
                      random_state = 30).generate(category_text)


plt.title('Wordcloud for Item Category Names', fontsize = 25)
plt.axis('off')
plt.imshow(wordcloud, interpolation = 'bilinear')

In [None]:
## Creating Day, Month and Year

# The vectorised .dt accessor replaces three separate Python loops over every row.
# The results stay plain lists because later cells plot them and assign them as columns.
sale_dates = sales_dataset['date']

days = sale_dates.dt.day.tolist()
months = sale_dates.dt.month.tolist()
years = sale_dates.dt.year.tolist()

In [None]:
plt.rcParams['figure.figsize'] = (15, 7)
# Passed as x=: recent seaborn treats a first positional argument as `data`.
sns.countplot(x=days, palette= 'mako')
plt.title('The busiest days for the shops', fontsize = 24)
plt.xlabel('Days', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)

plt.show()

In [None]:
# busy month
plt.rcParams['figure.figsize'] = (15, 7)
# Passed as x=: recent seaborn treats a first positional argument as `data`.
sns.countplot(x=months, palette= 'rocket')
plt.title('The busiest months for the shops', fontsize = 24)
plt.xlabel('Months', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)

plt.show()


In [None]:

# busy year
plt.rcParams['figure.figsize'] = (15, 7)
# Passed as x=: recent seaborn treats a first positional argument as `data`.
sns.countplot(x=years, palette= 'viridis')
plt.title('The busiest years for the shops', fontsize = 24)
plt.xlabel('Years', fontsize = 12)
plt.ylabel('Frequency', fontsize = 12)

plt.show()

In [None]:
# Attach the extracted calendar parts to the sales table as new columns
for column_name, values in (('day', days), ('month', months), ('year', years)):
    sales_dataset[column_name] = values

In [None]:
sales_dataset.head()

In [None]:
# Sales records per shop in February 2013.
# Passed as x=: recent seaborn treats a first positional argument as `data`.
is_feb_2013 = (sales_dataset.month == 2) & (sales_dataset.year == 2013)
sns.countplot(x=sales_dataset.loc[is_feb_2013, 'shop_id'], palette='magma')

In [None]:
# Box plot of item prices; the x axis runs from the minimum to 110% of the maximum
prices = sales_dataset.item_price
lower, upper = prices.min(), prices.max()*1.1

plt.figure(figsize=(10,5))
plt.xlim(lower, upper)
sns.boxplot(x=prices)

In [None]:
# Box plot of daily item counts; the x axis runs from the minimum to 110% of the maximum
daily_counts = sales_dataset.item_cnt_day
lower, upper = daily_counts.min(), daily_counts.max()*1.1

plt.figure(figsize=(10,4))
plt.xlim(lower, upper)
sns.boxplot(x=daily_counts)

In [None]:
# Keep only rows below both outlier thresholds (price < 100000 and daily count < 1200)
price_ok = sales_dataset['item_price'] < 100000
count_ok = sales_dataset['item_cnt_day'] < 1200
sales_dataset = sales_dataset[price_ok & count_ok]

In [None]:
sales_dataset.shape

In [None]:
sales_dataset[sales_dataset['item_price']<0]

In [None]:
# Median positive price of item 2973 in shop 32 during month block 4
same_shop = sales_dataset.shop_id == 32
same_item = sales_dataset.item_id == 2973
same_block = sales_dataset.date_block_num == 4
positive_price = sales_dataset.item_price > 0

reference_rows = sales_dataset[same_shop & same_item & same_block & positive_price]
sales_median = reference_rows.item_price.median()
sales_median

In [None]:
# Replace negative prices with the reference median computed above.
# Series.mask does this in one vectorised pass instead of one Python call per row.
negative_price = sales_dataset["item_price"] < 0
sales_dataset["item_price"] = sales_dataset["item_price"].mask(negative_price, sales_median)

In [None]:
sales_dataset[sales_dataset['item_price']<0]

In [None]:
sales_dataset[sales_dataset['item_cnt_day'] < 0]

In [None]:
# Negative daily counts are floored at 0.
# clip() is vectorised, replacing a Python lambda evaluated once per row.
sales_dataset['item_cnt_day'] = sales_dataset['item_cnt_day'].clip(lower=0)


In [None]:
sales_dataset[sales_dataset['item_cnt_day'] < 0]

### 3. Data Preprocessing

In [None]:
sales_dataset.head()

In [None]:
# Distinct item ids in the catalogue, in the training sales and in the test set

item_sources = (
    ("Unique Item : ", sales_item),
    ("Unique Item (Train) : ", sales_dataset),
    ("Unique Item (Test) : ", sales_test),
)
for label, frame in item_sources:
    print(label, frame['item_id'].nunique())

In [None]:
# Distinct shop ids in the shop list, in the training sales and in the test set.
# (These lines count shops, so they are labelled "Shop" rather than "Item".)
print("Unique Shop : ",sales_shops['shop_id'].nunique())
print("Unique Shop (Train) : ",sales_dataset['shop_id'].nunique())
print("Unique Shop (Test) : ",sales_test['shop_id'].nunique())

In [None]:
# Items that must be predicted (present in test) but never appear in the training sales.
test_item_list = list(np.unique(sales_test['item_id']))
train_item_list = list(np.unique(sales_dataset['item_id']))

# Membership is tested against a set: looking each test id up in a list rescans the
# whole list for every id.
train_item_set = set(train_item_list)
missing_item_ids_ = [item_id for item_id in test_item_list if item_id not in train_item_set]
len(missing_item_ids_)

Shop Data Processing

In [None]:
sales_shops

In [None]:
def _drop_bang_prefix(name):
    """For a name starting with '!', return the segment between the first and second '!'
    (the rest of the name when there is no second '!'); otherwise return it unchanged."""
    if not name.startswith('!'):
        return name
    return name.split('!')[1]


sales_shops['shop_name'] = sales_shops['shop_name'].map(_drop_bang_prefix)

In [None]:
# Write this one city name as a single word so the first-token split used for the
# city column picks up the whole name.
shop_name_fixes = {'Сергиев Посад ТЦ "7Я"': 'СергиевПосад ТЦ "7Я"'}
sales_shops['shop_name'] = sales_shops['shop_name'].replace(shop_name_fixes)

In [None]:
sales_shops['shop_name']

In [None]:
# The city is taken to be the first space-separated token of the shop name
sales_shops['shop_city'] = [name.split(" ")[0] for name in sales_shops['shop_name']]
# Integer code per distinct city, numbered in order of first appearance
city_codes, _distinct_cities = pd.factorize(sales_shops['shop_city'])
sales_shops['city_code'] = city_codes

In [None]:
sales_shops

In [None]:
# Per-shop summary of the cleaned sales: number of distinct items sold and the
# minimum / maximum / mean item price.
# One groupby pass replaces four full-table filters per shop, and the values are attached
# by matching on the shop_id *column* (the previous .loc[shop_id, ...] wrote to the row whose
# index label equals shop_id, which is only right while the index happens to equal shop_id).
shop_stats = sales_dataset.groupby('shop_id').agg(
    num_of_product=('item_id', 'nunique'),
    min_price=('item_price', 'min'),
    max_price=('item_price', 'max'),
    mean_price=('item_price', 'mean'),
)

for stat_name in shop_stats.columns:
    sales_shops[stat_name] = sales_shops['shop_id'].map(shop_stats[stat_name])

# A shop without any sales rows has sold 0 distinct items; its price statistics stay NaN.
sales_shops['num_of_product'] = sales_shops['num_of_product'].fillna(0)

sales_shops.head()
    

In [None]:
sales_item_category.head()

In [None]:
# Split every category name on '-' into its parts (one list of parts per category)
category_list = [name.split('-') for name in sales_item_category['item_category_name']]

category_list

In [None]:
# Keep the split parts temporarily; the category type is the part before the first '-'
sales_item_category['split'] = category_list
sales_item_category['category_type'] = [parts[0] for parts in sales_item_category['split']]

In [None]:
# Integer code per distinct category type, numbered in order of first appearance
type_codes, _distinct_types = pd.factorize(sales_item_category['category_type'])
sales_item_category['category_type_code'] = type_codes

In [None]:
# Sub-category is the second part of the split name; names without a '-' reuse the first part
sales_item_category['sub_category_type'] = [
    parts[1] if len(parts) > 1 else parts[0]
    for parts in sales_item_category['split']
]

sub_type_codes, _distinct_sub_types = pd.factorize(sales_item_category['sub_category_type'])
sales_item_category['sub_category_type_code'] = sub_type_codes

In [None]:
# The helper column with the split parts is no longer needed
del sales_item_category['split']

sales_item_category.head()

### 4. Creating New Dataframe by merging

In [None]:
# Only rows with at least one unit sold take part in the monthly aggregation
has_sales = sales_dataset['item_cnt_day'] > 0
sales_dataset = sales_dataset.loc[has_sales]
sales_dataset.head()

In [None]:
# Collapse daily rows to one row per (month block, shop, item):
# mean price, total units sold, and the calendar month
kept_columns = ["month", "date_block_num", "shop_id", "item_id", "item_price", "item_cnt_day"]
group_keys = ['date_block_num', "shop_id", "item_id"]
how_to_aggregate = {"item_price": "mean", "item_cnt_day": "sum", "month": "min"}

grouped_sales = sales_dataset[kept_columns].groupby(group_keys)
sales_dataset = grouped_sales.agg(how_to_aggregate).reset_index()


In [None]:
sales_dataset.head(2)

In [None]:
# After the monthly sum the count column holds a monthly total, so name it accordingly
sales_dataset = sales_dataset.rename(columns={"item_cnt_day": "item_cnt_month"})

In [None]:
sales_dataset.head(2)

In [None]:
# Attach item, shop and category attributes to every monthly sales row (inner joins)

sales_dataset = (
    sales_dataset
    .merge(sales_item, on='item_id', how='inner')
    .merge(sales_shops, on='shop_id', how='inner')
    .merge(sales_item_category, on='item_category_id', how='inner')
)



In [None]:
sales_dataset.head(2).T

In [None]:
# Drop the free-text columns; their integer-coded counterparts stay in the table
text_columns = ['item_name', 'shop_name', 'shop_city', 'item_category_name', 'category_type', 'sub_category_type']
sales_dataset = sales_dataset.drop(columns=text_columns)

In [None]:
sales_dataset.head().T

In [None]:
# test dataset

sales_test.head(2)

In [None]:
sales_test.shape

In [None]:
sales_dataset.shape

In [None]:
# Keep only sales of shops and items that also occur in the test set
shop_in_test = sales_dataset['shop_id'].isin(sales_test['shop_id'].unique())
item_in_test = sales_dataset['item_id'].isin(sales_test['item_id'].unique())
sales_dataset = sales_dataset[shop_in_test & item_in_test]

In [None]:
sales_dataset.shape

In [None]:
# Wide matrix: one row per (item, shop), one column per month block holding the monthly
# count (0 where the pair has no sales in that month)
monthly_matrix = (
    sales_dataset.copy()
    .pivot_table(values='item_cnt_month', index=['item_id', 'shop_id'],
                 columns='date_block_num', fill_value=0)
    .reset_index()
)

# Align to the test rows; pairs that never sold get 0 for every month
sales_train_new = pd.merge(sales_test, monthly_matrix, on=['item_id', 'shop_id'], how='left')
sales_train_new = sales_train_new.fillna(0)

In [None]:
sales_train_new.head().T

In [None]:
sales_train_new.shape

In [None]:
sales_train_new.columns

In [None]:
sales_train_new.isnull().sum()

### 5. Train Test Split

In [None]:
# Supervised framing: a row's features are its monthly sales over a window of consecutive
# months, and its target is the sales of the month right after that window.
#
# The target month must never be inside the feature window. Previously month 32 was both a
# feature and y_train (and month 33 both a feature and y_test), so every model could simply
# copy the answer and the reported scores were meaningless. X_valid was also identical to
# X_train, so it validated nothing.
ID_COLS = ['ID', 'shop_id', 'item_id']
N_LAGS = 32  # months of history per window; small enough to leave month 33 as a hold-out


def lag_window(frame, target_month, n_lags=N_LAGS):
    """Return the ID columns plus the ``n_lags`` month columns preceding ``target_month``.

    Month columns are renamed ``lag_<k>`` (k = months before the target), so every window
    exposes the same all-string feature names whichever calendar months it covers; a model
    fitted on one window can then be applied to another. A month column that does not
    exist in ``frame`` is created and filled with 0.
    """
    months = list(range(target_month - n_lags, target_month))
    window = frame.reindex(columns=ID_COLS + months, fill_value=0)
    window.columns = ID_COLS + [f'lag_{target_month - month}' for month in months]
    return window


# Train: months 0..31 -> month 32
X_train = lag_window(sales_train_new, 32)
y_train = sales_train_new[32]

# Hold-out: months 1..32 -> month 33
X_valid = lag_window(sales_train_new, 33)
y_valid = sales_train_new[33]

# The metric cells score predictions on X_test against y_test, so this pair has to be a
# window whose target is known: the hold-out month.
X_test = X_valid.copy()
y_test = y_valid.copy()

# Feature window for the month after the data ends (months 2..33 -> month 34); no ground truth.
# NOTE(review): the submission cells predict on X_test; point them at X_future to forecast
# the month that follows the training data.
X_future = lag_window(sales_train_new, 34)

In [None]:
# Shapes of the feature matrices followed by the shapes of the targets
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

**Linear Regression**

In [None]:
# Ordinary least squares baseline
lm = LinearRegression()

# Fit on the training window; the fitted estimator is the cell output
lm.fit(X=X_train, y=y_train)


In [None]:
# Predicting the X train and X test

y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)

# Collected in the row order of the comparison table further down:
# R2 (train, test), RSS (train, test), MSE (train, test)
sales_data_lr_metric = []

# calculating r2 square in Train and Test for Linear Regression
print("R2 Square - Linear Regression")
r2_train_lr = r2_score(y_train, y_pred_train)
print("Train : ",r2_train_lr)
sales_data_lr_metric.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print("Test : ",r2_test_lr)
sales_data_lr_metric.append(r2_test_lr)

print('_____________________________________')

#Calculating Residual Sum Of Square (RSS) of Train and Test for Linear Regression

print("Residual Sum Of Square (RSS) - Linear Regression")
rss_train_lr = np.sum(np.square(y_train - y_pred_train))
print("Train : ",rss_train_lr)
sales_data_lr_metric.append(rss_train_lr)

rss_test_lr = np.sum(np.square(y_test - y_pred_test))
print("Test : ",rss_test_lr)
sales_data_lr_metric.append(rss_test_lr)

print('_____________________________________')

# Calculating Mean Squared Error(MSE) of Train and Test for Linear Regression.
# The stored value is the MSE itself, i.e. the number that is printed and the quantity the
# Ridge and Lasso cells store; storing its square root here put an RMSE next to two MSEs
# in the comparison table's "MSE" rows.

print("Mean Squared Error (MSE) - Linear Regression")
mse_train_lr = mean_squared_error(y_train, y_pred_train)
print("Train : ",mse_train_lr)
sales_data_lr_metric.append(mse_train_lr)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print("Test : ",mse_test_lr)
sales_data_lr_metric.append(mse_test_lr)

In [None]:
print(lm.coef_)

In [None]:
# Recursive feature elimination around the linear model, keeping up to 50 features.
# n_features_to_select is passed by keyword: current scikit-learn does not accept it positionally.
rfe = RFE(lm, n_features_to_select=50)

rfe = rfe.fit(X_train, y_train)

In [None]:
# Columns that RFE kept

col = X_train.columns[rfe.support_]

# One row per feature with its RFE verdict and rank, then only the selected features

rfe_overview = pd.DataFrame({
    'Variable': X_train.columns,
    'rfe_support': rfe.support_,
    'rfe_ranking': rfe.ranking_,
})
sales_data_temp = rfe_overview.loc[rfe_overview['rfe_support']]

sales_data_temp

In [None]:
# checking the columns for X train
X_train.columns

In [None]:
# Candidate regularisation strengths for Ridge

params = {'alpha': [0.0001, 0.001, 0.01,
                    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                    1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
                    20, 50, 100, 500, 1000]}

# Estimator whose alpha is being tuned

ridge = Ridge()

# 5-fold cross-validated grid search, scored by negative mean absolute error

folds = 5
ridge_search_settings = dict(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
ridge_model_cv = GridSearchCV(**ridge_search_settings)

# Run the search on the training window
ridge_model_cv.fit(X_train, y_train)


In [None]:
print(ridge_model_cv.best_estimator_)

In [None]:
# Cross-validation scores for alphas up to 500, best-ranked first
score_columns = ['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']

all_ridge_results = pd.DataFrame(ridge_model_cv.cv_results_)
ridge_cv_results = all_ridge_results[all_ridge_results['param_alpha'] <= 500]
ridge_cv_results[score_columns].sort_values(by='rank_test_score')

In [None]:
# plotting mean train and test scores against alpha

# alpha is kept as a float (as in the Lasso plot): an integer cast would turn every alpha
# below 1 into 0 and stack all those points on top of each other at x = 0.
ridge_cv_results['param_alpha'] = ridge_cv_results['param_alpha'].astype('float64')

# plotting

plt.plot(ridge_cv_results['param_alpha'], ridge_cv_results['mean_train_score'])
plt.plot(ridge_cv_results['param_alpha'], ridge_cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()

In [None]:
ridge_model_cv.best_params_

In [None]:
# Refit Ridge with the alpha the grid search selected, so this cell stays in step with the
# search whenever the data or the candidate grid changes (it was a hard-coded 1000).
alpha = ridge_model_cv.best_params_['alpha']
ridge = Ridge(alpha=alpha)

ridge.fit(X_train, y_train)
ridge.coef_

In [None]:
print(ridge.coef_)

In [None]:
# Feature / coefficient table for the fitted Ridge model (coefficients rounded to 4 decimals)
ridge_coefficients = ridge.coef_.round(4)
sales_data_ridge = pd.DataFrame(
    {'Features': X_train.columns, 'Coefficient': ridge_coefficients}
).reset_index(drop=True)
sales_data_ridge

In [None]:
# feature name -> Ridge coefficient (rounded to 5 decimals), used for lookups when plotting
ridge_coeff_dict = dict(zip(X_train.columns, ridge.coef_.round(5)))
ridge_coeff_dict

In [None]:
# Training matrix restricted to the features listed in the Ridge coefficient table
X_train_ridge = X_train[sales_data_ridge.Features]

lm = LinearRegression()
lm.fit(X_train_ridge, y_train)

# running RFE to keep 15 features
# (n_features_to_select is passed by keyword: current scikit-learn does not accept it positionally)
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train_ridge, y_train)

In [None]:
# Helper that returns the Ridge coefficient of a feature

def ridge_find_coefficient(x):
    """Look up feature ``x`` in ``ridge_coeff_dict`` and return its coefficient."""
    return ridge_coeff_dict[x]

# Features kept by RFE with their Ridge coefficients, largest first (at most 15 rows),
# for display in the bar plot

ridge_rfe_table = pd.DataFrame({
    'Features': X_train_ridge.columns,
    'rfe_support': rfe.support_,
    'rfe_ranking': rfe.ranking_,
})
ridge_selected = ridge_rfe_table[ridge_rfe_table['rfe_support']].reset_index(drop=True)

ridge_selected['Coefficient'] = ridge_selected['Features'].apply(ridge_find_coefficient)
sales_data_ridge = ridge_selected.sort_values(by='Coefficient', ascending=False).head(15)
sales_data_ridge

In [None]:
# Doing a RFE to minimise the features to 15
# NOTE(review): sales_data_ridge was already cut to at most 15 rows in the cell before this
# one, so this looks like a repeat that changes nothing - confirm whether it is still needed.
X_train_ridge = X_train[sales_data_ridge.Features]

lm = LinearRegression()
lm.fit(X_train_ridge, y_train)

# running RFE
# (n_features_to_select is passed by keyword: current scikit-learn does not accept it positionally)
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train_ridge, y_train)

In [None]:
# Helper that returns the Ridge coefficient of a feature
# NOTE(review): this cell defines ridge_find_coefficient a second time with the same body.

def ridge_find_coefficient(x):
    """Return the entry of ``ridge_coeff_dict`` stored under feature ``x``."""
    return ridge_coeff_dict[x]

# Rebuild the table of RFE-selected features and their Ridge coefficients for the bar plot

rfe_rows = list(zip(X_train_ridge.columns, rfe.support_, rfe.ranking_))
rfe_frame = pd.DataFrame(rfe_rows, columns=['Features', 'rfe_support', 'rfe_ranking'])
kept_features = rfe_frame[rfe_frame['rfe_support']].reset_index(drop=True)

kept_features['Coefficient'] = kept_features['Features'].apply(ridge_find_coefficient)
sales_data_ridge = kept_features.sort_values(by='Coefficient', ascending=False).head(15)
sales_data_ridge

In [None]:
# Horizontal bars: Ridge coefficient of each selected feature
plt.figure(figsize=(30, 30))
plt.subplot(4, 3, 1)

sns.barplot(data=sales_data_ridge, x='Coefficient', y='Features', palette='viridis')

plt.show()

In [None]:
# Ridge predictions for the training and the test features

y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# R2, residual sum of squares and mean squared error, each for train and test
r2_train_ridge = r2_score(y_train, y_pred_train)
r2_test_ridge = r2_score(y_test, y_pred_test)

rss_train_ridge = np.sum(np.square(y_train - y_pred_train))
rss_test_ridge = np.sum(np.square(y_test - y_pred_test))

mse_train_ridge = mean_squared_error(y_train, y_pred_train)
mse_test_ridge = mean_squared_error(y_test, y_pred_test)

# Same order as the rows of the comparison table
sales_data_ridge_metric = [
    r2_train_ridge, r2_test_ridge,
    rss_train_ridge, rss_test_ridge,
    mse_train_ridge, mse_test_ridge,
]

# Report: heading, train value, test value; sections separated by a rule
ridge_report = (
    ("R2 Square - Ridge Regression", r2_train_ridge, r2_test_ridge),
    ("Residual Sum Of Square (RSS) - Ridge Regression", rss_train_ridge, rss_test_ridge),
    ("Mean Squared Error (MSE) - Ridge Regression", mse_train_ridge, mse_test_ridge),
)
for section, (heading, train_value, test_value) in enumerate(ridge_report):
    if section > 0:
        print('_____________________________________')
    print(heading)
    print("Train : ",train_value)
    print("Test : ",test_value)

In [None]:
#Ridge visualization: actual vs predicted on the test rows

colz = np.arange(y_test.shape[0])

fig = plt.figure(figsize=(10,8))

plt.plot(colz,y_test,color='blue',linewidth=3.0,linestyle='-',label='Actual')

plt.plot(colz,y_pred_test,color='red',linewidth=3.0,linestyle='-',label='Predicted')

fig.suptitle('Actual and Predicted', fontsize=20)

plt.xlabel('Index', fontsize=16)
# The plotted quantity is the monthly item count, not a price
plt.ylabel('Items sold per month', fontsize=16)
plt.legend()
plt.show()

In [None]:
# Candidate regularisation strengths for Lasso
params = {'alpha': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005,
                    0.001, 0.002, 0.003, 0.004, 0.005,
                    0.01]}

# Estimator whose alpha is being tuned

lasso = Lasso()

# 5-fold cross-validated grid search, scored by negative mean absolute error

folds = 5

lasso_search_settings = dict(
    estimator=lasso,
    param_grid=params,
    cv=folds,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    verbose=1,
)
lasso_model_cv = GridSearchCV(**lasso_search_settings)

# Run the search on the training window
lasso_model_cv.fit(X_train,y_train)

In [None]:
# Cross-validation scores of every Lasso alpha, best-ranked first

lasso_score_columns = ['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']
lasso_cv_results = pd.DataFrame(lasso_model_cv.cv_results_)
lasso_cv_results[lasso_score_columns].sort_values(by='rank_test_score')

In [None]:
# Mean train and test scores as a function of alpha

lasso_cv_results['param_alpha'] = lasso_cv_results['param_alpha'].astype('float64')

# one line per score, drawn in the order of the legend entries

for score_column in ('mean_train_score', 'mean_test_score'):
    plt.plot(lasso_cv_results['param_alpha'], lasso_cv_results[score_column])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')

plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()

In [None]:
# checking the lambda
lasso_model_cv.best_estimator_

In [None]:
# Refit Lasso with the alpha the grid search selected, so this cell stays in step with the
# search whenever the data or the candidate grid changes (it was a hard-coded 0.0001).
alpha = lasso_model_cv.best_params_['alpha']

lasso = Lasso(alpha=alpha)

lasso.fit(X_train,y_train)

lasso.coef_

In [None]:
# Features whose Lasso coefficient (rounded to 4 decimals) is non-zero, with that coefficient

lasso_coefficients = pd.DataFrame({'Features': X_train.columns, 'Coefficient': lasso.coef_.round(4)})
is_nonzero = lasso_coefficients['Coefficient'] != 0.00
sales_data_lasso = lasso_coefficients[is_nonzero].reset_index(drop=True)
sales_data_lasso

In [None]:
# feature name -> Lasso coefficient, used for lookups when plotting

lasso_coeff_dict = dict(zip(X_train.columns, lasso.coef_))
lasso_coeff_dict

In [None]:
# Doing the RFE to minimise the features to 15 for Lasso
X_train_lasso = X_train[sales_data_lasso.Features]

lm = LinearRegression()
lm.fit(X_train_lasso, y_train)

# running RFE for Lasso
# (n_features_to_select is passed by keyword: current scikit-learn does not accept it positionally)
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train_lasso, y_train)

In [None]:
def lasso_find_coefficient(x):
    """Look up feature ``x`` in ``lasso_coeff_dict`` and return its coefficient."""
    return lasso_coeff_dict[x]

# Features kept by RFE with their Lasso coefficients, largest first (at most 15 rows),
# for display in the bar plot

lasso_rfe_table = pd.DataFrame({
    'Features': X_train_lasso.columns,
    'rfe_support': rfe.support_,
    'rfe_ranking': rfe.ranking_,
})
lasso_selected = lasso_rfe_table[lasso_rfe_table['rfe_support']].reset_index(drop=True)

lasso_selected['Coefficient'] = lasso_selected['Features'].apply(lasso_find_coefficient)
sales_data_lasso = lasso_selected.sort_values(by='Coefficient', ascending=False).head(15)
sales_data_lasso

In [None]:
# Bar plot of the Lasso coefficient of each selected feature

plt.figure(figsize=(30, 30))
plt.subplot(4, 3, 1)

sns.barplot(data=sales_data_lasso, x='Coefficient', y='Features', palette='rocket')

plt.show()

In [None]:
# Lasso predictions for the training and the test features

y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

# R2, residual sum of squares and mean squared error, each for train and test
r2_train_lasso = r2_score(y_train, y_pred_train)
r2_test_lasso = r2_score(y_test, y_pred_test)

rss_train_lasso = np.sum(np.square(y_train - y_pred_train))
rss_test_lasso = np.sum(np.square(y_test - y_pred_test))

mse_train_lasso = mean_squared_error(y_train, y_pred_train)
mse_test_lasso = mean_squared_error(y_test, y_pred_test)

# Same order as the rows of the comparison table
sales_data_lasso_metric = [
    r2_train_lasso, r2_test_lasso,
    rss_train_lasso, rss_test_lasso,
    mse_train_lasso, mse_test_lasso,
]

# Report: heading, train value, test value; sections separated by a rule
lasso_report = (
    ("R2 Square - Lasso Regression", r2_train_lasso, r2_test_lasso),
    ("Residual Sum Of Square (RSS) - Lasso Regression", rss_train_lasso, rss_test_lasso),
    ("Mean Squared Error (MSE) - Lasso Regression", mse_train_lasso, mse_test_lasso),
)
for section, (heading, train_value, test_value) in enumerate(lasso_report):
    if section > 0:
        print('_____________________________________')
    print(heading)
    print("Train : ",train_value)
    print("Test : ",test_value)

In [None]:
#Lasso visualization: actual vs predicted on the test rows

colz = np.arange(y_test.shape[0])

fig = plt.figure(figsize=(10,8))

plt.plot(colz,y_test,color='blue',linewidth=3.0,linestyle='-',label='Actual')

plt.plot(colz,y_pred_test,color='red',linewidth=3.0,linestyle='-',label='Predicted')

fig.suptitle('Actual and Predicted', fontsize=20)

plt.xlabel('Index', fontsize=16)
# The plotted quantity is the monthly item count, not a price
plt.ylabel('Items sold per month', fontsize=16)
plt.legend()
plt.show()

In [None]:
# Side-by-side comparison of R2, RSS and MSE for Linear, Ridge and Lasso Regression

metric_names = ['R2 Score (Train)', 'R2 Score (Test)',
                'RSS (Train)', 'RSS (Test)',
                'MSE (Train)', 'MSE (Test)']

lr_table = {'Metric': metric_names, 'Linear Regression': sales_data_lr_metric}
lr_metric = pd.DataFrame(lr_table, columns=['Metric', 'Linear Regression'])

# The other two models become one named column each
rg_metric = pd.Series(sales_data_ridge_metric, name='Ridge Regression')
ls_metric = pd.Series(sales_data_lasso_metric, name='Lasso Regression')

metric_columns = [lr_metric, rg_metric, ls_metric]
final_metric = pd.concat(metric_columns, axis=1)

final_metric

In [None]:
# LightGBM regressor configuration, collected in one dict and unpacked into the constructor
lgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.03,
    'num_leaves': 32,
    'colsample_bytree': 0.9497036,
    'subsample': 0.8715623,
    'max_depth': 8,
    'reg_alpha': 0.04,
    'reg_lambda': 0.073,
    'min_split_gain': 0.0222415,
    'min_child_weight': 40,
}
model_lgb = LGBMRegressor(**lgb_params)

In [None]:
model_lgb.fit(X_train, y_train)

In [None]:
ridge.predict(X_test).clip(0,20)

In [None]:
lasso.predict(X_test).clip(0,20)

In [None]:
model_lgb.predict(X_test).clip(0,20)

In [None]:
# LightGBM submission frame. The predictions are clipped to [0, 20] exactly as in every
# other prediction cell of this notebook; this cell was the only one that left them unclipped.
# NOTE(review): X_test is also the matrix scored against y_test in the metric cells; confirm
# it holds the feature window of the month that is meant to be submitted.
lgb_test_pred = model_lgb.predict(X_test).clip(0, 20)
submission = pd.DataFrame({'ID': X_test.index, 'item_cnt_month': lgb_test_pred})

In [None]:
submission.shape

In [None]:
from xgboost import XGBRegressor

# Booster configuration, collected in one dict and unpacked into the constructor
xgb_params = dict(
    max_depth=10,
    n_estimators=30,
    min_child_weight=0.5,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.1,
#     tree_method='gpu_hist',
    seed=42,
)
model = XGBRegressor(**xgb_params)

# RMSE is tracked on the training pair and on the validation pair; training stops early
# after 20 rounds without improvement.
# NOTE(review): newer XGBoost releases appear to expect eval_metric / early_stopping_rounds
# on the constructor rather than on fit() - confirm against the installed version.
watchlist = [(X_train, y_train), (X_valid, y_valid)]
model.fit(
    X_train,
    y_train,
    eval_metric="rmse",
    eval_set=watchlist,
    verbose=True,
    early_stopping_rounds=20)


In [None]:
# XGBoost predictions, clipped to [0, 20].
y_pred = model.predict(X_train).clip(0, 20)

# Stored under its own name: assigning the predictions to y_test would overwrite the
# ground-truth targets, so any metric cell re-run afterwards would compare predictions
# with predictions.
xgb_test_pred = model.predict(X_test).clip(0, 20)

# NOTE(review): X_test is also the matrix scored against y_test in the metric cells; confirm
# it holds the feature window of the month that is meant to be submitted.
submission = pd.DataFrame({
    "ID": X_test.index,
    "item_cnt_month": xgb_test_pred
})
submission.to_csv('submission.csv', index=False)

In [None]:
from xgboost import plot_importance

def plot_features(booster, figsize):
    """Plot the feature importances of ``booster`` on a fresh figure.

    A single-axes figure of size ``figsize`` is created and handed to XGBoost's
    ``plot_importance``; whatever that call returns is returned unchanged.
    """
    _figure, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_plot = plot_importance(booster=booster, ax=axis)
    return importance_plot

plot_features(model, (10,14))


In [None]:
submission.to_csv('submission.csv', index= False)