<h1>Loading Required Packages</h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#For ML training
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


#For getting the file paths
import os
# Walk the Kaggle input directory and print the full path of every file found
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

<h1>Reading the Input Dataset </h1>

In [None]:
# Load the training sales records and preview the first five rows
train_csv = '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
df_train = pd.read_csv(train_csv)
df_train.head()

In [None]:
# Load the rows to predict and preview the first five
test_csv = '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
df_test = pd.read_csv(test_csv)
df_test.head()

In [None]:
# Load the item reference table and preview the first five rows
items_csv = '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
df_items = pd.read_csv(items_csv)
df_items.head()

<h1>Descriptive Analysis</h1>

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory usage
df_train.info()

In [None]:
#correlation map
# Restrict to numeric columns before correlating: pandas >= 2.0 raises on
# non-numeric columns instead of dropping them silently. select_dtypes works
# on every pandas version and matches the later correlation cell.
f, ax = plt.subplots(figsize=(18, 18))
numeric_corr = df_train.select_dtypes('number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

<h1>Data Pre-processing & Feature Engineering</h1>

In [None]:
# Show the training column names for reference in the steps below
df_train.columns

In [None]:
# Show the test column names to compare against the training columns
df_test.columns

In [None]:
# Adding date_block_num as 34 for the test dataset(November 2015) & rearranging columns.
# Only the three key columns are kept; any other column of df_test is dropped here.
# .copy() makes the result an independent frame, so the columns added to df_test
# in later cells do not trigger SettingWithCopyWarning.
df_test = df_test.assign(date_block_num=34)[['date_block_num', 'shop_id', 'item_id']].copy()
df_test.head(5)

In [None]:
# Give every test row a price: the last price seen for that item in the train data.
# NOTE(review): 'last' means the final row per item in df_train's current row order;
# that is the most recent price only if the rows are in chronological order - confirm.
# Items absent from df_train get NaN here.
item_price = df_train.groupby('item_id')['item_price'].last().to_dict()
df_test['item_price'] = df_test['item_id'].map(item_price)
df_test.head()

In [None]:
# Removing irrelevant rows from the training dataset: keep only sales whose
# item AND shop both appear somewhere in the test set
in_test_items = df_train['item_id'].isin(df_test['item_id'])
in_test_shops = df_train['shop_id'].isin(df_test['shop_id'])
df_train = df_train[in_test_items & in_test_shops]

In [None]:
# Side note: a quick refresher on Python dictionaries (not used by the pipeline below)
dictionary = dict(spain='madrid', usa='vegas')
print(dictionary.keys())
print(dictionary.values())

# Updating an existing key overwrites its value
dictionary.update(spain='barcelona')
print(dictionary)

# Assigning to a new key inserts it
dictionary.update(france='paris')
print(dictionary)

# Removing the entry stored under 'spain'
del dictionary['spain']
print(dictionary)

# Membership test looks at keys
print('france' in dictionary)

In [None]:
# Collapse the train data to one row per (date_block_num, shop_id, item_id):
# item_cnt_day becomes the summed count and item_price the last price in the group.
# Note that the summed column keeps the name 'item_cnt_day'.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
df_train = (
    df_train
    .groupby(monthly_keys, as_index=False)
    .agg(item_price=('item_price', 'last'),
         item_cnt_day=('item_cnt_day', 'sum'))
)
df_train.head()

In [None]:
#Adding unique feature for the test and train dataset
df_train['shop*item'] = df_train.shop_id *df_train.item_id
df_train.head(5)

In [None]:
df_test['shop*item'] = df_test.shop_id *df_test.item_id
df_test.head(5)

In [None]:
#Mapping categories from item dataset to item_id
# Reassign rather than drop in place, and ignore an already-missing column,
# so this cell can be re-run without raising KeyError.
df_items = df_items.drop(columns='item_name', errors='ignore')
# The remaining columns are read positionally as (key, value) pairs, which
# requires exactly two columns to be left - presumably item id and category id;
# verify against items.csv.
item_cat = dict(df_items.values)
df_train['item_cat'] = df_train.item_id.map(item_cat)
df_train.head(5)

In [None]:
# Attach the category to each test row using the same item -> category lookup
# that was applied to the train frame
test_categories = df_test['item_id'].map(item_cat)
df_test['item_cat'] = test_categories
df_test.head()

In [None]:
# Stack train and test so both receive identical transformations.
# Test rows have no item_cnt_day column, so they carry NaN there after the concat;
# the later split relies on that to tell the two apart.
df = pd.concat([df_train, df_test])

# Compress the price scale with log(1 + x), then fill missing prices with the
# mean of the log prices
log_price = np.log1p(df['item_price'])
df['item_price'] = log_price.fillna(log_price.mean())

# Cap the target at 10 (upper bound only; NaN targets of the test rows stay NaN)
df['item_cnt_day'] = df['item_cnt_day'].clip(upper=10)
df.head()

In [None]:
#Encoding columns

def encode_the_numbers (column, data=None):
    """Add rank-encoded target statistics for ``column`` to a frame, in place.

    For each statistic (mean, sum, count) of ``item_cnt_day`` grouped by
    ``column``, the distinct values of ``column`` are ranked from the highest
    statistic (rank 0) downwards, and that rank is written to a new column
    named ``{column}_mean``, ``{column}_sum`` or ``{column}_count``.

    Previously the ``_count`` column was always 1, because it counted the
    single helper row per key instead of reading that key's rank.

    Parameters
    ----------
    column : str
        Name of the column whose values are encoded.
    data : pandas.DataFrame, optional
        Frame to read from and add the columns to. Must contain ``column`` and
        ``item_cnt_day``. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if data is None:
        data = df  # notebook-level combined train/test frame

    # Compute all three statistics in one pass, before any column is added
    stats = data.groupby(column)['item_cnt_day'].agg(['mean', 'sum', 'count'])

    for stat in ('mean', 'sum', 'count'):
        ordered = stats[stat].sort_values(ascending=False)
        # Position in the descending order is the encoded value (0 = largest)
        rank_of = dict(zip(ordered.index, range(len(ordered))))
        data[f'{column}_{stat}'] = data[column].map(rank_of)



In [None]:
# Identifier-like columns that each receive the three rank-encoded statistics
columns_to_encode = [
    'shop_id',
    'item_id',
    'shop*item',
    'item_cat',
]
for column in columns_to_encode:
    encode_the_numbers(column)

In [None]:
# Correlation of every numeric feature with the target, sorted ascending so the
# bar chart reads from most negative to most positive.
corr_df = (
    df.select_dtypes('number')
      .drop('item_cnt_day', axis=1)
      .corrwith(df.item_cnt_day)
      .sort_values()
      .reset_index()
      .rename(columns={'index': 'feature', 0: 'correlation'})
)

fig, ax = plt.subplots(figsize=(5, 20))
ax.barh(y=corr_df.feature, width=corr_df.correlation)
# Title typo fixed ("featuer" -> "feature"); .title() capitalises each word
ax.set_title('correlation between feature and target'.title(),
             fontsize=16, fontfamily='serif', fontweight='bold')
plt.show()

<h1>Forecasting</h1>

In [None]:
# Split the combined frame back apart: rows with a known target are the train rows
has_target = df['item_cnt_day'].notna()
df_train = df[has_target]
df_train.head()

In [None]:
# Rows without a target are the rows to predict. Filter and drop in a single
# chain so df_test is a fresh frame; dropping in place on a filtered slice
# raises SettingWithCopyWarning.
df_test = df[df.item_cnt_day.isnull()].drop(columns='item_cnt_day')
df_test.head(5)

In [None]:
# Target vector and feature matrix as plain NumPy arrays.
# The feature columns keep df_train's column order (minus the target).
y = df_train['item_cnt_day'].to_numpy()
X = df_train.drop(columns='item_cnt_day').to_numpy()

In [None]:
# Min-max scale the features. SC is kept because the submission matrix is
# transformed with the same fitted scaler later on.
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/validation split.
SC = MinMaxScaler()
X = SC.fit_transform(X)

In [None]:
# Hold out 20% of the rows (random split, fixed seed) to evaluate the models
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=10,
)

In [None]:
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

<h2>Linear Regression Implementation</h2>

In [None]:
# Baseline: ordinary least squares on the scaled features
lr = LinearRegression()
lr.fit(x_train, y_train)

train_prediction_lr = lr.predict(x_train)
test_prediction_lr = lr.predict(x_test)

# Score on the held-out split
rmse_lr = np.sqrt(MSE(y_test, test_prediction_lr))
r2_lr = r2_score(y_test, test_prediction_lr)
print("RMSE(Test) : % f" % rmse_lr)
print("R Squared Value(Test) : %f" % r2_lr)

<h2>SVR Implementation</h2>

In [None]:
# from sklearn.svm import SVR
# SVR=SVR(cache_size=7000)
# scaling = MinMaxScaler(feature_range=(-1,1)).fit(x_train) #Re-scaling to speed up SVR modelling
# x_train = scaling.transform(x_train)
# x_test = scaling.transform(x_test)
# SVR.fit(x_train,y_train)
# test_prediction_svr  = SVR.predict(x_test)

# print("RMSE(Test) : % f" %(np.sqrt(MSE(y_test, test_prediction_svr))))
# print("R Squared Value(Test) : %f" %(r2_score(y_test, test_prediction_svr)))


<h2>Random Forest Implementation</h2>

In [None]:
# Random forest with 25 trees. random_state is fixed so the reported scores are
# reproducible between runs; 10 matches the seed used for the train/validation split.
reg = RandomForestRegressor(n_estimators=25, random_state=10)
reg.fit(x_train, y_train)
test_prediction_rf = reg.predict(x_test)

# Score on the held-out split
print("RMSE(Test) : % f" %(np.sqrt(MSE(y_test, test_prediction_rf))))
print("R Squared Value(Test) : %f" %(r2_score(y_test, test_prediction_rf)))

<h2>XGBoost Implementation</h2>

In [None]:
# Gradient-boosted trees with squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly called
# 'reg:linear' (a deprecated alias); random_state is the scikit-learn-style
# seed argument that replaces the legacy 'seed' keyword.
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=400, random_state=123)

xgb_r.fit(x_train, y_train)  # Fitting the model
test_prediction_xgb = xgb_r.predict(x_test)  # Predicting on the held-out split

print("RMSE(Test) : % f" %(np.sqrt(MSE(y_test, test_prediction_xgb))))
print("R Squared Value(Test) : %f" %(r2_score(y_test, test_prediction_xgb)))

In [None]:
# Build the submission feature matrix and scale it with the scaler that was
# fitted on the training features (transform only, no refit)
X_submission = SC.transform(df_test.to_numpy())

XGBoost was selected because it achieved the lowest RMSE and the highest R² among the tested models (Linear Regression, Random Forest, and XGBoost).

In [None]:
# Predict with the selected model (XGBoost) and fill the submission template.
pred = xgb_r.predict(X_submission)
# Absolute path, consistent with the other dataset reads in this notebook
sample_submission = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')
# NOTE(review): assumes the rows of X_submission are in the same order as the
# template rows - confirm.
assert len(pred) == len(sample_submission), "prediction count does not match the submission template"
# Bracket assignment always writes a column; attribute assignment would set a
# plain Python attribute instead if the column were missing.
sample_submission['item_cnt_month'] = pred
sample_submission.head(5)

In [None]:
# Write the submission file to the working directory, without the index column
sample_submission.to_csv('submission.csv' , index = False)