In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition CSVs live in the same read-only folder; keep the path in one
# place so it only has to be changed once if the dataset is mounted elsewhere.
INPUT_DIR = "/kaggle/input/competitive-data-science-predict-future-sales"

items = pd.read_csv(os.path.join(INPUT_DIR, "items.csv"))
sample_submission = pd.read_csv(os.path.join(INPUT_DIR, "sample_submission.csv"))
item_categories = pd.read_csv(os.path.join(INPUT_DIR, "item_categories.csv"))
sale_train = pd.read_csv(os.path.join(INPUT_DIR, "sales_train.csv"))
shops = pd.read_csv(os.path.join(INPUT_DIR, "shops.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))


In [None]:
# function to look at each data
datas = [items, item_categories, sale_train, shops, test]

def look(data):
    """Print the first rows of ``data``, a separator line, and its shape."""
    separator = "**************************"
    # One print call with a newline separator emits the same three lines.
    print(data.head(), separator, data.shape, sep="\n")
    
# Label every preview with the table's name. Formatting the DataFrame itself
# into the header would print the whole frame before the actual preview.
for name, d in zip(("items", "item_categories", "sale_train", "shops", "test"), datas):
    print("\n {} \n".format(name))
    look(d)
    
    

In [None]:
# Join the raw tables into one frame:
#   sales + shops (on shop_id), items + categories (on item_category_id),
#   then the two results on item_id.

d1 = pd.merge(sale_train, shops, on='shop_id')
d2 = pd.merge(items, item_categories, on='item_category_id')
df = pd.merge(d1, d2, on='item_id')
df.head()


In [None]:
df.shape

In [None]:
df.info()

We have some categorical and numerical variables

In [None]:
df.isnull().sum()

No missing values

#  EDA 

We can remove all the descriptive name features, since each shop, category and item is already identified by its numeric ID:
`shop_name`, `item_category_name`, `item_name`

In [None]:
# Keep only the ID columns; the three name columns are dropped.
name_columns = ['shop_name', 'item_category_name', 'item_name']
df1 = df.drop(columns=name_columns)
df1.head()

In [None]:
# Split the 'date' feature into separate 'day', 'month' and 'year' features

def convert_date(data):
    """Return a copy of ``data`` with ``day``, ``month`` and ``year`` columns added.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column holding strings in ``%d.%m.%Y`` format.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched so that re-running
        the calling cell always starts from the same data.

    Raises
    ------
    KeyError
        If ``data`` has no ``date`` column.
    ValueError
        If a date string does not match ``%d.%m.%Y``.
    """
    # Parse the column once instead of once per derived feature.
    parsed = pd.to_datetime(data['date'], format='%d.%m.%Y')

    result = data.copy()
    # Assign plain arrays so values are placed positionally, whatever the index.
    result['day'] = parsed.dt.day.to_numpy()
    result['month'] = parsed.dt.month.to_numpy()
    result['year'] = parsed.dt.year.to_numpy()
    return result


df2 = convert_date(df1)
df2.head()

In [None]:
# The raw 'date' string is no longer needed once day/month/year exist.
df3 = df2.drop(columns=['date'])
df3.head()

In [None]:
# look if we have some duplicate in the data
df3.duplicated().value_counts()


In [None]:
# Remove duplicate rows (all columns compared, first occurrence kept),
# then confirm that none remain.
df4 = df3.drop_duplicates()
df4.duplicated().value_counts()


In [None]:
df4.shape

## Data visualisation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

# Work on a copy so the de-duplicated frame stays available unchanged.
data = df4.copy()

# Number of sales records in each calendar month (all years pooled).
plt.figure(figsize=(20, 5))
sns.countplot(data=data, x='month')
plt.title("Count of Sales each month")
plt.show()

* We see that January has more sales than March and December.

In [None]:
# Number of sales records in each year.
plt.figure(figsize=(20, 5))
sns.countplot(data=data, x='year', palette='husl')
plt.title("Count of Sales each year")
plt.show()


   * 2013 is the year with the most sales.
   * Sales are decreasing over the years.



In [None]:
# Look at the distribution of sales records per month for each year

years = data['year'].unique().tolist()
for y in years:
    d = data[data['year'] == y]
    print("*** Year {} ****\n".format(y))
    # The bars count the sales records of each month. A per-month groupby that
    # was computed here but never plotted has been removed: it overwrote the
    # merged `df` frame on every iteration.
    plt.figure(figsize=(20, 5))
    sns.countplot(x='month', data=d)
    plt.title("Year {} sale per months".format(y))
    plt.show()
    

* 2013: most sales in December
* 2014: most sales in December
* 2015: most sales in January

In [None]:
# item_cnt_day is the number of products sold; the quantity to predict is a
# monthly amount of this measure.

# Relabel the column "item_cnt_day" as "item_cnt_month".
# NOTE: this changes the column name only; the values themselves are not
# modified by this step.
data = data.rename(columns={'item_cnt_day': 'item_cnt_month'})
data.head()

In [None]:
# Only the month and year are of interest, so the 'day' column can go.
data = data.drop(columns=['day'])
data.head()

In [None]:
# Since the test data set does not have item_category_id, remove it from our data
data = data.drop(columns=['item_category_id'])

# The target column is named 'item_cnt_month' but each row still holds the count
# of a single sales record. Collapse the records to one row per
# (month block, shop, item): the counts are summed into a real monthly total and
# the price becomes the mean price over that month.
data = (
    data
    .groupby(['date_block_num', 'shop_id', 'item_id', 'month', 'year'], as_index=False)
    .agg(item_price=('item_price', 'mean'), item_cnt_month=('item_cnt_month', 'sum'))
    # Restore the column order the frame had before aggregation.
    [['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_month', 'month', 'year']]
)
data.head()

In [None]:
data.info()

* All the remaining columns are numerical variables.
* We can identify the dependent and independent variables and start building the model.

### We want to forecast the total amount of products sold in every shop for the test data set


# Build the model

In [None]:
from sklearn.model_selection  import train_test_split
from sklearn.preprocessing import StandardScaler


# Separate predictors from the target, then hold out 20% of the rows for validation.
features = ['date_block_num', 'shop_id', 'item_id', 'item_price', 'month', 'year']
x, y = data[features], data['item_cnt_month']


X_train, X_val, Y_train, Y_val = train_test_split(x, y, test_size=0.2, random_state=1)

# ### Optional scaling (disabled). If it is re-enabled, fit the scaler on the
# ### training split only and merely transform the validation split.
# std_scale = StandardScaler()
# X_train_scale = std_scale.fit_transform(X_train)
# X_val_scale = std_scale.transform(X_val)

# Show the shapes of the four pieces on one line.
print(*(part.shape for part in (X_train, X_val, Y_train, Y_val)))

In [None]:
# Build regression models
from sklearn.ensemble  import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# for tuning the models 
from sklearn.model_selection import RandomizedSearchCV


# Model evaluation
from scipy import stats
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Random forest regressor


rfr = RandomForestRegressor(n_estimators=50, random_state=2)
rfr.fit(X_train, Y_train)

# Only the last expression of a cell is displayed, so the training R^2 is
# printed explicitly instead of being computed and thrown away.
print("Random forest train R^2:", rfr.score(X_train, Y_train))

y_predicted = rfr.predict(X_val)
# Validation RMSE. Taking the square root of the MSE works on every
# scikit-learn version; the `squared=` keyword is deprecated in newer releases.
np.sqrt(mean_squared_error(Y_val, y_predicted))

In [None]:
###  Gradient boosting

gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train, Y_train)

y_pred_gbr = gbr.predict(X_val)
# Report RMSE (not MSE) so this number is directly comparable with the
# random-forest and XGBoost validation scores.
np.sqrt(mean_squared_error(Y_val, y_pred_gbr))

In [None]:
import xgboost as XGB

# The evaluation metric is configured on the estimator: passing `eval_metric`
# to fit() is deprecated in newer xgboost releases.
xgb = XGB.XGBRegressor(eval_metric='rmse')
xgb.fit(X_train, Y_train, eval_set=[(X_train, Y_train), (X_val, Y_val)], verbose=True)

# Print the validation R^2; as a bare mid-cell expression it was never shown.
print("XGBoost validation R^2:", xgb.score(X_val, Y_val))

pred_xgb = xgb.predict(X_val)
# Validation RMSE, computed in a way that works on every scikit-learn version.
np.sqrt(mean_squared_error(Y_val, pred_xgb))



In [None]:
# Add the model's feature columns to the test data.
# The forecast month is encoded as year 2015, month 11, date block 34.
test["year"] = 2015
test["month"] = 11
test["date_block_num"] = 34

# test.csv has no price column. Instead of one arbitrary constant for every
# row, use each item's median training price and fall back to the overall
# median for items that never appear in the training data.
item_median_price = data.groupby("item_id")["item_price"].median()
test["item_price"] = (
    test["item_id"].map(item_median_price).fillna(data["item_price"].median())
)
test.head(5)

In [None]:
data.head()

In [None]:
# Write the submission file

# The columns must be in exactly the order the model was trained on; a
# different order is either rejected as a feature-name mismatch or silently
# feeds values into the wrong features.
features = ['date_block_num', 'shop_id', 'item_id', 'item_price', 'month', 'year']
X_test = test[features]

# The competition scores against targets clipped to [0, 20], so predictions
# outside that range can only add error.
pred = np.clip(xgb.predict(X_test), 0, 20)

# Save the submission file

submission = pd.DataFrame({"ID": test["ID"], "item_cnt_month": pred})
submission.to_csv("submission.csv", index=False)



In [None]:
# sub = pd.read_csv("submission.csv")
# sub['item_cnt_month'].unique()