In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# PREPROCESSING
from sklearn.preprocessing import StandardScaler, LabelEncoder

import warnings
warnings.filterwarnings("ignore")
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever spelling this installation provides.
_whitegrid_style = ('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
                    else 'seaborn-v0_8-whitegrid')
plt.style.use(_whitegrid_style)

## Data Description
### File descriptions
- **sales_train.csv** - the training set. Daily historical data from January 2013 to October 2015.
- **test.csv** - the test set. You need to forecast the sales for these shops and products for November 2015.
- **sample_submission.csv** - a sample submission file in the correct format.
- **items.csv** - supplemental information about the items/products.
- **item_categories.csv** - supplemental information about the items categories.
- **shops.csv** - supplemental information about the shops.

### Data fields
- **ID** - an Id that represents a (Shop, Item) tuple within the test set
- **shop_id** - unique identifier of a shop
- **item_id** - unique identifier of a product
- **item_category_id** - unique identifier of item category
- **item_cnt_day** - number of products sold. You are predicting a monthly amount of this measure
- **item_price** - current price of an item
- **date** - date in format dd/mm/yyyy
- **date_block_num** - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
- **item_name** - name of item
- **shop_name** - name of shop
- **item_category_name** - name of item category

In [None]:
# Every competition file lives in the same read-only input directory. Building all paths
# from one constant keeps the loads consistent (the original mixed absolute paths with a
# relative '../input/...' one) and leaves a single place to edit when running elsewhere.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
item_categories = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

# Sales

In [None]:
print(f'sales_train.csv : {sales.shape}')
sales.head(3)

In [None]:
sales.info()

In [None]:
sales[['date_block_num','item_price','item_cnt_day']].describe().T

In [None]:
# Row count and number of distinct item ids in the raw sales table.
n_sales_records = len(sales)
n_sales_items = sales['item_id'].nunique(dropna=False)
print('There are {} records in our train dataframe.'.format(n_sales_records))
print('there are {} unique Item in sales dataset'.format(n_sales_items))

In [None]:
# Distinct-value count per column, largest first. Series.to_frame names the column
# directly; passing a set as `columns=` to the DataFrame constructor is rejected by
# current pandas.
sales.nunique().to_frame('count').sort_values('count', ascending=False)

In [None]:
# Missing-value count per column. Series.to_frame names the column directly; passing a
# set as `columns=` to the DataFrame constructor is rejected by current pandas.
sales.isnull().sum().to_frame('count')

There are no missing values in sales dataframe.

## Summary of sales

In [None]:
# inspiration code: https://www.kaggle.com/gaetanlopez/how-to-make-clean-visualizations

# Headline numbers of the sales table.
records_num = str(int(sales.shape[0]/1000))      # row count, in thousands
columns_num = sales.shape[1]
mean_priced = int(sales['item_price'].mean())
unique_items = sales['item_id'].nunique()

# (x position, value, caption) for each figure. The first caption now says "records":
# the value is the number of rows, not the number of items.
key_figures = [
    (0,    records_num + 'k', "Number of records \nin the train dataset"),
    (0.75, columns_num,       "Number of features \nin the dataset"),
    (1.5,  mean_priced,       "Mean price for \neach item"),
    (2.25, unique_items,      "Number of unique \nitems"),
]

fig = plt.figure(figsize=(5,2), facecolor='white')
ax0 = fig.add_subplot(1,1,1)

# Blank canvas: no grid, white background.
ax0.grid(False)
ax0.set_facecolor('white')
ax0.text(1.1, 1, "Key Figures", color='black', fontsize=28, fontweight='bold',
         fontfamily='monospace', ha='center')

for x_pos, value, caption in key_figures:
    # Large blue number with its grey caption underneath.
    ax0.text(x_pos, 0.4, str(value), color='blue', fontsize=25, fontweight='bold',
             fontfamily='monospace', ha='center')
    ax0.text(x_pos, 0.001, caption, color='dimgrey', fontsize=17, fontweight='light',
             fontfamily='monospace', ha='center')

# The axes only serve as a text canvas, so remove ticks, tick labels and the frame.
ax0.set_xticks([])
ax0.set_yticks([])
for direction in ['top','right','left','bottom']:
    ax0.spines[direction].set_visible(False)

# Test

In [None]:
print(f'test.csv : {test.shape}')
test.head(3)

In [None]:
test.info()

In [None]:
print('There are {} records in our test dataframe.'.format(test.shape[0]))

In [None]:
# Missing-value count per column. Series.to_frame names the column directly; passing a
# set as `columns=` to the DataFrame constructor is rejected by current pandas.
test.isnull().sum().to_frame('count')

# Shops

In [None]:
print(f'shops.csv : {shops.shape}')
shops.head(3)

### Extract the city from shop_name column

In [None]:
# The city is taken to be the first space-separated token of the shop name.
shops['city'] = shops['shop_name'].map(lambda shop_name: shop_name.split(' ')[0])
shops['city'].unique()

Some cities appear twice because of typos; for example, we have both `!Якутск` and `Якутск`. Let's fix this in the next block.
We also encode the city names using the [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).


In [None]:
# Fold the mistyped '!Якутск' into the correctly spelled city.
shops['city'] = shops['city'].replace({'!Якутск': 'Якутск'})

# encoding
city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city']).astype(np.int8)
shops.head(3)

# Items

In [None]:
print(f'items.csv : {items.shape}')
items.head(3)

In [None]:
# Create the date the product was first sold as a feature.
# The per-item minimum is indexed by item_id, so look it up through the item_id column.
# Assigning the groupby result directly aligns on the row index of `items` instead, which
# is only correct when every row's index label happens to equal its item_id.
first_sale_by_item = sales.groupby('item_id')['date_block_num'].min()
items['first_sale_date'] = items['item_id'].map(first_sale_by_item)

items.head(3)

In [None]:
items['first_sale_date'].isna().sum()

In [None]:
# Items that never appear in the sales data have no first-sale month. Give them 34, the
# block number right after the last training month (October 2015 is block 33).
never_sold = items['first_sale_date'].isna()
items.loc[never_sold, 'first_sale_date'] = 34

# Item Categories

In [None]:
print(f'item_categories.csv : {item_categories.shape}')
item_categories.head(3)

In [None]:
item_categories['item_category_name'][:10]

It seems that each name consists of a main category and a subcategory separated by a dash.
We can extract the main category and the subcategory from item_category_name and put them in separate columns.

In [None]:
# The main category is the text before the first ' - ' separator (the whole name when
# there is no separator).
main_category_names = [name.split(' - ')[0] for name in item_categories['item_category_name']]
item_categories['item_maincategory_name'] = main_category_names
n_main_categories = item_categories['item_maincategory_name'].nunique(dropna=False)
print('We have {} unique main categories.'.format(n_main_categories))
item_categories['item_maincategory_name'].unique()

It seems we have some duplicate values here. For example, we have both `Игры` and `Игры Android`. Let's merge these duplicates first, and then encode the categories using the LabelEncoder. (Игры means "games" in Russian!)

In [None]:
# Variants that should be folded into a single main category (exact-match replacement).
maincategory_aliases = {
    'Игры Android': 'Игры',
    'Игры MAC': 'Игры',
    'Игры PC': 'Игры',
    'Карты оплаты (Кино, Музыка, Игры)': 'Карты оплаты',
    'Чистые носители (шпиль)': 'Чистые носители',
    'Чистые носители (штучные)': 'Чистые носители',
}
item_categories['item_maincategory_name'] = (
    item_categories['item_maincategory_name'].replace(maincategory_aliases)
)


In [None]:
item_categories['item_maincategory_id'] = LabelEncoder().fit_transform(item_categories['item_maincategory_name']).astype(np.int8)
item_categories.head(3)

In [None]:
item_categories.item_maincategory_name.value_counts()

In [None]:
def make_etc(x, counts=None, min_count=5):
    """Return `x` if its main category is frequent enough, otherwise 'etc'.

    Parameters
    ----------
    x : str
        Main-category name to check.
    counts : mapping or pandas.Series, optional
        Pre-computed ``name -> number of rows`` lookup (anything with ``.get``), e.g.
        ``item_categories['item_maincategory_name'].value_counts()``. Supplying it avoids
        rescanning the whole frame on every call. When omitted, the rows are counted in
        the global ``item_categories`` frame, as before.
    min_count : int, default 5
        Minimum number of rows a category needs in order to keep its own name.

    Returns
    -------
    str
        `x` itself, or the catch-all label 'etc'.
    """
    if counts is None:
        occurrences = int((item_categories['item_maincategory_name'] == x).sum())
    else:
        occurrences = counts.get(x, 0)
    return x if occurrences >= min_count else 'etc'

# Replace with 'etc' if category count is less than 5
# NOTE(review): item_maincategory_id was label-encoded in an earlier cell, before this
# grouping, so the 'etc' bucket only changes the name column - confirm that is intended.
grouped_main_names = item_categories['item_maincategory_name'].map(make_etc)
item_categories['item_maincategory_name'] = grouped_main_names

In [None]:
# The subcategory is everything after the first ' - ' separator; a name without a
# separator is used whole. Splitting on ' - ' (the separator used for the main category)
# instead of a bare '-' stops a hyphenated name that has no separator from being cut at
# its hyphen.
category_parts = item_categories['item_category_name'].str.split(' - ', n=1)
item_categories['item_subcategory_name'] = category_parts.map(lambda parts: parts[-1].strip())
print('We have {} unique sub categories.'.format(item_categories['item_subcategory_name'].nunique()))
item_categories['item_subcategory_name'].unique()

In [None]:
item_categories.item_subcategory_name.value_counts()

In [None]:
item_categories['item_subcategory_id'] = LabelEncoder().fit_transform(item_categories['item_subcategory_name']).astype(np.int8)
item_categories.head(3)

In [None]:
item_categories.item_subcategory_id.value_counts()

# Submission Sample

In [None]:
print(f'sample_submission.csv : {submission.shape}')
submission.head(3)

# Merge and make the train dataframes

In [None]:
# Attach the category columns to every item (inner join on 'item_category_id').
item_info = items.merge(item_categories, how='inner', on='item_category_id')
item_info.head(2)

In [None]:
# Attach the item and category columns to every sales record (inner join on 'item_id').
train_tmp = sales.merge(item_info, how='inner', on='item_id')
train_tmp.head(2)

In [None]:
# Attach the shop columns (name, city, city_code) to every record (inner join of
# train_tmp with shops on 'shop_id').
train = train_tmp.merge(shops, how='inner', on='shop_id')
train.head(3)

In [None]:
train['total_sales'] = train['item_price'] * train['item_cnt_day']
train.head(3)

# Merge and make the test Dataframe

In [None]:
# The submission is later filled positionally, so the enriched test frame must keep
# exactly one row per original test row. Left joins guarantee that (an inner join would
# silently drop rows with an unknown item or shop), `validate` rejects duplicated lookup
# keys that would multiply rows, and the assertions fail loudly if a lookup was missing.
n_test_rows = len(test)
test_tmp = pd.merge(test, item_info, on='item_id', how='left', validate='many_to_one')
test = pd.merge(test_tmp, shops, on='shop_id', how='left', validate='many_to_one')
assert len(test) == n_test_rows, 'merging changed the number of test rows'
assert test[['item_category_id', 'city_code']].notna().all().all(), \
    'some test rows have no matching item or shop'
test.head(3)

# Downcast the Datasets

In [None]:
def downcast_dtypes(df):
    """Shrink the numeric columns of `df` in place to save memory.

    float64 columns become float32. int64/int32 columns become int16 when every value
    fits, otherwise int32 when every value fits, otherwise they are left unchanged. (An
    unconditional int16 cast silently wraps values outside -32768..32767.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for convenient reassignment.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ("int64", "int32")]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        values = df[col]
        # Try the narrowest type first and stop at the first one that holds every value.
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            if values.empty or (values.min() >= limits.min and values.max() <= limits.max):
                df[col] = values.astype(candidate)
                break
    return df


In [None]:
train = downcast_dtypes(train)
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
train.info()

In [None]:
test = downcast_dtypes(test)
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
test.info()

# Handling the outliers

In [None]:
# Marker style shared by the outlier points of both box plots.
flierprops = {'marker': 'o', 'markerfacecolor': 'purple', 'markersize': 6,
              'linestyle': 'none', 'markeredgecolor': 'black'}

# Daily item counts, with the x range fixed to -100..3000.
fig_cnt, ax_cnt = plt.subplots(figsize=(10,4))
ax_cnt.set_xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day, flierprops=flierprops, ax=ax_cnt)

# Item prices, with 10% headroom above the maximum price.
fig_price, ax_price = plt.subplots(figsize=(10,4))
ax_price.set_xlim(train.item_price.min(), train.item_price.max()*1.1)
sns.boxplot(x=train.item_price, flierprops=flierprops, ax=ax_price)

In [None]:
print('There are {} records with item price higher than 50000'.format(len(train[train['item_price']>50000])))
train[train['item_price']>50000]

In [None]:
# Rows with a negative daily count. The message now says "lower than 0" to match the
# filter; it previously claimed "higher than 0".
negative_cnt_rows = train[train['item_cnt_day'] < 0]
print('There are {} records with item count day lower than 0'.format(len(negative_cnt_rows)))
negative_cnt_rows.head(3)

In [None]:
print('There are {} records with item count day higher than 1000'.format(len(train[train['item_cnt_day']>1000])))
train[train['item_cnt_day'] > 1000]

We will remove the obvious outliers in the dataset: records with 1,000 or more items sold in one day and records with an item price of 50,000 or more. The next cell also drops records whose price or daily count is not positive.

In [None]:
# Keep only rows with a strictly positive price below 50,000 ...
price_ok = (train['item_price'] > 0) & (train['item_price'] < 50000)
# ... and a strictly positive daily count below 1,000.
count_ok = (train['item_cnt_day'] > 0) & (train['item_cnt_day'] < 1000)
sales_train = train[price_ok & count_ok]

In [None]:
print('Before removing outliers: ', train.shape)
print('After removing outliers: ', sales_train.shape)

In [None]:
# Leaking to improve performance (disabled): keep only the shops that appear in the test set
# unique_test_shop_id = test['shop_id'].unique()
# sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]

## Visualization & Profiling

In [None]:
import pandas_profiling as pp

# Build the profiling report for the cleaned sales data and show it as the cell output.
sales_profile = pp.ProfileReport(sales_train)
sales_profile

## Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor  

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from scipy import optimize, stats 
from keras.utils import np_utils

In [None]:
sales_train.head(3)

In [None]:
train = sales_train.copy()

In [None]:
# Drop the columns that are not used as model inputs (texts, price, month index).
unused_train_cols = ['date_block_num', 'item_price', 'item_name', 'item_category_name',
                     'item_maincategory_name', 'item_subcategory_name', 'shop_name', 'city']
train = train.drop(columns=unused_train_cols)
train.head()

In [None]:
# Parse the day-first date strings, then keep only the 'YYYY-MM' month label.
parsed_dates = pd.to_datetime(train['date'], dayfirst=True)
train['date'] = parsed_dates.dt.strftime('%Y-%m')
train.head()

In [None]:
# Step 1: sum the remaining value columns per month ('date'), shop, item and the item/shop
# attributes, giving one row per combination that has sales in that month.
df = train.groupby(['date','shop_id','item_id','item_category_id','first_sale_date',
                       'item_maincategory_id','item_subcategory_id','city_code']).sum()
# Step 2: reshape to one row per shop/item (with its attributes) and one column per
# month holding item_cnt_day; combinations with no sales in a month get 0. The grouping
# keys are index levels here and are referenced by name. pivot_table applies its default
# aggregation to the single already-summed value in each cell.
df = df.pivot_table(index=['shop_id','item_id','item_category_id','first_sale_date',
                       'item_maincategory_id','item_subcategory_id','city_code'],
                    columns='date', values='item_cnt_day', fill_value=0)
# Turn the key index levels back into ordinary columns so they can be merged on and used
# as features.
df.reset_index(inplace=True)
df.head().T

In [None]:
test.head()

In [None]:
# Drop the text columns of the test frame; only the numeric ids/codes are used.
unused_test_cols = ['item_name', 'item_category_name', 'item_maincategory_name',
                    'item_subcategory_name', 'shop_name', 'city']
test = test.drop(columns=unused_test_cols)

In [None]:
# Attach the monthly sales history of each shop/item pair to the test rows. Pairs with
# no history get NaN from the left join, which is turned into 0 below.
history_keys = ['shop_id', 'item_id', 'item_category_id', 'first_sale_date',
                'item_maincategory_id', 'item_subcategory_id', 'city_code']
df_test = test.merge(df, how='left', on=history_keys)
# Drop the row id and the oldest month column.
df_test = df_test.drop(columns=['ID', '2013-01']).fillna(0)
df_test.head().T

In [None]:
# split into train and test sets
# Target: sales in the last observed month. Features: the key columns plus every month
# column except the target.
Y_train = df['2015-10'].values
X_train = df.drop(['2015-10'], axis = 1)

# The test matrix has the same layout with the month columns moved forward by one
# ('2013-01' dropped, '2015-10' kept), so its labels differ from X_train even though the
# columns correspond position by position. Recent scikit-learn versions refuse to
# predict on a frame whose feature names differ from those seen in fit, so relabel it.
X_test = df_test.copy()
assert X_test.shape[1] == X_train.shape[1], 'train and test feature counts differ'
X_test.columns = X_train.columns

print(X_train.shape, Y_train.shape)
print(X_test.shape)

In [None]:
x_train, x_test, y_train, y_test  = train_test_split( X_train, Y_train, test_size=0.20, random_state=1)

In [None]:
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

In [None]:
%time
ETR = ExtraTreesRegressor(n_estimators=100, random_state=0)
ETR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, ETR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, ETR.predict(x_test)))
print('Test set score:', ETR.score(x_train,y_train))

In [None]:
%time
ADB= AdaBoostRegressor(random_state=0, n_estimators=100)
ADB.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, ADB.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, ADB.predict(x_test)))
print('Test set score:', ADB.score(x_train,y_train))

In [None]:
%time
BYNR = linear_model.BayesianRidge()
BYNR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, BYNR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, BYNR.predict(x_test)))
print('Test set score:', BYNR.score(x_train,y_train))

In [None]:
%time
LR = LinearRegression()
LR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, LR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, LR.predict(x_test)))
print('Test set score:', LR.score(x_train,y_train))

In [None]:
%time
RFR = RandomForestRegressor(n_estimators = 100)
RFR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, RFR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, RFR.predict(x_test)))
print('Test set score:', RFR.score(x_train,y_train))

In [None]:
%time
XGB = XGBRegressor(max_depth=16,n_estimators=200,seed=1)
XGB.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, XGB.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, XGB.predict(x_test)))
print('Test set score:', XGB.score(x_train,y_train))

In [None]:
%time
LGBM = LGBMRegressor(max_depth=16,n_estimators=200,seed=1)
LGBM.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, LGBM.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, LGBM.predict(x_test)))
print('Test set score:', LGBM.score(x_train,y_train))

In [None]:
# Compare the models on the held-out split. Scoring the training rows (as before) mostly
# rewards memorisation: tree ensembles reach a near-perfect score on data they were
# fitted on.
extraTreesRegressor_score = ETR.score(x_test,y_test)
adaBoostRegressor_score = ADB.score(x_test,y_test)
bayesianRidge_score = BYNR.score(x_test,y_test)
linearRegression_score = LR.score(x_test,y_test)
randomForestRegressor_score = RFR.score(x_test,y_test)
XGBRegressor_score = XGB.score(x_test,y_test)
LGBMRegressor_score = LGBM.score(x_test,y_test)

# One row per model, best score first (label typos " BayesianRidge" and
# "RLGBMRegressor" corrected).
results = pd.DataFrame([["ExtraTreesRegressor",extraTreesRegressor_score],["AdaBoostRegressor",adaBoostRegressor_score],
                        ["BayesianRidge",bayesianRidge_score],["LinearRegression",linearRegression_score],
                        ["RandomForestRegressor",randomForestRegressor_score],["XGBRegressor",XGBRegressor_score],
                        ["LGBMRegressor",LGBMRegressor_score]],
                        columns = ["Models","Score"]).sort_values(by='Score',ascending=False)


results.style.background_gradient(cmap='Blues')

In [None]:
prediction = ETR.predict(X_test)

In [None]:
# Start from a fresh copy of the sample submission already loaded as `submission`
# instead of reading the same CSV from disk again.
df_submission = submission.copy()
print(df_submission.shape)
df_submission.head()

In [None]:
# Predictions are written positionally into the submission template. Use a
# model-specific file name: later cells write 'prediction.csv', which would overwrite
# this model's output.
df_submission['item_cnt_month'] = prediction
df_submission.to_csv('prediction_etr.csv', index=False)
df_submission.head()

In [None]:
prediction = XGB.predict(X_test)

In [None]:
# Round to whole item counts in one vectorised step (round-half-to-even, like the
# built-in round) instead of calling round() once per element.
prediction = np.rint(prediction).astype(int)

In [None]:
# Start from a fresh copy of the sample submission already loaded as `submission`
# instead of reading the same CSV from disk again.
df_submission = submission.copy()
print(df_submission.shape)
df_submission.head()

In [None]:
# Predictions are written positionally into the submission template. Use a
# model-specific file name: a later cell writes 'prediction.csv', which would overwrite
# this model's output.
df_submission['item_cnt_month'] = prediction
df_submission.to_csv('prediction_xgb.csv', index=False)
df_submission.head()

In [None]:
prediction = RFR.predict(X_test)

In [None]:
# Start from a fresh copy of the sample submission already loaded as `submission`
# instead of reading the same CSV from disk again.
df_submission = submission.copy()
print(df_submission.shape)
df_submission.head()

In [None]:
# Put the current predictions into the template (positionally) and save the file.
df_submission = df_submission.assign(item_cnt_month=prediction)
df_submission.to_csv('prediction.csv', index=False)
df_submission.head()