In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import required libraries and get data:

In [None]:
#Library importer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt
import gc
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [None]:
# Single place to change if the competition data is mounted elsewhere.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

# Raw competition tables, loaded unchanged from CSV.
train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
item_cats = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')

**I'm going to use a memory reduction function that downcasts my dataframe columns.**

**First things first! Don't forget to convert the date column to date format.**

In [None]:
# Parse the 'dd.mm.yyyy' strings in one vectorised call instead of invoking
# strptime once per row; unlike the row-wise version, re-running this cell on an
# already-converted column does not fail.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

# Downcasting: 

In [None]:
def _narrowest_dtype(lo, hi, candidates, limits):
    """Return the first dtype in ``candidates`` whose range contains ``[lo, hi]``.

    ``limits`` is ``np.iinfo`` or ``np.finfo``. The last candidate is the
    fallback: it is returned without a range check, which also covers the case
    where ``lo``/``hi`` are NaN (e.g. an empty column).
    """
    for candidate in candidates[:-1]:
        bounds = limits(candidate)
        if lo >= bounds.min and hi <= bounds.max:
            return candidate
    return candidates[-1]


def downcast(df):
    """Shrink the memory footprint of ``df`` by narrowing its column dtypes.

    The frame is modified in place and also returned.

    * Columns whose dtype name contains ``int`` are cast to the narrowest of
      int8/int16/int32/int64 that holds the column's min and max.
    * Columns whose dtype name contains ``float`` are cast to the narrowest of
      float16/float32/float64 whose finite range holds the min and max.
    * Object / string columns become ``category``, except a column named
      ``'date'``, which is parsed with ``pd.to_datetime`` using ``'%Y-%m-%d'``.
    * Every other dtype is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32, np.float64)

    for col, t in df.dtypes.items():
        dtype_name = str(t)
        if 'int' in dtype_name:
            target = _narrowest_dtype(df[col].min(), df[col].max(), int_types, np.iinfo)
            df[col] = df[col].astype(target)
        elif 'float' in dtype_name:
            # Only the value RANGE is checked here, not precision.
            # NOTE(review): float16 carries very few significant digits; confirm
            # that is acceptable for price-like columns.
            target = _narrowest_dtype(df[col].min(), df[col].max(), float_types, np.finfo)
            df[col] = df[col].astype(target)
        # `np.object` no longer exists in current NumPy, so test the dtype via
        # pandas instead; this also recognises pandas' dedicated string dtype.
        elif pd.api.types.is_object_dtype(t) or pd.api.types.is_string_dtype(t):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

In [None]:
# Narrow the dtypes of every loaded table, in the same order as they are listed.
train, test, shops, items, item_cats = map(
    downcast, (train, test, shops, items, item_cats)
)

# Removing Outliers:

**Let's remove outliers first. I referred to this kernel for outlier removal: [https://www.kaggle.com/karell/xgb-baseline-advanced-feature-engineering](https://www.kaggle.com/karell/xgb-baseline-advanced-feature-engineering)**

In [None]:
# Box plot of the daily sold-item counts, to spot extreme values.
sns.boxplot(x=train['item_cnt_day'])

In [None]:
# Box plot of item prices, to spot extreme values.
sns.boxplot(x=train['item_price'])

**Drop every row whose item_price is not strictly positive, and set every item_cnt_day value below 1 (zero or negative) to 0.**

In [None]:
# Keep only rows with a strictly positive price and renumber the index.
positive_price = train['item_price'] > 0
train = train[positive_price].reset_index(drop=True)

# Any daily count below 1 (zero or negative) is replaced by 0.
# (The former `.unique()` line was dropped: its result was neither stored nor
# displayed, so it only cost time.)
train.loc[train['item_cnt_day'] < 1, 'item_cnt_day'] = 0

**Several shops are duplicates of each other (according to their names), so we remap the duplicate shop IDs in both the train and the test set.
Shop 40 is likewise merged into shop 39.**

In [None]:
# Duplicate shop id -> id it is folded into.
duplicate_shops = {
    0: 57,   # Yakutsk, Ordzhonikidze 56
    1: 58,   # Yakutsk, "Tsentralny" shopping centre
    11: 10,  # Zhukovsky, Chkalova St. 39 m²
    40: 39,
}

# Apply every remapping to both the train and the test set.
for duplicate_id, canonical_id in duplicate_shops.items():
    train.loc[train['shop_id'] == duplicate_id, 'shop_id'] = canonical_id
    test.loc[test['shop_id'] == duplicate_id, 'shop_id'] = canonical_id

# Get all possible data:

**We only have two columns in our test set, shop_id and item_id.**

**This is not enough for us to make predictions. So, we need to put more features.**

In [None]:
# Enrich the test rows with item, shop and category attributes.
# Each merge joins on the columns the two frames have in common.
new_test = test.merge(items).merge(shops).merge(item_cats)
new_test

**We will do the same thing for the train dataset.**

**We need to get some common categories between them.**

In [None]:
# Enrich the training rows with item, shop and category attributes.
# Each merge joins on the columns the two frames have in common.
new_train = train.merge(items).merge(shops).merge(item_cats)
new_train

**Now, sales from different shops and different items don't necessarily affect each other. We need to find a relation between them.**

**Finding how different item categories affect the features is a good first step to understanding the data.**

# Data Visualization and Analysis:

In [None]:
# Sum of item_price within each item category, drawn as a line.
aggr = (
    new_train
    .groupby(['item_category_id'])
    .agg({'item_price': 'sum'})
    .reset_index()
)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.plot(aggr.item_category_id, aggr.item_price)

In [None]:
# Mean item_cnt_day within each item category, drawn as a red line.
aggr = (
    new_train
    .groupby(['item_category_id'])
    .agg({'item_cnt_day': 'mean'})
    .reset_index()
)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.plot(aggr.item_category_id, aggr.item_cnt_day, color='red')

**It looks like the item prices and items sold per day vary heavily by category. There are some spikes, and it shows a parabolic trend. If you look closely, you'll see the parabola.**

**We mustn't forget that we need to find a monthly estimate of the number of items sold.**

**Let's find out the relations between item_cnt_day, item_price and date_block_num.**

In [None]:
# Mean item_cnt_day within each date_block_num, drawn as a brown line.
aggr = (
    train
    .groupby(['date_block_num'])
    .agg({'item_cnt_day': 'mean'})
    .reset_index()
)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.plot(aggr.date_block_num, aggr.item_cnt_day, color='brown')

In [None]:
# Mean item_price within each date_block_num, drawn as a green line.
aggr = (
    train
    .groupby(['date_block_num'])
    .agg({'item_price': 'mean'})
    .reset_index()
)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.plot(aggr.date_block_num, aggr.item_price, color='green')

**By looking closely, we can see some linear relationship in the above two graphs.**

**There are many spikes, but the general trend seems to be an increase.**

In [None]:
# Same per-month mean price as in the line plot, shown as individual points.
aggr = (
    train
    .groupby(['date_block_num'])
    .agg({'item_price': 'mean'})
    .reset_index()
)

sns.scatterplot(x='date_block_num', y='item_price', data=aggr)

In [None]:
sns.set(style='ticks')

# Pairwise scatter matrix of the columns left after removing the date and the
# shop/item identifiers.
id_like_columns = ['date', 'shop_id', 'item_id']
aggr = train.drop(columns=id_like_columns)
sns.pairplot(aggr)

**The above graphs are just to see what the data looks like when plotted against each other.**

**For example, the bottom-left graph plots date_block_num (x-axis) against item_cnt_day (y-axis). It shows a slightly linear relationship.**

**Pairplots are basically scatterplot matrices. They save the time and effort of plotting each one against another.**

# Feature Engineering:

In [None]:
#Delete unwanted dataframes

# Release frames that are not used again, then trigger garbage collection.
del new_train, test

gc.collect()

**We're going to do some group-bys to get our monthly sales.**

**First groupby: To get the duration feature!**

In [None]:
# Per (shop, item, month block): mean price, total units sold, first and last
# sale date. The columns are selected with a LIST: selecting GroupBy columns
# with a bare tuple was deprecated and is rejected by current pandas.
aggr = (
    train
    .groupby(['shop_id', 'item_id', 'date_block_num'])[['item_price', 'item_cnt_day', 'date']]
    .agg({'item_price': ['mean'], 'item_cnt_day': ['sum'], 'date': ['min', 'max']})
)

aggr = aggr.reset_index()
aggr

In [None]:
# Days between the first and last sale in the block, counted inclusively (+1).
aggr['duration'] = (
    (aggr[('date', 'max')] - aggr[('date', 'min')]).dt.days.astype(np.int16) + 1
)
# Copy the two-level aggregate columns to single, descriptive names.
aggr['item_cnt_day_sum'] = aggr[('item_cnt_day', 'sum')]
aggr['item_price_mean'] = aggr[('item_price', 'mean')]

# The original aggregate columns are no longer needed.
aggr = aggr.drop(columns=['date', 'item_cnt_day', 'item_price'])

aggr

**Second groupby: To get our actual monthly estimates!**

In [None]:
# Collapse the month blocks: one row per (shop, item) with total units sold,
# the mean of the per-block mean prices, and the total selling duration.
monthly_sales = (
    aggr
    .groupby(['shop_id', 'item_id'])
    .agg({
        'item_cnt_day_sum': ['sum'],
        'item_price_mean': ['mean'],
        'duration': ['sum'],
    })
)

monthly_sales = monthly_sales.reset_index()
monthly_sales

In [None]:
# NOTE(review): the total over all month blocks is divided by a hard-coded 30;
# confirm which quantity this divisor is meant to represent.
monthly_sales['item_cnt_month'] = monthly_sales[('item_cnt_day_sum', 'sum')] / 30
monthly_sales['item_price_month'] = monthly_sales[('item_price_mean', 'mean')]
monthly_sales['sales_duration'] = monthly_sales[('duration', 'sum')]

# Drop the intermediate aggregate columns now that they have final names.
intermediate_columns = ['item_cnt_day_sum', 'item_price_mean', 'duration']
monthly_sales = monthly_sales.drop(columns=intermediate_columns)
monthly_sales

**There we go! We now have our actual monthly estimates. We need to predict item_cnt_month for the test set.**

**But first merge with shop, items, and item_cats to get remaining features!**

In [None]:
#Delete unwanted dataframes
# `aggr` has already been reduced into `monthly_sales`, so release it and
# trigger garbage collection.
del aggr

gc.collect()

**When a dataframe with multi-level column labels is merged with one that has single-level labels, some column names become tuples, like this: (shop_id, )**

**To solve this issue, we create the following function. It checks each column name: if the name is not a string (i.e. it is a tuple), the tuple's parts are joined into a single string.**

In [None]:
def colnamecheck(df):
    """Flatten tuple column labels of ``df`` into plain strings, in place.

    A tuple label such as ``('shop_id', '')`` or ``('price', 'mean')`` is
    replaced by the concatenation of its parts (``'shop_id'``, ``'pricemean'``).
    Parts that are not strings are converted with ``str`` first. Labels that
    are not tuples (strings, integers, ...) are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    None
    """
    df.columns = [
        ''.join(str(part) for part in label) if isinstance(label, tuple) else label
        for label in df.columns
    ]

In [None]:
# `monthly_sales` still has two-level column labels while the lookup tables
# have plain ones. Older pandas only warned about merging such frames (and
# produced a duplicated key column plus tuple labels); newer releases refuse
# with "Not allowed to merge between different levels". Flattening the labels
# BEFORE merging works on both and needs no clean-up of a duplicated key.
sales_flat = monthly_sales.copy()
colnamecheck(sales_flat)

# Attach item, shop and category attributes; `item_id` is kept as a feature.
monthly_sales = (
    sales_flat
    .merge(items, on='item_id')
    .merge(shops, on='shop_id')
    .merge(item_cats, on='item_category_id')
)

del sales_flat
gc.collect()

In [None]:
def EncodeColumn(df, old_col, new_col):
    """Store integer label codes for ``df[old_col]`` in ``df[new_col]``.

    A fresh ``LabelEncoder`` is fitted on the values of ``old_col`` on every
    call. ``df`` is modified in place and nothing is returned.
    """
    df[new_col] = LabelEncoder().fit_transform(df[old_col])

In [None]:
# Fit ONE encoder per text column and apply it to both frames. Fitting a
# separate encoder on each frame would number the labels independently, so the
# same name could receive different codes in the training and the test data.
# The encoder is fitted on the lookup table the column was merged from, which
# contains every name that can occur in either frame.
for text_col, lookup in (
    ('item_name', items),
    ('shop_name', shops),
    ('item_category_name', item_cats),
):
    shared_encoder = LabelEncoder().fit(lookup[text_col])
    monthly_sales[text_col + '_enc'] = shared_encoder.transform(monthly_sales[text_col])
    new_test[text_col + '_enc'] = shared_encoder.transform(new_test[text_col])

In [None]:
# The raw text columns have encoded counterparts, so remove them from both frames.
text_columns = ['item_name', 'shop_name', 'item_category_name']

monthly_sales = monthly_sales.drop(columns=text_columns)
new_test = new_test.drop(columns=text_columns)

# Modelling:

**We will be using XGBoost to predict item_price_month first, then sales_duration, and finally item_cnt_month.**

In [None]:
# Hyper-parameters of the single regressor that is refitted for each target.
xgb_params = {
    'learning_rate': 0.01,
    'max_depth': 3,
    'n_estimators': 1000,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
}
xgb = XGBRegressor(**xgb_params)

In [None]:
# Stage 1: predict item_price_month from the identifier/encoded columns only.
all_targets = ['item_price_month', 'sales_duration', 'item_cnt_month']
X = monthly_sales.drop(columns=all_targets)
y = monthly_sales['item_price_month']

xgb.fit(X, y)

test_features = new_test.drop(columns=['ID'])
preds = xgb.predict(test_features)
new_test['item_price_month'] = preds

# Re-append the target as the LAST column of the training frame, mirroring
# where it was just appended on new_test.
monthly_sales = monthly_sales.drop(columns=['item_price_month']).assign(item_price_month=y)

In [None]:
# Stage 2: predict sales_duration; item_price_month is now an input feature.
remaining_targets = ['sales_duration', 'item_cnt_month']
X = monthly_sales.drop(columns=remaining_targets)
y = monthly_sales['sales_duration']

xgb.fit(X, y)

test_features = new_test.drop(columns=['ID'])
preds = xgb.predict(test_features)
new_test['sales_duration'] = preds

# Re-append the target as the LAST column of the training frame, mirroring
# where it was just appended on new_test.
monthly_sales = monthly_sales.drop(columns=['sales_duration']).assign(sales_duration=y)

In [None]:
# Stage 3: predict item_cnt_month from every other column of the training frame.
final_target = 'item_cnt_month'
X = monthly_sales.drop(columns=[final_target])
y = monthly_sales[final_target]

xgb.fit(X, y)

test_features = new_test.drop(columns=['ID'])
preds = xgb.predict(test_features)
new_test[final_target] = preds

# Preparing Submission:

In [None]:
# Submission frame: the row ID and its predicted monthly count.
result = new_test[['ID', 'item_cnt_month']].copy()

**We don't want values more than 1 in our predictions. So, we round all predictions greater than 1 to 1.**

In [None]:
# Display the submission frame for a visual check.
# NOTE(review): the accompanying markdown says predictions greater than 1 are
# capped at 1, but no statement in this notebook changes `item_cnt_month` after
# it is predicted — confirm whether a clipping step is missing.
result

In [None]:
# Write the submission file to the current directory without the index column.
result.to_csv('submission.csv', index=False)