In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

In [None]:
# Load sales_train.csv and show the resulting frame as the cell output.
train_df = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/sales_train.csv'
)
train_df

In [None]:
# Load test.csv and show the resulting frame as the cell output.
test_df = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/test.csv'
)
test_df

# Exploring and Pre-processing

## Examining 2 numeric columns

In [None]:
# Box plot of the raw prices; the trailing semicolon hides the matplotlib repr.
plt.boxplot(train_df['item_price']);

In [None]:
# Box plot of the raw daily counts; the trailing semicolon hides the matplotlib repr.
plt.boxplot(train_df['item_cnt_day']);

There are items out of range that can be considered outliers. We will eliminate them.

In [None]:
# Keep only rows with item_price < 100000 and item_cnt_day < 1000.
# .copy() detaches the filtered frame from the original, so the `.loc[...] = value`
# assignments made on train_df further down do not raise SettingWithCopyWarning.
train_df = train_df[(train_df['item_price'] < 100000)
                   & (train_df['item_cnt_day'] < 1000)].copy()

Let's examine further with describe function.

In [None]:
# Summary statistics of the two numeric columns.
train_df.loc[:, ['item_price', 'item_cnt_day']].describe()

Both columns contain negative values. Since the `item_cnt_day` values are going to be clipped into (0, 20) anyway, the negatives there are no big deal, so we will focus on `item_price`.

In [None]:
# Show the rows whose price is negative.
train_df.loc[train_df['item_price'] < 0]

Let's replace this value with the median of the positive prices recorded for the same shop and item.

In [None]:
# Median of the positive prices recorded for item 2973 in shop 32.
median = train_df.loc[
    (train_df['shop_id'] == 32)
    & (train_df['item_id'] == 2973)
    & (train_df['item_price'] > 0),
    'item_price',
].median()

In [None]:
# Replace every negative price with the median computed above.
train_df['item_price'] = train_df['item_price'].mask(train_df['item_price'] < 0, median)

In [None]:
# Raise negative daily counts to zero; non-negative values are unchanged.
train_df['item_cnt_day'] = train_df['item_cnt_day'].clip(lower=0)

## Clean duplicated `shop_id`

Some shops appear under the same name but with different `shop_id` values; we will simply merge them.

In [None]:
# Remap duplicated shop ids onto a single id, in both the train and the test frame:
#   0  -> 57  Yakutsk, Ordzhonikidze 56
#   1  -> 58  Yakutsk, "Tsentralny" shopping centre
#   10 -> 11  Zhukovsky, Chkalova St. 39 m²
train_df['shop_id'] = train_df['shop_id'].replace({0: 57, 1: 58, 10: 11})
test_df['shop_id'] = test_df['shop_id'].replace({0: 57, 1: 58, 10: 11})

## See trend of `item_price` and `item_cnt_day` through years

Before jumping into this: it is dangerous to bring a lot of information from the `test` dataset into our training process. However, in this case, by examining the `test` set, I found that there are items that are not present in the `train` set.

In [None]:
# Restrict the training rows to items that also occur in the test set, then
# count how many distinct test items never occur in the training rows.
train_df1 = train_df.loc[train_df['item_id'].isin(test_df['item_id'])]
test_df['item_id'].nunique() - train_df1['item_id'].nunique()

There are 363 items in the `test` set that are not in the `train` set. How are we handling these?

A good idea is to consider any median value of items that have same category type with them. On the other hand, Kaggle target value should be in (0, 20). My take on this is that these items should have the target value of `0`. To ensure this, I will do further analysis on this.

How about the other items that are present in the `train` set but not in the `test` set? The visualization below might give you some helpful insights.

In the cell above, `train_df1` is the `train` set restricted to the items that appear in the `test` set. We will sum the attributes of all these items for each month and look at their trend in every year.

In [None]:
def see_year_trend(field, df, date_format=None):
    """Plot and return the monthly totals of `field`, one line per calendar year.

    Parameters
    ----------
    field : str
        Name of the column to sum (the notebook uses 'item_price' and
        'item_cnt_day').
    df : pandas.DataFrame
        Frame holding a 'date' column parseable by `pd.to_datetime` and the
        `field` column. It is not modified.
    date_format : str, optional
        Explicit format forwarded to `pd.to_datetime`. The default (None) keeps
        pandas' automatic parsing.
        NOTE(review): if the 'date' strings are day-first (e.g. 'DD.MM.YYYY'),
        automatic parsing may read ambiguous dates month-first; pass the real
        format (e.g. '%d.%m.%Y') after confirming it against the CSV.

    Returns
    -------
    pandas.DataFrame
        Indexed by month 1..12 (index name 'month') with one column per year
        (columns name 'year'). Month/year pairs without any row are NaN.
    """
    # Month and year must come from the frame that was passed in. Deriving them
    # from another frame aligns on the index and leaves NaN keys for every row
    # missing there, which groupby then silently drops.
    dates = pd.to_datetime(df['date'], format=date_format)
    totals = (
        df[field]
        .groupby([dates.dt.month.rename('month'), dates.dt.year.rename('year')])
        .sum()
        .unstack('year')
        # Always expose all twelve months so every year shares the same x axis;
        # months with no data (e.g. the end of the last year) stay NaN.
        .reindex(pd.Index(range(1, 13), name='month'))
    )
    totals.plot()
    plt.title(f'Trend of {field} throughout 3 years')
    return totals

In [None]:
# Yearly trend of summed item_price: first for the rows restricted to test-set
# items (train_df1), then for the whole training frame.
see_year_trend('item_price', train_df1)
see_year_trend('item_price', train_df);

In [None]:
# Same comparison for the summed daily counts.
see_year_trend('item_cnt_day', train_df1)
see_year_trend('item_cnt_day', train_df);

Without those additional items in the `train` set, the shops' incomes in 2013 and 2014 seem to exceed those of 2015. What does this mean?

The redundant items in the `train` set are actually those that were popular in the past, but not in 2015. If we were building a model that requires the precise trend of 2015 in our `train` set, eliminating those items might be ideal. However, since I am working on how `item_price` may affect monthly sales, I am going to keep those items for my models.

## Exploring buyers' behaviour throughout time

Let's examine how many units of each item were sold in the month in which it last appears.

In [None]:
def get_last_sale(df):
    """Return the count and block number of the latest month present in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single item with 'date_block_num' and 'item_cnt_month' columns.

    Returns
    -------
    pandas.Series
        The 'item_cnt_month' and 'date_block_num' values of the row with the
        highest 'date_block_num'.
    """
    # Sort a copy instead of sorting in place: this function is used with
    # groupby.apply, and mutating the group it receives changes the caller's data.
    latest = df.sort_values('date_block_num').iloc[-1]
    return latest[['item_cnt_month', 'date_block_num']]

# Total count per (item, month block), stored under the name item_cnt_month.
grouped_by_id = (
    train_df
    .groupby(['item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

In [None]:
# For every item, keep the monthly count and block number of its last recorded month.
last_monthly_sales = grouped_by_id.groupby('item_id').apply(get_last_sale)
last_monthly_sales

In [None]:
# Histogram of the last monthly count per item, with unit-wide bins from 0 to 9.
plt.hist(last_monthly_sales['item_cnt_month'], bins=list(range(10)));

In [None]:
# Frequency of each last-month count value.
last_monthly_sales['item_cnt_month'].value_counts()

In [None]:
# Histogram of the block in which each item was last sold, with unit-wide bins from 0 to 34.
plt.hist(last_monthly_sales['date_block_num'], bins=list(range(35)));

In [None]:
# Frequency of each last-sale block number.
last_monthly_sales['date_block_num'].value_counts()

The last `item_cnt_month` values of the items concentrate around 1, and the last `date_block_num` values concentrate around 10/2015.

Most items have their last purchase in late 2015, which means that the items that were only popular in 2013 and 2014 (items that are not present in the `test` set) are not valuable for further analysis. From now on we will examine `train_df1` instead.

Now, we will see how many items appear in each `date_block_num` over the whole surveyed period in `train_df1`.

In [None]:
# Total count per (month block, item) for the test-item subset, named item_cnt_month.
grouped_by_block = (
    train_df1
    .groupby(['date_block_num', 'item_id'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
grouped_by_block

In [None]:
# Number of (block, item) rows per month block, i.e. how many distinct items sold each month.
plt.figure(figsize=(16, 5))
grouped_by_block['date_block_num'].hist(bins=list(range(35)))
plt.xticks(list(range(35)));

The diversity of items purchased increases each month; however, in our yearly trend visualization we did not see a significant increase in monthly sales. We can assume that in 2015 people tended to buy more different products rather than a lot of a single product. That's why the target values are set in (0, 20) and should converge to (0, 1), which means our target values will have an exponential distribution.

In [None]:
# Distribution of the monthly counts with unit-wide bins from 0 to 99;
# the red vertical line marks their mean, which is also printed.
plt.figure(figsize=(16, 5))
expectancy = grouped_by_block['item_cnt_month'].mean()
grouped_by_block['item_cnt_month'].hist(bins=list(range(100)))
plt.axvline(expectancy, color='r', linestyle='-')
print('Expectency = ', expectancy)

#### A K-neighbors Regressor is an ideal model for target values that have such distribution

# Model training

## Idea for using `item_price`

In the market domain, handling the time series is sometimes unnecessary if you narrow the problem down to this: if the price of an item is high, not many people will buy it.

In [None]:
# Sum the daily counts per (month block, shop, item, price). Because the price is
# part of the key, one shop/item/month can yield several rows, one per distinct price.
grouped = (
    train_df
    .groupby(['date_block_num', 'shop_id', 'item_id', 'item_price'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
grouped

In [None]:
# Summary statistics of the aggregated target before clipping.
grouped['item_cnt_month'].describe()

In [None]:
# Clip the target into the [0, 20] range.
grouped['item_cnt_month'] = grouped['item_cnt_month'].clip(0, 20)

In [None]:
# Single-feature design matrix of shape (n_rows, 1) holding the price, and the clipped target.
X1 = grouped[['item_price']].to_numpy()
y1 = grouped['item_cnt_month']

In [None]:
# Distribution of the clipped target in 100 bins.
plt.hist(y1, bins=100);

## Using K-neighbors Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# 100-neighbour regressor fitted on the natural log of the price.
# The log requires strictly positive prices in X1.
KNR = KNeighborsRegressor(n_neighbors=100)
KNR.fit(np.log(X1), y1)

In [None]:
# Predict for the rows of month block 33, using the same log-price feature.
Xt = grouped.loc[grouped['date_block_num'] == 33, ['item_price']].to_numpy()
yt = KNR.predict(np.log(Xt))

In [None]:
# Distribution of the block-33 predictions in 50 bins.
plt.hist(yt, bins=50);

In [None]:
# Root-mean-squared error on month block 33.
# NOTE: the regressor was fitted on every row of `grouped`, block 33 included,
# so this is an in-sample error rather than a hold-out score.
y_val = grouped.loc[grouped['date_block_num'] == 33, 'item_cnt_month'].to_numpy()
errors = y_val - yt
RMSE = np.sqrt(np.mean(errors ** 2))
RMSE

In [None]:
# Median price of every (shop, item) pair in the training data.
median_price = (
    train_df
    .groupby(['shop_id', 'item_id'], as_index=False)['item_price']
    .median()
)
median_price

In [None]:
# Load items.csv without the item_name column and preview its merge with `grouped`.
cate_df = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/items.csv'
).drop(columns='item_name')
grouped.merge(cate_df)

In [None]:
# Total clipped count per month block, expressed as a ratio to block 33,
# and the mean of those ratios.
gido = grouped.groupby('date_block_num')['item_cnt_month'].sum()
division = (gido / gido[33]).to_numpy()
mean_division = division.mean()
    

In [None]:
# Attach the (shop, item) median price to every test row; pairs without a
# match keep NaN in item_price.
test1 = pd.merge(test_df, median_price, how='left')
test1

In [None]:
def get_median_of_nan(x):
    """Return the median `item_price` in the global `grouped` frame for item id `x`."""
    prices = grouped.loc[grouped['item_id'] == x, 'item_price']
    return prices.median()

# Items whose (shop, item) pair has no price but which do occur in `grouped`,
# and the per-item median price for exactly those items.
no_shop_item = test1.loc[
    test1['item_price'].isna() & test1['item_id'].isin(grouped['item_id']),
    'item_id',
]
median_price_no_shop_item = (
    grouped.loc[grouped['item_id'].isin(no_shop_item)]
    .groupby('item_id')['item_price']
    .median()
)
median_price_no_shop_item

In [None]:
# Fill only the rows that still lack a price with the item-level median.
# Restricting the assignment to NaN rows keeps the shop-specific medians already
# present for the same item in other shops; items absent from the lookup map to
# NaN and therefore stay missing. The vectorised map replaces a per-item loop.
missing_price = test1['item_price'].isna()
test1.loc[missing_price, 'item_price'] = (
    test1.loc[missing_price, 'item_id'].map(median_price_no_shop_item)
)

test1

In [None]:
# Remaining missing values become 0; test_df is then rebound to this
# price-augmented frame, so from here on test_df carries an item_price column.
test1 = test1.fillna(0)
test_df = test1.copy()

In [None]:
# Work on a fresh copy and remember the index labels of rows whose price is 0.
test1 = test_df.copy()
zero_index = test1.index[test1['item_price'] == 0]

In [None]:
# Log-price feature for the test rows; the small offset keeps log() finite for zero prices.
X_test1 = np.log(test1[['item_price']].to_numpy() + 0.00001)
y_predict = KNR.predict(X_test1)
plt.hist(y_predict, bins=100);

In [None]:
# Recompute the per-block totals relative to block 33 and display the ratios.
gido = grouped.groupby('date_block_num')['item_cnt_month'].sum()
division = (gido / gido[33]).to_numpy()
division

In [None]:
# Rescale the predictions by the mean block ratio.
# NOTE: running this cell twice divides twice; re-run the prediction cell first.
y_predict = np.divide(y_predict, mean_division)

In [None]:
# Force a zero prediction for the rows whose price was 0, then attach the
# predictions as the item_cnt_month column.
y_predict[zero_index] = 0
test1 = test1.assign(item_cnt_month=y_predict)

In [None]:
# Keep only ID and the prediction.
test1 = test1.drop(columns=['shop_id', 'item_id', 'item_price'])

In [None]:
# Use ID as the index so it is written as the first CSV column.
test1 = test1.set_index('ID')

In [None]:
# Distribution of the final KNR predictions in 100 bins.
test1['item_cnt_month'].hist(bins=100)

In [None]:
# Write the KNR submission file, indexed by ID.
test1.to_csv('KNR_submission.csv')  # 1.48495 — presumably the score this submission obtained; TODO confirm

In [None]:
import pickle

# Persist the fitted regressor. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open('KNR.pkl', 'wb') as model_file:
    pickle.dump(KNR, model_file)

## Using Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with scikit-learn's default hyper-parameters (no random_state is set).
DTR = DecisionTreeRegressor()

In [None]:
# Add the item category to the aggregated rows and rebuild X1 with four features.
# X1 is rebound here; it no longer holds the single-price matrix used for the KNR.
for_DTR = pd.merge(grouped, cate_df)
X1 = for_DTR.loc[:, ['shop_id', 'item_id', 'item_price', 'item_category_id']].to_numpy()

In [None]:
# y1 was taken from `grouped` while X1 comes from the merged `for_DTR`.
# NOTE(review): this assumes the merge kept the rows of `grouped` in the same
# order and number — confirm, or take the target from `for_DTR` instead.
DTR.fit(X1, y1)

In [None]:
# Display the current test frame (it was rebound earlier to include item_price).
test_df

In [None]:
# Start again from a fresh copy of the test frame for the decision-tree predictions.
test1 = test_df.copy()

In [None]:
# Add the item category and drop ID, leaving the columns in the same order as
# the training features: shop_id, item_id, item_price, item_category_id.
test_DTR = test1.merge(cate_df).drop(columns='ID')
test_DTR

In [None]:
# Feature matrix for the test rows; column order comes from test_DTR.
X_test_DTR = test_DTR.to_numpy()

In [None]:
# Predict with the tree and force zero for the rows whose price was 0.
# NOTE(review): zero_index was computed on an earlier copy of the test frame;
# this assumes test_DTR has the same row order — confirm.
y_pred_DTR = DTR.predict(X_test_DTR)
y_pred_DTR[zero_index] = 0
plt.hist(y_pred_DTR, bins=100);

In [None]:
# Attach the tree predictions and keep only ID and item_cnt_month.
test1 = (
    test1
    .assign(item_cnt_month=y_pred_DTR)
    .drop(columns=['shop_id', 'item_id', 'item_price'])
)

In [None]:
# Index by ID and write the decision-tree submission file.
test1 = test1.set_index('ID')
test1.to_csv('DTR_submission.csv')  # 1.60434 — presumably the score this submission obtained; TODO confirm

## Using Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# NOTE(review): learning_rate=0.00001 with the default number of estimators lets
# each tree move the prediction only marginally — confirm this value is intended.
GBR = GradientBoostingRegressor(learning_rate=0.00001)
# X1 is rebound once more: five features, with date_block_num first.
X1 = for_DTR[['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_category_id']].to_numpy()
X1

In [None]:
# y1 still comes from `grouped`, X1 from the merged `for_DTR`.
# NOTE(review): assumes both have identical row order — confirm.
GBR.fit(X1, y1)

In [None]:
# Build the GBR test features. The ID column is renamed to 'item_cnt_month' and
# filled with 34, so it sits in the first position where the training matrix has
# date_block_num. Despite its name, this column holds the block number here;
# it is overwritten with the predictions in a later cell.
test1 = test_df.merge(cate_df).rename(columns={'ID': 'item_cnt_month'})
test1['item_cnt_month'] = 34
test1

In [None]:
# Feature matrix for the GBR; relies on test1's column order matching the training features.
# NOTE: must run before test1['item_cnt_month'] is overwritten with predictions.
X_test_GBR = test1.to_numpy()
X_test_GBR

In [None]:
# Predict, then rescale: min-max normalise the raw predictions to [0, 1],
# shift by 0.1 and multiply by the raw maximum, giving values in
# [0.1 * max, 1.1 * max]. The right-hand side is evaluated entirely on the raw
# predictions before the name is rebound.
# NOTE: if all raw predictions are equal, the denominator is 0.
y_pred_GBR = GBR.predict(X_test_GBR)
y_pred_GBR = ((y_pred_GBR - y_pred_GBR.min()) / (y_pred_GBR.max() - y_pred_GBR.min()) + 0.1) * y_pred_GBR.max()
plt.hist(y_pred_GBR, bins=100);

In [None]:
# Overwrite the placeholder column with the rescaled predictions and drop the features.
test1['item_cnt_month'] = y_pred_GBR
submission = test1.drop(['shop_id', 'item_id', 'item_price', 'item_category_id'], axis=1)
# The original ID column was renamed and overwritten earlier, so the frame's row
# index is used as the ID.
# NOTE(review): assumes test IDs equal the row positions 0..n-1 — confirm against test.csv.
submission.index.name = 'ID'

submission['item_cnt_month'].value_counts()

In [None]:
# Write the gradient-boosting submission file.
submission.to_csv('GBR_submission.csv')  # 1.2803 — presumably the score this submission obtained; TODO confirm

In [None]:
# Display the final submission frame.
submission