In [None]:
#Loading all needed libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
import xgboost
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import calendar
from datetime import datetime
import matplotlib.pyplot as plt
from math import ceil
%matplotlib inline

In [None]:
#Read the five competition CSV files, in a fixed order, into their dataframes.
test, sales, shops, items, item_cats = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/competitive-data-science-predict-future-sales/test.csv',
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
    )
)

In [None]:
def plot_by_id_bucket(data, x, hue, num_graph=10, ncols=2):
    """Draw item_cnt_day against `x` on a grid of point plots, one plot per bucket of `hue` ids.

    The ids in data[hue] are cut into `num_graph` consecutive, equally wide ranges
    [k*width, (k+1)*width) and each range is drawn on its own axes (row-major order)
    so that no single plot has to show every hue level at once.

    data      -- dataframe holding the columns `x`, `hue` and 'item_cnt_day'
    x         -- column used for the x axis
    hue       -- integer id column used both for bucketing and for colouring
    num_graph -- number of buckets / axes
    ncols     -- number of columns in the axes grid
    """
    nrows = ceil(num_graph / ncols)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, sharey=True,
                             figsize=(16, 20), squeeze=False)
    #Ids start at 0, so there are max+1 possible ids. Using max alone would leave the
    #highest id out of every bucket whenever max is an exact multiple of num_graph.
    id_per_graph = ceil((data[hue].max() + 1) / num_graph)
    for count, ax in enumerate(axes.ravel()[:num_graph]):
        low = count * id_per_graph
        high = (count + 1) * id_per_graph
        bucket = data[(data[hue] >= low) & (data[hue] < high)]
        if bucket.empty:
            #Nothing to draw for this id range; leave the axes blank.
            continue
        sns.pointplot(x=x, y='item_cnt_day', hue=hue, data=bucket, ax=ax)


#Total units sold per shop per month, plotted in groups of shop ids.
grouped = sales.groupby(['shop_id', 'date_block_num'], as_index=False)['item_cnt_day'].sum()
plot_by_id_bucket(grouped, x='date_block_num', hue='shop_id')

#Attach each sale's item_category_id and derive zero-padded month / four-digit year strings.
#The date column is parsed once for the whole column instead of twice per row.
train = sales.merge(items.drop(columns='item_name'), on='item_id', how='left')
parsed_date = pd.to_datetime(train['date'], format='%d.%m.%Y')
train['month'] = parsed_date.dt.strftime('%m')
train['year'] = parsed_date.dt.strftime('%Y')

#Daily sales per item category, first by calendar month and then by date_block_num.
plot_by_id_bucket(train, x='month', hue='item_category_id')
plot_by_id_bucket(train, x='date_block_num', hue='item_category_id')


In [None]:
#Box plot of the daily unit counts to spot extreme values.
sns.boxplot(x=sales['item_cnt_day'])

In [None]:
#Box plot of the item prices to spot extreme values.
sns.boxplot(x=sales['item_price'])

In [None]:
#Keep only sales whose price is strictly positive and below 100,000 and whose daily count is
#below 1001. Both masks are built on `sales` and combined before indexing, so the boolean
#index always lines up with the frame being filtered (filtering `train` with a mask built on
#`sales` relies on pandas silently reindexing the mask).
valid_price = (sales.item_price > 0) & (sales.item_price < 100000)
valid_count = sales.item_cnt_day < 1001
#.copy() makes train an independent frame, so later in-place edits of train neither touch
#`sales` nor trigger SettingWithCopy warnings.
train = sales[valid_price & valid_count].copy()

In [None]:
#Print the names behind each pair of shop ids that are suspected to be the same shop.
for duplicate_ids in ([0, 57], [1, 58], [40, 39], [10, 11]):
    print(shops[shops.shop_id.isin(duplicate_ids)]['shop_name'])

In [None]:
#Collapse shops that appear under two ids onto a single id, in both train and test.
#Four pairs are handled: 0 -> 57, 1 -> 58, 40 -> 39 and 10 -> 11 (old id -> id that is kept).
for frame in (train, test):
    for old_id, kept_id in ((0, 57), (1, 58), (40, 39), (10, 11)):
        frame.loc[frame.shop_id == old_id, 'shop_id'] = kept_id

In [None]:
#Build the base training dataframe that all later features are attached to.
#Its key columns are shop_id, item_id and date_block_num.
index_cols = ['shop_id', 'item_id', 'date_block_num']

#For every month (date_block_num) present in train:
#    take the unique shop ids and the unique item ids that have a sale in that month,
#    and add their full Cartesian product (every shop x every item) for that month.
#The month mask is built from train itself: train is a filtered subset of sales, so a mask
#built on sales has a different index and only works through implicit reindexing.
grid_blocks = []
for block_num in train['date_block_num'].unique():
    in_block = train['date_block_num'] == block_num
    cur_shops = train.loc[in_block, 'shop_id'].unique()
    cur_items = train.loc[in_block, 'item_id'].unique()
    grid_blocks.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

#Stack the per-month arrays into one dataframe.
df = pd.DataFrame(np.vstack(grid_blocks), columns=index_cols, dtype=np.int32)

#Sum item_cnt_day per (month, shop, item) to get the monthly sales, named 'item_cnt_month'.
group = train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

#Attach the monthly sales to the grid. Grid rows without any sale get NaN from the left join,
#which is replaced by 0; the result is then clipped into the range [0, 20].
df = pd.merge(df, group, on=index_cols, how='left')
df['item_cnt_month'] = (df['item_cnt_month']
                                .fillna(0)
                                .clip(0, 20)
                                .astype(np.float16))

In [None]:
#Append the test rows to the dataframe as month 34 so they receive the same features as train.
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)
#No `keys` argument: the old call passed three keys for two frames, and they were discarded
#anyway because ignore_index=True rebuilds the row index from scratch.
df = pd.concat([df, test], ignore_index=True, sort=False)
#Columns that exist in only one of the two frames are NaN for the other frame's rows
#(this includes item_cnt_month for the test rows); set all of them to 0.
df.fillna(0, inplace=True)

In [None]:
#The city of a shop is taken to be the first whitespace-separated word of its name, lower-cased.
shops['city'] = shops['shop_name'].apply(lambda name: name.split()[0].lower())
#One name carries a leading exclamation mark; fold it into the regular spelling.
shops.loc[shops['city'] == '!якутск', 'city'] = 'якутск'
#Integer code per distinct city value.
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])

#Hard-coded lookup: city -> (coordinate 1, coordinate 2, country-part code).
#The two coordinates look like latitude and longitude. Three keys carry (0, 0, 0);
#presumably these first words are not real cities - NOTE(review): confirm.
coords = {
    'якутск': (62.028098, 129.732555, 4),
    'адыгея': (44.609764, 40.100516, 3),
    'балашиха': (55.8094500, 37.9580600, 1),
    'волжский': (53.4305800, 50.1190000, 3),
    'вологда': (59.2239000, 39.8839800, 2),
    'воронеж': (51.6720400, 39.1843000, 3),
    'выездная': (0, 0, 0),
    'жуковский': (55.5952800, 38.1202800, 1),
    'интернет-магазин': (0, 0, 0),
    'казань': (55.7887400, 49.1221400, 4),
    'калуга': (54.5293000, 36.2754200, 4),
    'коломна': (55.0794400, 38.7783300, 4),
    'красноярск': (56.0183900, 92.8671700, 4),
    'курск': (51.7373300, 36.1873500, 3),
    'москва': (55.7522200, 37.6155600, 1),
    'мытищи': (55.9116300, 37.7307600, 1),
    'н.новгород': (56.3286700, 44.0020500, 4),
    'новосибирск': (55.0415000, 82.9346000, 4),
    'омск': (54.9924400, 73.3685900, 4),
    'ростовнадону': (47.2313500, 39.7232800, 3),
    'спб': (59.9386300, 30.3141300, 2),
    'самара': (53.2000700, 50.1500000, 4),
    'сергиев': (56.3000000, 38.1333300, 4),
    'сургут': (61.2500000, 73.4166700, 4),
    'томск': (56.4977100, 84.9743700, 4),
    'тюмень': (57.1522200, 65.5272200, 4),
    'уфа': (54.7430600, 55.9677900, 4),
    'химки': (55.8970400, 37.4296900, 1),
    'цифровой': (0, 0, 0),
    'чехов': (55.1477000, 37.4772800, 4),
    'ярославль': (57.6298700, 39.8736800, 2),
}

#One column per element of the lookup tuple. A city missing from coords raises KeyError.
for position, column in enumerate(['city_coord_1', 'city_coord_2', 'country_part']):
    shops[column] = shops['city'].apply(lambda city, position=position: coords[city][position])

#Keep only the id and the derived numeric features.
shops = shops[['shop_id', 'city_code', 'city_coord_1', 'city_coord_2', 'country_part']]

In [None]:
#Left-join the per-shop features (city code, the two city coordinates, country part) onto
#every row of the dataframe via shop_id.
df = pd.merge(left=df, right=shops, how='left', on=['shop_id'])

In [None]:
#Top-level category names that should be folded into another top-level category.
#Keys and values are whitespace-stripped names, matching the stripped prefix computed below.
map_dict = {
            'Чистые носители (штучные)': 'Чистые носители',
            'Чистые носители (шпиль)': 'Чистые носители',
            'PC': 'Аксессуары',
            }

#Attach item_category_name to every item.
items = pd.merge(items, item_cats, on='item_category_id')

#The top-level category is the part of item_category_name before the first '-'.
#It is stripped because names look like 'X - Y', so the raw prefix is 'X ' with a trailing
#space. Without the strip, 'PC ' was mapped to 'Аксессуары' while genuine accessory rows
#produced 'Аксессуары ', and the two were encoded as different categories. Stripping also
#unifies 'Служебные' with 'Служебные ' without needing an explicit mapping entry.
items['item_category'] = items['item_category_name'].apply(lambda x: x.split('-')[0].strip())
items['item_category'] = items['item_category'].apply(lambda x: map_dict.get(x, x))
#Integer code of the (merged) top-level category.
items['item_category_common'] = LabelEncoder().fit_transform(items['item_category'])

#Integer code of the full item_category_name.
items['item_category_code'] = LabelEncoder().fit_transform(items['item_category_name'])
items = items[['item_id', 'item_category_common', 'item_category_code']]

In [None]:
#Left-join the per-item category codes (item_category_common, item_category_code) onto
#every row of the dataframe via item_id.
df = pd.merge(left=df, right=items, how='left', on=['item_id'])

In [None]:
#Calendar helper used to turn a date_block_num into simple per-month calendar features.
def count_days(date_block_num):
    """Return (weeknd_count, days_in_month, month) for a date_block_num.

    Block 0 is taken to be January 2013 and each following block is one month later.

    weeknd_count  -- number of week rows of calendar.monthcalendar whose last column is a
                     real day of the month. With the calendar module's default Monday-first
                     weeks that last column is Sunday, so this is the number of Sundays.
    days_in_month -- number of days in the month.
    month         -- calendar month as an integer from 1 to 12.
    """
    years_elapsed, month_offset = divmod(date_block_num, 12)
    year = 2013 + years_elapsed
    month = month_offset + 1
    #monthcalendar pads days outside the month with 0.
    weeks = calendar.monthcalendar(year, month)
    weeknd_count = sum(1 for week in weeks if week[6] != 0)
    _, days_in_month = calendar.monthrange(year, month)
    return weeknd_count, days_in_month, month

#Precompute the count_days result for every date_block_num from 0 to 34.
#Each value is the tuple (weeknd_count, days_in_month, month).
map_dict = {block: count_days(block) for block in range(35)}

#'weeknd_count': first element of the tuple for the row's month.
df['weeknd_count'] = df['date_block_num'].apply(lambda block: map_dict[block][0])
#'days_in_month': second element of the tuple for the row's month.
df['days_in_month'] = df['date_block_num'].apply(lambda block: map_dict[block][1])

In [None]:
#Earliest date_block_num in which each item appears in the dataframe.
first_item_block = df.groupby(['item_id'])['date_block_num'].min().reset_index()
first_item_block['item_first_interaction'] = 1

#Earliest date_block_num in which each (shop, item) pair has a positive monthly count.
first_shop_item_buy_block = df[df['item_cnt_month'] > 0].groupby(['shop_id', 'item_id'])['date_block_num'].min().reset_index()
first_shop_item_buy_block['first_date_block_num'] = first_shop_item_buy_block['date_block_num']

#Joining on (item_id, date_block_num) marks only the rows of an item's first month with 1;
#every other row gets NaN here.
df = pd.merge(df, first_item_block[['item_id', 'date_block_num', 'item_first_interaction']], on=['item_id', 'date_block_num'], how='left')
#Give every row the first month in which its (shop, item) pair was sold; NaN if never sold.
df = pd.merge(df, first_shop_item_buy_block[['item_id', 'shop_id', 'first_date_block_num']], on=['item_id', 'shop_id'], how='left')

#shop_item_sold_before = 1 when the pair's first sale lies strictly before the row's month.
#Pairs that were never sold are given the sentinel 100, which is larger than any block number
#built here, so their flag is 0. The filled values are used directly rather than through
#df[col].fillna(..., inplace=True), which acts on a temporary under pandas Copy-on-Write.
df['shop_item_sold_before'] = (df['first_date_block_num'].fillna(100) < df['date_block_num']).astype('int8')
df.drop(['first_date_block_num'], axis=1, inplace=True)

#item_first_interaction = 1 on the rows of an item's first month, 0 everywhere else.
#The NaNs must be filled before the integer cast, which cannot represent NaN.
df['item_first_interaction'] = df['item_first_interaction'].fillna(0).astype('int8')

In [None]:
#Lag features: the value a column had for the same shop and item a given number of months earlier.
def lag_feature(df, lags, col):
    """Return df with one extra column '<col>_lag_<k>' for every k in lags.

    For each k, the rows of df are copied with date_block_num moved forward by k and then
    left-joined back on (date_block_num, shop_id, item_id), so each row receives the value
    `col` had for the same shop and item k blocks earlier. Rows without such an earlier row
    get NaN. Every lag column is stored as float16. The frame passed in is not modified.

    df   -- dataframe with the columns date_block_num, shop_id, item_id and `col`
    lags -- iterable of integer offsets
    col  -- name of the column to lag
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    #Snapshot taken once, before any lag column is added.
    base = df[join_keys + [col]]
    for lag in lags:
        lag_col = '{}_lag_{}'.format(col, lag)
        shifted = base.copy()
        shifted.columns = join_keys + [lag_col]
        #A value observed in block b becomes visible to the row of block b + lag.
        shifted['date_block_num'] += lag
        df = pd.merge(df, shifted, on=join_keys, how='left')
        df[lag_col] = df[lag_col].astype('float16')
    return df

#Lag the monthly sales themselves by one, two and three months, then preview the first rows.
df = lag_feature(df, lags=[1, 2, 3], col='item_cnt_month')
df.head()

In [None]:
#Mean price of each item in each shop in each month.
index_cols = ['shop_id', 'item_id', 'date_block_num']
group = train.groupby(index_cols)['item_price'].mean().reset_index().rename(columns={"item_price": "avg_shop_price"}, errors="raise")
df = pd.merge(df, group, on=index_cols, how='left')

#Rows without any sale of that item in that shop and month get 0.
#Prices are kept in float32: float16 cannot hold values above 65504 (the outlier filter keeps
#prices up to 100000, which would turn into inf) and rounds ordinary prices to about 3 digits.
df['avg_shop_price'] = (df['avg_shop_price']
                                .fillna(0)
                                .astype(np.float32))

#Mean price of each item in each month across all shops.
index_cols = ['item_id', 'date_block_num']
group = train.groupby(['date_block_num','item_id'])['item_price'].mean().reset_index().rename(columns={"item_price": "avg_item_price"}, errors="raise")
df = pd.merge(df, group, on=index_cols, how='left')
#Rows whose item has no sale at all in that month get 0.
df['avg_item_price'] = (df['avg_item_price']
                                .fillna(0)
                                .astype(np.float32))

#Relative deviation of the shop's mean price from the all-shop mean price of the same item
#in the same month. When the item has no price that month both means are 0 and 0/0 gives NaN,
#which is replaced by 0. The result of fillna is assigned back instead of using
#df[col].fillna(..., inplace=True), which acts on a temporary under pandas Copy-on-Write.
df['item_shop_price_avg'] = ((df['avg_shop_price'] - df['avg_item_price']) / df['avg_item_price']).fillna(0)

#Keep only the lagged deviation and drop the helper columns used to compute it.
df = lag_feature(df, [1, 2, 3], 'item_shop_price_avg')
df.drop(['avg_shop_price', 'avg_item_price', 'item_shop_price_avg'], axis=1, inplace=True)

In [None]:
#Target encoding per (month, item): the mean item_cnt_month over all rows of the dataframe
#that share the same month and item.
enc_keys = ['date_block_num', 'item_id']
item_id_target_mean = df.groupby(enc_keys)['item_cnt_month'].mean().reset_index()
item_id_target_mean = item_id_target_mean.rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")

#Attach the encoding to every row with the same month and item; missing values become 0.
df = pd.merge(df, item_id_target_mean, on=enc_keys, how='left')
df['item_target_enc'] = df['item_target_enc'].fillna(0).astype(np.float16)

#Only the 1-, 2- and 3-month lags are kept; the same-month encoding is dropped again.
df = lag_feature(df, [1, 2, 3], 'item_target_enc')
df = df.drop(columns=['item_target_enc'])

In [None]:
#Target encoding per (month, item, city): the mean item_cnt_month over all rows of the
#dataframe that share the same month, item and city_code.
enc_keys = ['date_block_num', 'item_id', 'city_code']
item_id_target_mean = df.groupby(enc_keys)['item_cnt_month'].mean().reset_index()
item_id_target_mean = item_id_target_mean.rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")

#Attach the encoding to every row with the same month, item and city; missing values become 0.
df = pd.merge(df, item_id_target_mean, on=enc_keys, how='left')
df['item_loc_target_enc'] = df['item_loc_target_enc'].fillna(0).astype(np.float16)

#Only the 1-, 2- and 3-month lags are kept; the same-month encoding is dropped again.
df = lag_feature(df, [1, 2, 3], 'item_loc_target_enc')
df = df.drop(columns=['item_loc_target_enc'])

In [None]:
#Target encoding per (month, item, shop): the mean item_cnt_month over all rows of the
#dataframe that share the same month, item and shop_id.
#NOTE(review): if every (month, item, shop) key occurs only once in df, this mean is just
#item_cnt_month itself and its lags duplicate the item_cnt_month lags - confirm and
#consider removing.
enc_keys = ['date_block_num', 'item_id', 'shop_id']
item_id_target_mean = df.groupby(enc_keys)['item_cnt_month'].mean().reset_index()
item_id_target_mean = item_id_target_mean.rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise")

#Attach the encoding to every row with the same month, item and shop; missing values become 0.
df = pd.merge(df, item_id_target_mean, on=enc_keys, how='left')
df['item_shop_target_enc'] = df['item_shop_target_enc'].fillna(0).astype(np.float16)

#Only the 1-, 2- and 3-month lags are kept; the same-month encoding is dropped again.
df = lag_feature(df, [1, 2, 3], 'item_shop_target_enc')
df = df.drop(columns=['item_shop_target_enc'])

In [None]:
#How well do brand-new items of a category sell? Take only the rows flagged with
#item_first_interaction == 1 and average their item_cnt_month per (month, item category).
#The average is per category rather than per item, since a new item has no history of its own.
enc_keys = ['date_block_num', 'item_category_code']
is_new_item = df['item_first_interaction'] == 1
item_id_target_mean = df[is_new_item].groupby(enc_keys)['item_cnt_month'].mean().reset_index()
item_id_target_mean = item_id_target_mean.rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise")

#Attach the average to every row with the same month and category; missing values become 0.
df = pd.merge(df, item_id_target_mean, on=enc_keys, how='left')
df['new_item_cat_avg'] = df['new_item_cat_avg'].fillna(0).astype(np.float16)

#Only the 1-, 2- and 3-month lags are kept; the same-month average is dropped again.
df = lag_feature(df, [1, 2, 3], 'new_item_cat_avg')
df = df.drop(columns=['new_item_cat_avg'])


In [None]:
#Same idea per shop: take only the rows flagged with item_first_interaction == 1 and
#average their item_cnt_month per (month, item category, shop).
enc_keys = ['date_block_num', 'item_category_code', 'shop_id']
is_new_item = df['item_first_interaction'] == 1
item_id_target_mean = df[is_new_item].groupby(enc_keys)['item_cnt_month'].mean().reset_index()
item_id_target_mean = item_id_target_mean.rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise")

#Attach the average to every row with the same month, category and shop; missing values become 0.
df = pd.merge(df, item_id_target_mean, on=enc_keys, how='left')
df['new_item_shop_cat_avg'] = df['new_item_shop_cat_avg'].fillna(0).astype(np.float16)

#Only the 1-, 2- and 3-month lags are kept; the same-month average is dropped again.
df = lag_feature(df, [1, 2, 3], 'new_item_shop_cat_avg')
df = df.drop(columns=['new_item_shop_cat_avg'])

In [None]:
#Lag values that do not exist are NaN; replace every remaining NaN with 0.
df.fillna(0, inplace=True)
#Blocks 0, 1 and 2 cannot have all three lags (the longest lag is 3 months), so they are dropped.
has_full_history = df['date_block_num'] > 2
df = df[has_full_history]
df.head()

In [None]:
#Remove the ID column if it is present (no error when it is absent) and store the
#finished feature table on disk as a pickle.
df = df.drop(columns=['ID'], errors='ignore')
df.to_pickle('df.pkl')

In [None]:
#Reload the feature table and split it by month.
df = pd.read_pickle('df.pkl')

target_col = 'item_cnt_month'
is_train = df.date_block_num < 33
is_valid = df.date_block_num == 33
is_test = df.date_block_num == 34

#Training set: every block before 33. X holds all columns except the target, Y only the target.
X_train = df[is_train].drop(columns=[target_col])
Y_train = df.loc[is_train, target_col]

#Validation set: block 33, split the same way.
X_valid = df[is_valid].drop(columns=[target_col])
Y_valid = df.loc[is_valid, target_col]

#Test set: block 34; only the inputs are needed.
X_test = df[is_test].drop(columns=[target_col])

#The full table is no longer needed once the splits exist.
del df

In [None]:
#Every column of X_train is used as a model input.
feature_name = list(X_train.columns)

#Columns that LightGBM is told to treat as categorical rather than numeric.
feature_name_indexes = [
    'country_part',
    'item_category_common',
    'item_category_code',
    'city_code',
]

#Booster parameters handed to lgb.train().
params = {
    'objective': 'mse',
    'metric': 'rmse',
    'num_leaves': 2 ** 7 - 1,
    'learning_rate': 0.005,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'seed': 1,
    'verbose': 1,
}

#Wrap the training and validation splits as LightGBM datasets; the validation set
#references the training set.
lgb_train = lgb.Dataset(X_train[feature_name], Y_train)
lgb_eval = lgb.Dataset(X_valid[feature_name], Y_valid, reference=lgb_train)

#Filled by lgb.train through the evals_result argument.
evals_result = {}

#Train for at most 3000 rounds, evaluating on both the training and the validation set.
#NOTE(review): verbose_eval, evals_result and early_stopping_rounds may no longer be accepted
#as keyword arguments by newer LightGBM releases (callbacks replace them) - confirm against
#the installed version before upgrading.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=(lgb_train, lgb_eval),
    feature_name=feature_name,
    categorical_feature=feature_name_indexes,
    verbose_eval=5,
    evals_result=evals_result,
    early_stopping_rounds=100,
)

In [None]:
#Plot up to 50 features ranked by their 'gain' importance.
lgb.plot_importance(
    gbm, 
    max_num_features=50, 
    importance_type='gain', 
    figsize=(12,8));

#Predict the test rows and clip the predictions into [0, 20], the same range the training
#target was clipped to.
#test.csv is read again because the in-memory `test` frame was modified earlier in the notebook.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
Y_test = gbm.predict(X_test[feature_name]).clip(0, 20)

#Use the ID column of test.csv itself. The positional row index only matches it if the IDs
#happen to be 0..N-1 in file order.
#NOTE(review): this assumes X_test is still in the row order of test.csv - the features were
#attached with left joins after test was appended; confirm.
submission = pd.DataFrame({
    "ID": test['ID'], 
    "item_cnt_month": Y_test
})
submission.to_csv('gbm_submission.csv', index=False)