In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import read_csv
import seaborn as sns
import time

from collections import Counter
from scipy import stats
import random


# Exploring and Predicting Sales

## Description of this competition:
This challenge serves as final project for the "How to win a data science competition" Coursera course.

In this competition you will work with a challenging time-series dataset consisting of daily sales data, kindly provided by one of the largest Russian software firms - 1C Company. 

We are asking you to predict total sales for every product and store in the next month. By solving this competition you will be able to apply and enhance your data science skills.

<b>Data fields</b><br>
<b>ID</b> - an Id that represents a (Shop, Item) tuple within the test set<br>
<b>shop_id</b> - unique identifier of a shop<br>
<b>item_id</b> - unique identifier of a product<br>
<b>item_category_id</b> - unique identifier of item category<br>
<b>item_cnt_day</b> - number of products sold. You are predicting a monthly amount of this measure<br>
<b>item_price</b> - current price of an item<br>
<b>date</b> - date in format dd.mm.yyyy<br>
<b>date_block_num</b> - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33<br>
<b>item_name</b> - name of item<br>
<b>shop_name</b> - name of shop<br>
<b>item_category_name</b> - name of item category

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to the narrowest dtype that holds their values.

    * ``object`` columns are converted to ``category``.
    * Integer columns (int16/int32/int64) are cast to the first of
      int8/int16/int32/int64 whose range contains the column's min and max.
    * Float columns are cast to float16, then float32, falling back to float64.

    Range checks are inclusive, so a column whose extreme value equals a
    dtype limit (e.g. 127 for int8) still gets that dtype.

    NOTE: float16 keeps only about 3 significant decimal digits; columns that
    need more precision (prices, revenue) lose accuracy when downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes

        if col_type == 'object':
            df[col] = df[col].astype('category')

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Smallest integer type first; an empty column (min/max are NaN)
                # matches nothing and is left untouched.
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in (np.float16, np.float32):
                    limits = np.finfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Out of float32 range, or an all-NaN column.
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [None]:
import pandas as pd
# Read every competition CSV from the Kaggle input folder in one pass.
rootfolder = '/kaggle/input/competitive-data-science-predict-future-sales'
items_df, shops_df, icats_df, sales_train, smpsb_df, test = (
    pd.read_csv(f'{rootfolder}/{csv_name}')
    for csv_name in (
        'items.csv',
        'shops.csv',
        'item_categories.csv',
        'sales_train.csv',
        'sample_submission.csv',
        'test.csv',
    )
)

In [None]:
sales_train.head()


In [None]:
test.head()

In [None]:
shops_df.info()

In [None]:
items_df.info()

In [None]:
icats_df.info()


New feature the total price


# Explore target feature



In [None]:


# Box plot of the daily sold quantity, with a fixed x range.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_xlim(-100, 3000)
sns.boxplot(x=sales_train.item_cnt_day, ax=ax)

# Box plot of the item price, with 10% head-room to the right of the maximum.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_xlim(sales_train.item_price.min(), sales_train.item_price.max()*1.1)
sns.boxplot(x=sales_train.item_price, ax=ax)



In [None]:
# Plot the raw `date` column against the consecutive month index `date_block_num`.
# NOTE(review): `date` has not been parsed with pd.to_datetime at this point (that
# happens further down), so x holds the values exactly as read from the CSV —
# confirm the resulting axis ordering is what you expect.
sns.scatterplot(x=sales_train.date,y=sales_train.date_block_num)

In [None]:

# Drop outliers: keep rows with a price below 100000 and a daily count below 1001.
# The explicit .copy() gives later cells an independent frame to assign into,
# instead of a filtered view-like result that triggers SettingWithCopyWarning.
sales_train = sales_train[
    (sales_train.item_price < 100000) & (sales_train.item_cnt_day < 1001)
].copy()



There is one item with price below zero. Fill it with median.


In [None]:
# Reference rows: shop 32, item 2973, month block 4, with a positive price.
reference_rows = sales_train[
    (sales_train.shop_id == 32)
    & (sales_train.item_id == 2973)
    & (sales_train.date_block_num == 4)
    & (sales_train.item_price > 0)
]
median = reference_rows.item_price.median()
# Every negative price is overwritten with that median.
sales_train.loc[sales_train.item_price < 0, 'item_price'] = median

Month

In [None]:

# Number of sales rows per month block.
fig, ax = plt.subplots(figsize=(35, 10))
sns.countplot(x='date_block_num', data=sales_train, ax=ax);



Shops

In [None]:
# Number of sales rows per shop.
fig, ax = plt.subplots(figsize=(35, 10))
sns.countplot(x='shop_id', data=sales_train, ax=ax)

## Extract feature based on Categories


Function utils

In [None]:
# Attach the items to their category rows (left join on the category id).
items_categories_merged = icats_df.merge(items_df, on='item_category_id', how='left')

In [None]:
def exclude_preprositions(x):
    """Remove the stop tokens listed in ``prepositions_to_exclude`` from ``x``.

    ``x`` is split on single spaces, tokens found in the module-level
    ``prepositions_to_exclude`` list are dropped, and the remaining tokens are
    re-joined with single spaces and stripped.
    """
    kept_tokens = [token for token in x.split(' ') if token not in prepositions_to_exclude]
    return ' '.join(kept_tokens).strip()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Coarse category type: the first space-separated token of the category name.
items_categories_merged['type_of_category']=items_categories_merged['item_category_name'].apply(lambda x: x.split(' ')[0].strip())
dict_types = dict(items_categories_merged['type_of_category'].value_counts())
cat, _ = zip(*sorted(dict_types.items(),key=lambda x: x[1])[::-1][:5])
print('Most frequent types of categories : {0}'.format(cat))

num_features = 10
symbols_to_exclude = ['[',']','!','.',',','*','(',')','"',':']
prepositions_to_exclude = ['в','на','у','the','a','an','of','для']

# Normalise the item names before vectorising. regex=False is explicit because
# several of these symbols ('[', '(', '.', '*') are regex metacharacters and must
# be removed literally regardless of the pandas version's default.
for symbol in symbols_to_exclude:
    items_categories_merged['item_name'] = items_categories_merged['item_name'].str.replace(symbol, '', regex=False)
items_categories_merged['item_name'] = items_categories_merged['item_name'].str.lower()
items_categories_merged['item_name'] = items_categories_merged['item_name'].str.replace('-', ' ', regex=False)
items_categories_merged['item_name'] = items_categories_merged['item_name'].str.replace('/', ' ', regex=False)
items_categories_merged['item_name'] = items_categories_merged['item_name'].str.strip()
items_categories_merged['item_name'] = items_categories_merged['item_name'].apply(exclude_preprositions)

# TF-IDF restricted to the `num_features` most frequent terms.
vectorizer = TfidfVectorizer(max_features=num_features)
res = vectorizer.fit_transform(items_categories_merged['item_name'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
# fall back to the old name on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print('Top {0} features of tfidf : {1}'.format(num_features, feature_names))

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Reusing the frame's index keeps the column-wise concat row-aligned.
count_vect_df = pd.DataFrame(res.toarray(), columns=feature_names, index=items_categories_merged.index)
items_categories_merged = pd.concat([items_categories_merged,count_vect_df],axis=1)


In [None]:
# The raw text columns are no longer needed once the features are extracted.
items_categories_merged = items_categories_merged.drop(columns=['item_name', 'item_category_name'])

## Features based on name:

In [None]:


import re
def create_city_name(x):
    """Return the first space-separated token of shop name ``x`` as its city.

    Names containing any entry of ``not_city`` are mapped to ``'unk_city'``.
    """
    if any(marker in x for marker in not_city):
        return 'unk_city'
    first_token, *_rest = x.split(' ')
    return first_token.strip()


def create_shop_type(x):
    """Return the entry of ``type_of_shops`` that matches shop name ``x``.

    Each entry is used as a regular-expression pattern. When several entries
    match, the one listed last in ``type_of_shops`` wins; with no match the
    result is ``'unk_type'``.
    """
    # Walking the list backwards and returning the first hit gives the
    # last-listed match.
    for pattern in reversed(type_of_shops):
        if re.search(pattern, x):
            return pattern
    return 'unk_type'


# Shop names that do not start with a city.
not_city = ['Выездная Торговля','Интернет-магазин','Цифровой склад 1С-Онлайн']
# Known shop-type markers, followed by the non-city names.
type_of_shops = ['ТРЦ', 'ТЦ','ТРК','ТК','МТРЦ'] + not_city
# Derive the city and the shop type from every shop name.
shop_names = shops_df['shop_name']
shops_df['city_name'] = shop_names.map(create_city_name)
shops_df['shop_type'] = shop_names.map(create_shop_type)



In [None]:

# The shop name has been replaced by the city_name / shop_type features.
shops_df = shops_df.drop(columns='shop_name')



In [None]:
# Preview the engineered shop features (DataFrame has no `goup` method, so the
# previous call raised AttributeError and stopped a Run-All).
shops_df.head()

## Aggregate train and test data

We are going to aggregate the data.


In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so columns added to `train_sales` below are also visible through `sales_train`.
train_sales = sales_train

In [None]:
    train_sales["date"] = pd.to_datetime(train_sales["date"], format="%d.%m.%Y") # seting the column as pandas datetime
    train_sales["month"] = train_sales['date'].dt.day # extracting month
    train_sales.info()

In [None]:


# Recover the mean of the hidden target from two scores.
# NOTE(review): the two constants look like public-leaderboard RMSE values of two
# constant submissions (RMSE_c^2 = E[y^2] - 2*c*E[y] + c^2) — confirm which
# submission produced which score.
score_a, score_b = 1.41241, 1.25011
y_hat_mean = (score_a**2 - score_b**2 - 1) / -2
print('Mean of target values in public leaderboard is : {0}'.format(y_hat_mean))



In [None]:


# (test items never seen in train, distinct test items, test rows).
# The previous expression intersected the test ids with themselves, so the first
# number was always 0; the comparison has to be made against the training ids.
test_item_ids = set(test.item_id)
train_item_ids = set(sales_train.item_id)
len(test_item_ids - train_item_ids), len(test_item_ids), len(test)



In [None]:

# Monthly totals per (month, shop, item) that occur in train, then their mean.
monthly_totals = sales_train.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].sum()
mean = monthly_totals.mean()
print('Mean of target value in train data : {0}'.format(mean))

# Compare against the leaderboard-derived mean with a tolerance of 0.2.
is_aligned = np.abs(mean-y_hat_mean) < 0.2
print(
    'The mean of train and test targets is aligned!'
    if is_aligned
    else 'The mean of train and test targets is not aligned!'
)



In [None]:
from itertools import product
# For each of the 34 month blocks, build the full cartesian grid of the shops and
# items that have at least one sale in that month.
cols = ['date_block_num','shop_id','item_id']
monthly_grids = []
for block in range(34):
    month_sales = train_sales[train_sales.date_block_num == block]
    grid = product([block], month_sales.shop_id.unique(), month_sales.item_id.unique())
    monthly_grids.append(np.array(list(grid), dtype='int16'))

# Stack the monthly grids into a single frame, shrink the key dtypes and sort.
matrix = pd.DataFrame(np.vstack(monthly_grids), columns=cols)
matrix = matrix.astype({
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
})
matrix = matrix.sort_values(cols)

In [None]:
# Revenue of each sales row: unit price times the quantity sold that day.
train_sales['revenue'] = train_sales['item_price'].mul(train_sales['item_cnt_day'])

In [None]:
# Sum the daily counts into one monthly count per (month, shop, item).
group = (
    train_sales
    .groupby(['date_block_num','shop_id','item_id'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)

# Attach the monthly count to the grid; pairs without sales get 0.
matrix = matrix.merge(group, on=cols, how='left')
filled_counts = matrix['item_cnt_month'].fillna(0)
# NB: the target is clipped to [0, 20] here.
matrix['item_cnt_month'] = filled_counts.clip(0, 20).astype(np.float16)

In [None]:
# Shrink the key columns of the sales frame to compact integer dtypes.
for key_column, key_dtype in (
    ('shop_id', np.int8),
    ('item_id', np.int16),
    ('date_block_num', np.int8),
):
    train_sales[key_column] = train_sales[key_column].astype(key_dtype)

Merged test set

In [None]:
# The test set is month block 34, the month after the last training block (33).
test['date_block_num'] = np.int8(34)
# Shrink the key columns to compact integer dtypes.
for key_column, key_dtype in (('shop_id', np.int8), ('item_id', np.int16)):
    test[key_column] = test[key_column].astype(key_dtype)



In [None]:
# Append the test rows (month 34) below the training grid. `keys` is dropped: it
# listed three labels for two frames and has no effect with ignore_index=True,
# and newer pandas rejects a keys/objs length mismatch.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
# Columns present in only one of the two frames are NaN for the other one; fill
# them with 0 (item_cnt_month for the month-34 rows, ID for the training rows).
matrix.fillna(0, inplace=True)

In [None]:
matrix.head()

### Aggregate features 
We are going to aggregate the features by category, month and day.

In [None]:
# Enrich every (month, shop, item) row with the shop features and the
# item/category features.
matrix = (
    matrix
    .merge(shops_df, on=['shop_id'], how='left')
    .merge(items_categories_merged, on=['item_id'], how='left')
)

In [None]:
matrix

In [None]:
def lag_feature(df, lags, col):
    """Append lagged copies of column ``col`` to ``df``.

    For every ``lag`` in ``lags`` a column ``f"{col}_lag_{lag}"`` is added that
    holds the value ``col`` had for the same (shop_id, item_id) ``lag`` month
    blocks earlier; rows with no such earlier record get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_block_num``, ``shop_id``, ``item_id`` and ``col``.
    lags : iterable of int
        Month offsets to generate.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with one extra column per lag (left-merged, so the rows of
        ``df`` are kept).
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        lagged = base.rename(columns={col: col + '_lag_' + str(lag)})
        # Moving the month forward by `lag` makes the old value line up with the
        # row that is `lag` months later.
        lagged['date_block_num'] += lag
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
matrix = lag_feature(matrix, [1,2,3,6,12], 'item_cnt_month')

In [None]:
matrix.head()

### Encoded features using mean

In [None]:
# Mean monthly count per month block.
group = (
    matrix
    .groupby(['date_block_num'])['item_cnt_month']
    .mean()
    .rename('date_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num'], how='left')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype(np.float16)
# Keep only the lagged version; the same-month mean is derived from the target.
matrix = lag_feature(matrix, [1], 'date_avg_item_cnt')
matrix = matrix.drop(columns=['date_avg_item_cnt'])

In [None]:
# Mean monthly count per (month block, item).
group = (
    matrix
    .groupby(['date_block_num', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_item_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype(np.float16)
# Keep only the lagged versions; the same-month mean is derived from the target.
matrix = lag_feature(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix = matrix.drop(columns=['date_item_avg_item_cnt'])

In [None]:
# Mean monthly count per (month block, shop).
group = (
    matrix
    .groupby(['date_block_num', 'shop_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].astype(np.float16)
# Keep only the lagged versions; the same-month mean is derived from the target.
matrix = lag_feature(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
matrix = matrix.drop(columns=['date_shop_avg_item_cnt'])

In [None]:
# Mean monthly count per (month block, item category).
group = (
    matrix
    .groupby(['date_block_num', 'item_category_id'])['item_cnt_month']
    .mean()
    .rename('date_cat_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num','item_category_id'], how='left')
matrix['date_cat_avg_item_cnt'] = matrix['date_cat_avg_item_cnt'].astype(np.float16)
# Keep only the lagged version; the same-month mean is derived from the target.
matrix = lag_feature(matrix, [1], 'date_cat_avg_item_cnt')
matrix = matrix.drop(columns=['date_cat_avg_item_cnt'])

In [None]:
# Mean monthly count per (month block, shop, item category).
group = (
    matrix
    .groupby(['date_block_num', 'shop_id', 'item_category_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_cat_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
matrix['date_shop_cat_avg_item_cnt'] = matrix['date_shop_cat_avg_item_cnt'].astype(np.float16)
# Keep only the lagged version; the same-month mean is derived from the target.
matrix = lag_feature(matrix, [1], 'date_shop_cat_avg_item_cnt')
matrix = matrix.drop(columns=['date_shop_cat_avg_item_cnt'])

In [None]:
matrix['month'] = matrix['date_block_num'] % 12

In [None]:
def fill_na(df):
    """Replace NaN with 0 in the lagged count columns of ``df``.

    A column is touched only when its name contains both ``'_lag_'`` and
    ``'item_cnt'`` and it actually holds missing values. The column is
    reassigned on ``df`` rather than filled with ``inplace=True`` on the
    ``df[col]`` selection: that chained in-place form is not guaranteed to
    write back to the frame and is a no-op under pandas copy-on-write.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in df.columns:
        if '_lag_' in col and 'item_cnt' in col and df[col].isnull().any():
            df[col] = df[col].fillna(0)
    return df

matrix = fill_na(matrix)

In [None]:
matrix

In [None]:
matrix.info()

### Feature preprocessing

In [None]:
# Replace the string-valued features with integer codes and record the
# cardinality of every categorical column.
to_encode = ['city_name','shop_type','type_of_category']
nunique_cat = {}
for column in to_encode:
    matrix[column] = matrix[column].factorize()[0]
    nunique_cat[column] = matrix[column].nunique()
for column in ('shop_id', 'item_id', 'item_category_id'):
    nunique_cat[column] = matrix[column].nunique()
print('Factorized all the columns!')

In [None]:
def downcast_dtypes(df):
    """Shrink float64 columns to float32 and int32/int64 columns to int16 where safe.

    An integer column is cast to int16 only when all of its values fit the
    int16 range. Otherwise it is cast to int32 if the values fit that range,
    and left unchanged if they do not. An unconditional int16 cast would wrap
    larger values around silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for c in float_cols:
        df[c] = df[c].astype(np.float32)

    for c in int_cols:
        if df[c].empty:
            # No values to overflow; use the compact dtype.
            df[c] = df[c].astype(np.int16)
            continue
        c_min, c_max = df[c].min(), df[c].max()
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[c] = df[c].astype(candidate)
                break
    return df

dataset = downcast_dtypes(matrix)

In [None]:
dataset.to_pickle('final_dataset.pkl')

# Machine learning part

In [None]:
import pandas as pd
import lightgbm as lgb
from lightgbm import plot_importance

In [None]:
dataset = pd.read_pickle('final_dataset.pkl')

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
# Columns selected for modelling: the keys, the target (`item_cnt_month`), the
# category/city codes, ten text-token columns, the lag features and `month`.
# NOTE(review): `shop_type` is engineered and factorized earlier but is not listed
# here — confirm the omission is intentional.
# NOTE(review): the ten token names ('bd' ... 'цифровая') are hard-coded and
# presumably mirror the TF-IDF vocabulary printed earlier — verify they match the
# vectorizer output, otherwise the column selection raises KeyError.
columnsNames = ['date_block_num',
'shop_id',
'item_id',
'item_cnt_month',
'item_category_id',
'type_of_category',
'city_name',
'bd',
'cd',
'dvd',
'jewel',
'mp3',
'pc',
'версия',
'регион',
'русская',
'цифровая',
'item_cnt_month_lag_1',
'item_cnt_month_lag_2',
'item_cnt_month_lag_3',
'item_cnt_month_lag_6',
'item_cnt_month_lag_12',
'date_avg_item_cnt_lag_1',
'date_item_avg_item_cnt_lag_1',
'date_item_avg_item_cnt_lag_2',
'date_item_avg_item_cnt_lag_3',
'date_item_avg_item_cnt_lag_6',
'date_item_avg_item_cnt_lag_12',
'date_shop_avg_item_cnt_lag_1',
'date_shop_avg_item_cnt_lag_2',
'date_shop_avg_item_cnt_lag_3',
'date_shop_avg_item_cnt_lag_6',
'date_shop_avg_item_cnt_lag_12',
'date_cat_avg_item_cnt_lag_1',
'date_shop_cat_avg_item_cnt_lag_1',
'month']


In [None]:
data = dataset[columnsNames]

In [None]:
# Time-based split: blocks 0-32 train, block 33 validation, block 34 test.
is_train = data.date_block_num < 33
is_valid = data.date_block_num == 33
is_test = data.date_block_num == 34

X_train = data[is_train].drop(columns=['item_cnt_month'])
Y_train = data.loc[is_train, 'item_cnt_month']
X_valid = data[is_valid].drop(columns=['item_cnt_month'])
Y_valid = data.loc[is_valid, 'item_cnt_month']
X_test = data[is_test].drop(columns=['item_cnt_month'])



In [None]:
from xgboost import XGBRegressor
from xgboost import plot_importance

In [None]:
# Gradient-boosted regressor. `eval_metric` and `early_stopping_rounds` are set on
# the estimator: passing them to fit() was deprecated in xgboost 1.6 and removed
# in 2.0, where it raises TypeError. `learning_rate` / `random_state` are the
# scikit-learn-API names for `eta` / `seed`.
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    learning_rate=0.3,
    random_state=42,
    eval_metric="rmse",
    early_stopping_rounds=10)

# Early stopping monitors the last entry of eval_set, i.e. the validation month.
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True)

In [None]:
# Predictions are clipped to [0, 20], the same range the training target was
# clipped to.
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

# The submission ID must be the `ID` column that came with the test rows.
# X_test.index is only the row position inside the stacked train+test matrix, so
# it does not start at 0 and does not match the test IDs.
submission = pd.DataFrame({
    "ID": dataset.loc[X_test.index, 'ID'].astype('int32').values,
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission.csv', index=False)