In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import nltk

In [None]:
import os
import re
import pickle
from tqdm import tqdm
from itertools import product

In [None]:
from sklearn.linear_model import(LinearRegression, SGDRegressor)

In [None]:
import xgboost as xgb

In [None]:
#!python3.7 -m pip install --upgrade pip
!pip install pymystem3

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.feature_extraction.text import (
    CountVectorizer, HashingVectorizer, TfidfVectorizer)

In [None]:
from scipy.stats.stats import pearsonr

In [None]:
os.listdir('../input/competitive-data-science-predict-future-sales')

In [None]:
root = '../input/competitive-data-science-predict-future-sales'

In [None]:
df_train = pd.read_csv(f'{root}/sales_train.csv')
df_items = pd.read_csv(f'{root}/items.csv')
df_cat = pd.read_csv(f'{root}/item_categories.csv')
df_shops = pd.read_csv(f'{root}/shops.csv')
df_test = pd.read_csv(f'{root}/test.csv')
sample_submit = pd.read_csv(f'{root}/sample_submission.csv')

In [None]:
df_test.head()

### Understanding `sales_train`

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
# checking and removing NaN values - **No Nan values found**
sns.heatmap(df_train.isna())

#### cleaning impractical values

In [None]:
# item_price can't be negative or zer0
print(df_train['item_price'][df_train['item_price']<=0])
# removing negative priced items
df_train = df_train[df_train['item_price']>0]

In [None]:
# item_cnt_day can't be negative or fractional - negative counts are assumed to be
# refunds, so they are set to zero and every value is rounded to a whole number
neg_items_sold = int((df_train['item_cnt_day'] < 0).sum())
print('Total no. negative values:', neg_items_sold)
print('Percentage of negative values', neg_items_sold*100/df_train.shape[0])
# Vectorised clip/round replaces two row-wise .apply passes. assign() returns a new
# frame, so nothing is written into the filtered frame produced by the previous cell.
df_train = df_train.assign(
    item_cnt_day=df_train['item_cnt_day'].clip(lower=0).round())

In [None]:
df_train['item_cnt_day'].nunique()

In [None]:
print('Min val:', df_train['item_price'].min())
print('Max val:', df_train['item_price'].max())

In [None]:
print('Min val:', df_train['item_cnt_day'].min())
print('Max val:', df_train['item_cnt_day'].max())

#### Removing outliers

In [None]:
fig, axs = plt.subplots(ncols=3, figsize=(20, 6))
sns.distplot(df_train['item_price'], hist=False, ax=axs[0])
sns.boxplot(df_train['item_price'], ax=axs[1])
sns.distplot(df_train['item_price'][df_train['item_price']<50000], ax=axs[2])

In [None]:
# removing all values greater than 100,000 from `item_price` column
df_train = df_train[df_train['item_price']<100000]

## EDA

#### understanding date-based data

In [None]:
# Date-indexed copy of the sales frame, used by the resampling cells below.
# NOTE(review): the raw `date` strings are assumed to be day-first (dd.mm.yyyy), as in
# the Kaggle "Predict Future Sales" sales_train.csv - confirm against the data. An
# explicit format stops days and months from being swapped for ambiguous dates and
# makes parsing fail loudly if the assumption is wrong.
df_train_datewise = (
    df_train
    .assign(date=pd.to_datetime(df_train['date'], format='%d.%m.%Y'))
    .set_index('date')
)

In [None]:
# Total units sold per item. Only `item_cnt_day` is aggregated, so the chart shows unit
# counts alone: no summed prices on the same axis and no attempt to sum the string
# `date` column.
top_items = (df_train.groupby('item_id')['item_cnt_day']
             .sum()
             .sort_values(ascending=False)
             .head(20))
top_items.plot(kind='bar')
plt.title('top 20 items sold (sum)')
plt.ylabel('#no of items sold')

In [None]:
# Mean units per sales record for each shop. Only `item_cnt_day` is aggregated: taking
# the mean of the string `date` column is not meaningful, and item prices do not belong
# on an axis labelled as number of items sold.
top_shops = (df_train.groupby('shop_id')['item_cnt_day']
             .mean()
             .sort_values(ascending=False)
             .head(20))
top_shops.plot(kind='bar')
plt.title('top 20 shops with the highest sale (mean)')
plt.ylabel('#no of items sold')

In [None]:
monthly_sum = df_train_datewise.resample('M').sum()
monthly_sum['month'] = monthly_sum.index.month

monthly_mean = df_train_datewise.resample('M').mean()
monthly_mean['month'] = monthly_mean.index.month

In [None]:
f, axes = plt.subplots(2, 1, figsize=(22, 10), sharex=True)
sns.lineplot(x="month", y="item_cnt_day", data=monthly_mean, ax=axes[0]).set_title("Monthly mean")
sns.lineplot(x="month", y="item_cnt_day", data=monthly_sum, ax=axes[1]).set_title("Monthly sum")
plt.show()

### Understanding `items`

In [None]:
df_items.head()

In [None]:
df_items['item_name']

In [None]:
# creating lemmatizer and stopwords list
mystem = Mystem()
russian_sw = stopwords.words("russian")

In [None]:
# preprocessing russian
def pre_pro_text(text):
    """Normalise a text string for the name-based features.

    The text is lower-cased and passed to the module-level ``mystem`` lemmatizer;
    stop words (module-level ``russian_sw``), bare spaces and punctuation tokens are
    dropped, and the remaining tokens are joined with single spaces. Stand-alone
    single characters and the symbols ``* & ^ % @ # $ ( ) +`` are then removed and
    runs of whitespace are collapsed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    # lemmatizing
    tokens = mystem.lemmatize(text.lower())

    # removing sw and punctuations
    # `punctuation` is a str, so `in` is a substring test: whitespace-only tokens
    # (which strip to "") and punctuation tokens are both filtered out here.
    tokens = [token for token in tokens
              if token not in russian_sw
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)

    # removing single letters (any lone character between spaces / string edges)
    # and a fixed set of symbols wherever they occur
    pattern = r"(((?<=^)|(?<= )).((?=$)|(?= )))|[*&^%@#$\(\)+]"
    # raw string: "\s" in a plain literal is an invalid escape sequence
    text = re.sub(r"\s+", " ", re.sub(pattern, '', text).strip())
    return text

In [None]:
df_items['item_name'] = df_items['item_name'].apply(pre_pro_text)

In [None]:
df_items['item_name'].head(10)

#### Generating text-based features
- Count Vectorizer
- Tf-Idf
- Hash Vectorizer


In [None]:
def create_text_features(df, col='item_name'):
    """Fit three text vectorizers on ``df[col]`` and return their transformed output.

    Parameters
    ----------
    df : DataFrame-like
        Object indexable by ``col`` that yields the documents to vectorize.
    col : str, default 'item_name'
        Name of the text column.

    Returns
    -------
    dict
        Maps 'cvec' (character-level counts, 1-2 grams within word boundaries),
        'hvec' (hashing vectorizer with default settings) and 'tfidf' (TF-IDF over
        1-2 grams) to the result of ``fit_transform`` on the column.
    """
    char_counts = CountVectorizer(analyzer='char_wb', ngram_range=(1, 2))
    hashed = HashingVectorizer()
    word_tfidf = TfidfVectorizer(ngram_range=(1, 2))

    named_models = (
        ('cvec', char_counts),
        ('hvec', hashed),
        ('tfidf', word_tfidf),
    )
    return {label: model.fit_transform(df[col]) for label, model in named_models}

### Understanding `shops`

In [None]:
df_shops.head(10)

In [None]:
df_shops['city'] = df_shops['shop_name'].apply(lambda x: x.split()[0]).apply(pre_pro_text)

We will try to find out whether different `shop_name` values are in fact the same shop with an extra word or two.

In [None]:
def sentence_corr_heat_map(documents, threshold=0.7):
    """Plot a heat map of pairwise correlations between bag-of-words vectors.

    Each document becomes a binary vector over the vocabulary of all documents
    (1 where the word occurs). The Pearson correlation between every pair of vectors
    is computed, and entries that are not above ``threshold`` are shown as 0.

    Parameters
    ----------
    documents : sequence of str
        Whitespace-separated texts to compare.
    threshold : float, default 0.7
        Correlations not strictly greater than this value are zeroed in the plot.

    Returns
    -------
    None
        The function only draws the figure.
    """
    documents = list(documents)

    # One tokenisation rule (str.split) for both the vocabulary and the rows; sorting
    # makes the column order deterministic between runs.
    vocabulary = sorted({word for sentence in documents for word in sentence.split()})
    word_to_num = {w: i for i, w in enumerate(vocabulary)}

    # Size the matrix from the argument itself rather than from a notebook global.
    sim_mat = np.zeros((len(documents), len(vocabulary)))
    for i, sentence in enumerate(documents):
        for word in sentence.split():
            sim_mat[i, word_to_num[word]] = 1

    # Row-wise Pearson correlation in one call instead of an n x n Python loop;
    # atleast_2d keeps a single-document input plottable.
    corr_mat = np.atleast_2d(np.corrcoef(sim_mat))

    plt.figure(figsize=(10, 8))
    sns.heatmap((corr_mat > threshold) * corr_mat, linewidths=0.1)

In [None]:
shop_names_t = df_shops['shop_name'].apply(pre_pro_text).values
sentence_corr_heat_map(shop_names_t)

We will map the following shop names to a single `shop_id`, as they are essentially the same shops.

In [None]:
df_shops['shop_name'][10:12]

In [None]:
df_shops['shop_name'][23:25]

In [None]:
df_shops['shop_name'].loc[[0,57]]

In [None]:
df_shops['shop_name'].loc[[1,58]]

In [None]:
df_shops['shop_name'].loc[[39, 40]]

In [None]:
# Duplicate shops: key = shop_id being retired, value = shop_id that replaces it.
# The same mapping is applied to the training and the test data.
duplicate_shops = {
    #10    Жуковский ул. Чкалова 39м?
    #11    Жуковский ул. Чкалова 39м²
    10: 11,
    #23    Москва ТК "Буденовский" (пав.А2)
    #24    Москва ТК "Буденовский" (пав.К7)
    # (previously mapped to 25, which is not the shop listed as its duplicate)
    23: 24,
    #0     !Якутск Орджоникидзе, 56 фран
    #57          Якутск Орджоникидзе, 56
    0: 57,
    # shops 1 and 58 - the remaining pair inspected in the cells above
    # NOTE(review): this pair was displayed but never mapped before; confirm intent
    1: 58,
    #39              РостовНаДону ТРК "Мегацентр Горизонт"
    #40    РостовНаДону ТРК "Мегацентр Горизонт" Островной
    39: 40,
}

for old_id, new_id in duplicate_shops.items():
    df_train.loc[df_train.shop_id == old_id, 'shop_id'] = new_id
    df_test.loc[df_test.shop_id == old_id, 'shop_id'] = new_id

In [None]:
# fixing shop name
df_shops.loc[df_shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

In [None]:
df_shops['city'].value_counts().plot(kind='bar', figsize=(15, 7))
plt.title('Shops per City')
plt.xlabel('Cities')
plt.ylabel('Number of Shops')

In [None]:
# encoding cities
enc_city = LabelEncoder()
df_shops['city_encoded'] = enc_city.fit_transform(df_shops['city'])

In [None]:
df_shops['shop_name'] = df_shops['shop_name'].apply(pre_pro_text)

In [None]:
pre_pro_text(df_shops['shop_name'][1])

In [None]:
df_shops = df_shops.drop('city', axis=1)

### Understanding `Categories`

In [None]:
# Category names are presumably "<main> - <sub>" (verify against item_categories.csv).
# Split on the FIRST hyphen only, so a hyphen inside the sub-category text stays part
# of the sub-category instead of cutting it short.
df_cat['item_cats'] = df_cat['item_category_name'].str.split('-', n=1)
df_cat['main_cat'] = df_cat['item_cats'].str[0].str.strip()
# Names without a hyphen reuse the main category; it is stripped here as well so that
# both columns are normalised the same way.
df_cat['sub_cat'] = df_cat['item_cats'].str[-1].str.strip()

In [None]:
df_cat.head()

In [None]:
# encoding main category
enc_mn_cat = LabelEncoder()
df_cat['main_cat_enc'] = enc_mn_cat.fit_transform(df_cat['main_cat'])

# encoding sub category
enc_sb_cat = LabelEncoder()
df_cat['sub_cat_enc'] = enc_sb_cat.fit_transform(df_cat['sub_cat'])


In [None]:
df_cat = df_cat[['item_category_id', 'main_cat_enc',
                 'sub_cat_enc']]

### Understanding `test`

In [None]:
df_test.info()

In [None]:
df_test.head()

In [None]:
df_test['date_block_num'] = 34
df_test['item_cnt_month'] = 0

In [None]:
df_test = df_test[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month']]

In [None]:
len(set(df_test['item_id']) - set(df_test['item_id']).intersection(df_train['item_id'])), len(set(df_test['item_id'])), len(df_test)

In [None]:
df_train.drop(['date', 'item_price'], axis=1, inplace=True)

In [None]:
%%time

# grouping the data month-wise
matrix = []
cols = ['date_block_num', 'shop_id', 'item_id']

for i in tqdm(range(max(df_train.date_block_num)+1)):
    sales = df_train[df_train.date_block_num==i]
    matrix.append(np.array(list(product([i],
        df_train[df_train.date_block_num==i].shop_id.unique(),
        df_train[df_train.date_block_num==i].item_id.unique())),
                           dtype='int16'))
    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)

matrix.sort_values(cols, inplace=True)


In [None]:
%%time

group = df_train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day':['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                               .fillna(0)
                               .clip(0, 20)
                               .astype(np.float16))

In [None]:
# Stack the test rows under the training grid. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent. ignore_index gives the combined
# frame a clean 0..n-1 index instead of repeating the test frame's labels.
matrix = pd.concat([matrix, df_test], ignore_index=True)

In [None]:
matrix.item_cnt_month.value_counts().plot(kind='bar')

In [None]:
mat_val_cnt = matrix.item_cnt_month.value_counts()
mat_val_cnt.plot.pie(figsize=(8,8),
    labels=['' for _ in range(matrix.item_cnt_month.nunique())])
plt.legend(mat_val_cnt.index.values, loc='center left',
           bbox_to_anchor=(1.0, 0.5))

In [None]:
sns.boxplot(x=matrix.item_cnt_month)

### adding encoded features

In [None]:
matrix = pd.merge(matrix, df_shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, df_items, on=['item_id'], how='left')
matrix = pd.merge(matrix, df_cat, on=['item_category_id'], how='left')

matrix['city_encoded'] = matrix['city_encoded'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['main_cat_enc'] = matrix['main_cat_enc'].astype(np.int8)
matrix['sub_cat_enc'] = matrix['sub_cat_enc'].astype(np.int8)

In [None]:
### helper functions
def get_month_year(num, init_year=2013):
    """Convert a running month index into a calendar ``(month, year)`` pair.

    Parameters
    ----------
    num : int
        Zero-based month index; 0 corresponds to January of ``init_year``.
    init_year : int, default 2013
        Calendar year that month index 0 falls in.

    Returns
    -------
    tuple of (int, int)
        ``(month, year)`` with month in 1..12.
    """
    # Integer divmod avoids the float division of int(num / 12).
    years_elapsed, month_index = divmod(int(num), 12)
    return month_index + 1, years_elapsed + init_year

def downcast_dtypes(df):
    """Shrink 64-bit numeric columns of ``df`` and return the same frame.

    Columns whose dtype is float64 are converted to float16 and columns whose dtype
    is int64 are converted to int16. Other columns are left untouched. The frame is
    modified in place; the return value is the object that was passed in.
    """
    narrow_for = {'float64': np.float16, 'int64': np.int16}

    for wide_name, narrow_type in narrow_for.items():
        selected = [name for name in df.columns if df[name].dtype == wide_name]
        df[selected] = df[selected].astype(narrow_type)

    return df

In [None]:
matrix = matrix.drop(['shop_name','item_name'], axis=1)

In [None]:
matrix = downcast_dtypes(matrix)

In [None]:
matrix.dtypes

### generating lag feats

In [None]:
def generate_lag(df, col_list, lags, col):
    """Add lagged copies of ``col`` to ``df`` and return the widened frame.

    For every lag ``l`` in ``lags`` the key columns ``col_list`` plus ``col`` are
    copied, the copy's 'date_block_num' is moved forward by ``l``, and the copy is
    left-merged back on ``col_list``. The value seen ``l`` months earlier therefore
    lands in a new column named ``<col>_lag_<l>``; rows without a match get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_list``, ``col`` and a 'date_block_num' column.
    col_list : list of str
        Merge keys; 'date_block_num' is expected to be one of them.
    lags : iterable of int
        Month offsets to generate.
    col : str
        Column to lag.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra column per lag.
    """
    source_cols = col_list + [col]
    for lag in tqdm(lags):
        lagged_name = col + '_lag_' + str(lag)
        shifted = df[source_cols].copy()
        shifted.columns = col_list + [lagged_name]
        # move the observation forward so that it lines up with the later month
        shifted['date_block_num'] += lag
        df = pd.merge(df, shifted, how='left', on=col_list)
    return df

In [None]:
matrix = generate_lag(matrix, ['date_block_num', 'shop_id', 'item_id'],
                      [1,2,3,4,5,6,12], 'item_cnt_month')

In [None]:
matrix.head()

In [None]:
%%time
new_col = 'item_month_mean'
group = matrix.groupby(['date_block_num', 'item_id'])['item_cnt_month']\
            .mean().rename(new_col).reset_index()
matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id'],
                 how='left')
matrix = generate_lag(matrix, ['date_block_num', 'shop_id', 'item_id'],
                     [1,2,3,6,12], new_col)
matrix.drop([new_col],axis=1, inplace=True)

In [None]:
%%time
new_col = 'shop_month_mean'
group = matrix.groupby(['date_block_num', 'shop_id'])['item_cnt_month']\
            .mean().rename(new_col).reset_index()
matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id'],
                 how='left')
matrix = generate_lag(matrix, ['date_block_num', 'shop_id', 'item_id'],
                     [1,2,3,6,12], new_col)
matrix.drop([new_col],axis=1, inplace=True)

In [None]:
matrix.head()

In [None]:
%%time
new_col = 'shop_category_month_mean'
group = matrix.groupby(['date_block_num', 'shop_id', 'item_category_id'])['item_cnt_month']\
            .mean().rename(new_col).reset_index()
matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id','item_category_id'],
                 how='left')
matrix = generate_lag(matrix, ['date_block_num', 'shop_id', 'item_id'],
                     [1,2,12], new_col)
matrix.drop([new_col],axis=1, inplace=True)

In [None]:
%%time
new_col = 'main_category_month_mean'
group = matrix.groupby(['date_block_num', 'main_cat_enc'])['item_cnt_month']\
            .mean().rename(new_col).reset_index()
matrix = pd.merge(matrix, group, on=['date_block_num', 'main_cat_enc'],
                 how='left')
matrix = generate_lag(matrix, ['date_block_num', 'shop_id', 'item_id'],
                     [1], new_col)
matrix.drop([new_col],axis=1, inplace=True)

In [None]:
%%time
new_col = 'sub_category_month_mean'
group = matrix.groupby(['date_block_num', 'sub_cat_enc'])['item_cnt_month']\
            .mean().rename(new_col).reset_index()
matrix = pd.merge(matrix, group, on=['date_block_num', 'sub_cat_enc'],
                 how='left')
matrix = generate_lag(matrix, ['date_block_num', 'shop_id', 'item_id'],
                     [1], new_col)
matrix.drop([new_col],axis=1, inplace=True)

In [None]:
matrix.head()

In [None]:
# Calendar month (1-12) and year from the running month index, with block 0 treated as
# January 2013 (the same convention as get_month_year). Vectorised integer arithmetic
# replaces two row-wise .apply passes over the whole matrix; the int64 cast keeps the
# resulting column dtype the same as before.
month_index = matrix['date_block_num'].astype('int64')
matrix['month'] = month_index % 12 + 1
matrix['year'] = month_index // 12 + 2013

In [None]:
# number of public holidays in russia every month
num_holidays_dict = {
    1: 6,
    2: 3,
    3: 2,
    4: 8,
    5: 3,
    6: 3,
    7: 2,
    8: 8,
    9: 4,
    10: 8,
    11: 5,
    12: 4,
}

In [None]:
matrix['public_holidays'] = matrix['month'].map(num_holidays_dict)

In [None]:
# stock exchange trading volume(in Trillions)
moex = {
    12: 659, 13: 640, 14: 1231,
    15: 881, 16: 764, 17: 663,
    18: 743, 19: 627, 20: 692,
    21: 736, 22: 680, 23: 1092,
    24: 657, 25: 863, 26: 720,
    27: 819, 28: 574, 29: 568,
    30: 633, 31: 658, 32: 611,
    33: 770, 34: 723,
}

In [None]:
matrix['moex_val'] = matrix['date_block_num'].map(moex)

In [None]:
matrix = matrix[matrix.date_block_num > 11]

In [None]:
# filling nan values with 0
# Lag columns are NaN wherever the merge in generate_lag found no earlier row.
# fillna with a per-column mapping returns a new frame; this replaces the chained
# `matrix[c].fillna(..., inplace=True)`, which operates on a temporary column object
# and is not guaranteed to update `matrix` (it is a no-op under copy-on-write).
lag_cols = [c for c in matrix.columns if '_lag_' in c]
if lag_cols:
    matrix = matrix.fillna({c: 0 for c in lag_cols})

In [None]:
matrix.isnull().any()

In [None]:
matrix.date_block_num

### Training

In [None]:
def xgtrain():
    """Fit an XGBoost regressor on the module-level ``matrix`` and return it.

    Rows with ``date_block_num < 33`` are the training set and rows with
    ``date_block_num == 33`` are the validation set. Every column except
    'item_cnt_month' is used as a feature. RMSE is reported on both sets, and
    training stops early after 50 rounds without improvement.
    """
    def features_and_target(rows):
        # split a slice of the matrix into (feature array, target array)
        return (rows.drop(['item_cnt_month'], axis=1).values,
                rows['item_cnt_month'].values)

    train_X, train_y = features_and_target(matrix[matrix.date_block_num<33])
    valid_X, valid_y = features_and_target(matrix[matrix.date_block_num==33])

    model = xgb.XGBRegressor(n_estimators=5000,
                             learning_rate=0.01,
                             max_depth=10,
                             subsample=0.5,
                             colsample_bytree=0.5)

    fitted = model.fit(train_X,
                       train_y,
                       eval_metric='rmse',
                       eval_set=[(train_X, train_y),
                                 (valid_X, valid_y)],
                       verbose=True,
                       early_stopping_rounds=50)
    return fitted

In [None]:
%%time
reg_ = xgtrain()

In [None]:
file_name = "xgb_reg.pkl"

# save
# the context manager closes (and flushes) the file even if pickling fails
with open(file_name, "wb") as model_file:
    pickle.dump(reg_, model_file)

In [None]:
# load
#reg_ = pickle.load(open(file_name, "rb"))
predictions = reg_.predict(matrix[matrix.date_block_num==34]\
                           .drop(['item_cnt_month'], axis=1).values)

In [None]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 11.7,8.27

# Pair every feature column with its importance from the fitted model and sort the
# pairs so that the most important feature ends up at the top of the horizontal chart.
cols = matrix.drop('item_cnt_month', axis = 1).columns
feat_importances = sorted(zip(cols, reg_.feature_importances_),
                          key=lambda pair: pair[1])
plt.barh([c for c, _ in feat_importances],
         [f_i for _, f_i in feat_importances])
plt.xlabel('importance')
plt.title('Feature importances')
plt.show()

In [None]:
# The training target was clipped to [0, 20] when the monthly matrix was built, so the
# predictions are clipped to the same range before going into the submission.
sample_submit['item_cnt_month'] = np.clip(predictions, 0, 20)

In [None]:
sample_submit.to_csv('sales_first_base', index=False)