In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from itertools import product
import keras
from keras import Model, Sequential,regularizers
from keras.layers import Dense, Flatten, Embedding, LeakyReLU, Input, Dropout
from keras.metrics import RootMeanSquaredError
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras import backend as K
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler,LabelEncoder, MinMaxScaler
from IPython.display import clear_output

def downcast_dtypes(df):
    '''
        Shrinks the numeric columns of the dataframe in place to save memory:

                `float64`/`float32` columns become `float16`
                `int64`/`int32`     columns become `int16`

        A column is only downcast when all of its values fit into the target
        type. Otherwise it keeps its dtype, because a forced cast would silently
        wrap integers around or turn large floats into `inf`.

        Returns the same (mutated) dataframe.
    '''
    int16_info = np.iinfo(np.int16)
    float16_max = float(np.finfo(np.float16).max)

    def _fits_int16(col):
        # An empty column has nothing that could overflow
        return col.empty or (col.min() >= int16_info.min and col.max() <= int16_info.max)

    def _fits_float16(col):
        # NaN and +-inf are representable in float16; only finite magnitudes can overflow
        finite = col[np.isfinite(col)]
        return finite.empty or finite.abs().max() <= float16_max

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype in ("float64", "float32") and _fits_float16(df[c])]
    int_cols = [c for c in df if df[c].dtype in ("int64", "int32") and _fits_int16(df[c])]

    # Downcast
    for c in float_cols:
        df[c] = df[c].astype(np.float16)
    for c in int_cols:
        df[c] = df[c].astype(np.int16)
    return df

# Function by Vadim Sokolov
# https://www.kaggle.com/vadimsokolov/sales-predictions-final-project-in-course-hse/data#Prepare-and-feature-engineering-data
def lag_feature(data, lags, column):
    """Append lagged copies of `column` to `data`.

    For every `lag` in `lags` a column named `<column>_lag_<lag>` is added. It
    holds the value `column` had for the same shop_id/item_id pair `lag`
    date blocks earlier (NaN where no such row exists), stored as float32.
    The extended frame is returned; rows keep the order of `data`.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    source = data[keys + [column]]
    for lag in lags:
        lag_name = column + '_lag_' + str(lag)
        # Moving the source rows `lag` months forward lines them up with the
        # rows that should receive them as a lagged value.
        shifted = source.rename(columns={column: lag_name}).assign(
            date_block_num=lambda frame: frame['date_block_num'] + lag
        )
        data = data.merge(shifted, on=keys, how='left')
        data[lag_name] = data[lag_name].astype('float32')
    return data

#Split a list into parts of roughly equal length
def split(a, n):
    """Lazily cut `a` into `n` consecutive slices whose lengths differ by at most one.

    The first `len(a) % n` slices are one element longer than the rest.
    Returns a generator over the slices.
    """
    base, extra = divmod(len(a), n)
    # Start offset of every part, plus the final end offset
    bounds = [part * base + min(part, extra) for part in range(n + 1)]
    return (a[lo:hi] for lo, hi in zip(bounds, bounds[1:]))

def assemble_df_for_nn(df_for_nn):
    """Standardize the numeric feature columns and split off the prediction rows.

    Every numeric column except the target and the id-like columns listed in
    `not_to_scale` is replaced in place by its StandardScaler transform, then
    the frame is passed through `downcast_dtypes`.

    Returns a tuple `(df_for_nn, df_for_prediction)`, where `df_for_prediction`
    holds the rows with `date_block_num == 34` plus an `ID` column copied from
    the module-level `test` frame.
    NOTE(review): the `ID` assignment aligns on the row index, so it presumably
    expects the month-34 rows to be in the same order as `test` - confirm.
    """
    #Normalize numerical features
    scaler = StandardScaler()
    #scaler = MinMaxScaler(feature_range=(-10,10))
    cols_to_scale = df_for_nn.select_dtypes(include=[np.number]).columns.to_list()
    not_to_scale = ['item_cnt_month','shop_id','item_id', 'date_block_num','item_category_id']
    cols_to_scale = [c for c in cols_to_scale if c not in not_to_scale]
    # Scale one column at a time to avoid an out-of-memory exception; the scaler
    # is refitted for every column, so each one is standardized independently.
    for col in cols_to_scale:
        # fit_transform returns an (n, 1) array; flatten it before storing it as a column
        df_for_nn[col] = scaler.fit_transform(df_for_nn[col].values.reshape(-1, 1)).ravel()

    print('downcasting dtypes:')
    df_for_nn = downcast_dtypes(df_for_nn)
    print('Separate prediction frame:')
    # drop=True discards the old index directly instead of creating and then dropping an 'index' column
    df_for_prediction = df_for_nn.loc[df_for_nn.date_block_num==34].reset_index(drop=True)
    df_for_prediction['ID'] = test['ID']
    return df_for_nn, df_for_prediction

def assemble_df_for_nn_tanh(df_for_nn):
    """Squash the numeric feature columns with a tanh transform and split off the prediction rows.

    Every numeric column that is not listed in `not_to_scale` is replaced in
    place by `0.5 * (tanh(0.01 * (x - mean) / (std + 0.0001)) + 1)`, which maps
    the values into the open interval (0, 1).

    Returns a tuple `(df_for_nn, df_for_prediction)`, where `df_for_prediction`
    holds the rows with `date_block_num == 34` plus an `ID` column copied from
    the module-level `test` frame (aligned on the row index).
    """
    #Normalize numerical features
    cols_to_scale = df_for_nn.select_dtypes(include=[np.number]).columns.to_list()
    # Target, ids and 0/1 flag columns are left untouched.
    # Each name needs its own trailing comma: adjacent string literals are
    # concatenated by Python, which would silently remove both names from the list.
    not_to_scale = ['item_cnt_month','item_cnt_month_lag_1','item_cnt_month_lag_2','item_cnt_month_lag_3',
                    'shop_id','item_id', 'date_block_num','item_category_id',
                    'item_new_in_store','item_new_in_store_lag_1','item_new_in_store_lag_2','item_new_in_store_lag_3',
                    'opening_month','opening_month_lag_1','opening_month_lag_2','opening_month_lag_3']
    cols_to_scale = [c for c in cols_to_scale if c not in not_to_scale]
    print('Scaling columns')
    for column_name in cols_to_scale:
        # using tanh estimators for scaling
        column = df_for_nn[column_name].astype(float)
        std = column.std()
        mean = column.mean()
        # the small constant keeps the division defined for constant columns (std == 0)
        tanh_estimators = 0.5* ( np.tanh( 0.01*((column-mean)/(std+0.0001)) ) + 1)
        df_for_nn[column_name] = tanh_estimators

    #print('downcasting dtypes')
    #df_for_nn = downcast_dtypes(df_for_nn)
    print('Separate prediction frame')
    df_for_prediction = df_for_nn.loc[df_for_nn.date_block_num==34].reset_index(drop=True)
    df_for_prediction['ID'] = test['ID']
    return df_for_nn, df_for_prediction

def assemble_df_tanh_standard(df_for_nn):
    """Apply the tanh transform followed by standardization, then split off the prediction rows.

    Every numeric column that is not listed in `not_to_scale` is first mapped
    through `0.5 * (tanh(0.01 * (x - mean) / (std + 0.0001)) + 1)` and the
    result is then standardized with a StandardScaler refitted per column.
    Afterwards the frame is passed through `downcast_dtypes`.

    Returns a tuple `(df_for_nn, df_for_prediction)`, where `df_for_prediction`
    holds the rows with `date_block_num == 34` plus an `ID` column copied from
    the module-level `test` frame (aligned on the row index).
    """
    #Normalize numerical features
    scaler = StandardScaler()
    cols_to_scale = df_for_nn.select_dtypes(include=[np.number]).columns.to_list()
    # Target, ids and 0/1 flag columns are left untouched.
    # Each name needs its own trailing comma: adjacent string literals are
    # concatenated by Python, which would silently remove both names from the list.
    not_to_scale = ['item_cnt_month','item_cnt_month_lag_1','item_cnt_month_lag_2','item_cnt_month_lag_3',
                    'shop_id','item_id', 'date_block_num','item_category_id',
                    'item_new_in_store','item_new_in_store_lag_1','item_new_in_store_lag_2','item_new_in_store_lag_3',
                    'opening_month','opening_month_lag_1','opening_month_lag_2','opening_month_lag_3']
    cols_to_scale = [c for c in cols_to_scale if c not in not_to_scale]
    print('Scaling columns')
    for column_name in cols_to_scale:
        # using tanh estimators for scaling
        column = df_for_nn[column_name].astype(float)
        std = column.std()
        mean = column.mean()
        # the small constant keeps the division defined for constant columns (std == 0)
        tanh_estimators = 0.5* ( np.tanh( 0.01*((column-mean)/(std+0.0001)) ) + 1)
        # fit_transform returns an (n, 1) array; flatten it before storing it as a column
        df_for_nn[column_name] = scaler.fit_transform(tanh_estimators.values.reshape(-1, 1)).ravel()

    print('downcasting dtypes')
    df_for_nn = downcast_dtypes(df_for_nn)
    print('Separate prediction frame')
    df_for_prediction = df_for_nn.loc[df_for_nn.date_block_num==34].reset_index(drop=True)
    df_for_prediction['ID'] = test['ID']
    return df_for_nn, df_for_prediction

def root_mean_squared_error(y_true, y_pred):
    """Root of the mean squared difference between `y_pred` and `y_true`, built from Keras backend ops."""
    # Imported inside the function, so the function does not rely on a module-level `K`.
    from keras import backend as K
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Directory holding the competition CSV files
data_path = '/kaggle/input/competitive-data-science-predict-future-sales'
# Second input directory; not used by any of the reads below
path = '/kaggle/input/combined-version-eda-fe-model-training'
# Load the raw competition tables; `test` is also read as a global by the assemble_* helpers above
items = pd.read_csv('%s/items.csv' % data_path)
item_categories  = pd.read_csv('%s/item_categories.csv' % data_path)
shops = pd.read_csv('%s/shops.csv' % data_path)
transactions  = pd.read_csv('%s/sales_train.csv' % data_path)
test  = pd.read_csv('%s/test.csv' % data_path)

## Exploratory Data Analysis

### Plotting data as graphs

In [None]:
# Monthly sales of shops over time (one line per shop)
fig2 = plt.figure()
ax2 = fig2.add_subplot(111)

# numeric_only=True keeps the aggregation to the numeric columns; without it recent
# pandas versions also try to "sum" the string `date` column.
monthly_shop_sales = transactions.groupby(['date_block_num','shop_id']).sum(numeric_only=True)
for _,df in monthly_shop_sales.reset_index().groupby('shop_id'):
    ax2.plot(df['date_block_num'], df['item_cnt_day'])
ax2.set(title='Monthly sales per shop', xlabel='date_block_num', ylabel='items sold per month');

In [None]:
# Mean item price per item category over time (one line per category)
fig3, ax3 = plt.subplots()
transactions_with_items = transactions.merge(items, on='item_id')
mean_price_of_itemcat_over_time = (
    transactions_with_items
    .groupby(['date_block_num','item_category_id'])['item_price']
    .mean()
)
for _,df in mean_price_of_itemcat_over_time.reset_index().groupby('item_category_id'):
    # prices are capped at 30000 so that a few extreme values do not flatten the plot
    ax3.plot(df['date_block_num'], df['item_price'].clip(lower=0, upper=30000))

In [None]:
# Rank the item categories by how much their monthly mean price varies (largest std first)
mean_price_of_itemcat_over_time.groupby(level='item_category_id').std().sort_values(ascending=False)

In [None]:
# Same plot as above, restricted to item category 16
mean_price_of_itemcat16_over_time = (
    transactions_with_items[transactions_with_items['item_category_id'] == 16]
    .groupby(['date_block_num','item_category_id'])['item_price']
    .mean()
)
fig3, ax3 = plt.subplots()
for _,df in mean_price_of_itemcat16_over_time.reset_index().groupby('item_category_id'):
    ax3.plot(df['date_block_num'], df['item_price'].clip(lower=0, upper=30000))

#### Take-aways
- Significant sales boost in December (as expected)
- Shops and items are not continuous over all 33 months (gaps in graphs)
- Item cat 16 either got mislabelled prices in the first two months, or the item price changed drastically (e.g. by adding some very expensive items to that category)

In [None]:
# The merged transactions/items frame was only needed for the price plots above; drop the name to free memory
del transactions_with_items

### Histograms of various columns

In [None]:
# All columns of the transactions frame that DataFrame.hist can plot, one histogram each
transactions.hist()

In [None]:
# Sales: distribution of the daily item counts
transactions['item_cnt_day'].hist(bins=20)

In [None]:
# Same distribution with the counts clipped to [0, 10], so the bulk of the data becomes visible
transactions['item_cnt_day'].clip(lower=0, upper=10).hist(bins=10)

In [None]:
# Price: distribution of the item prices
transactions['item_price'].hist(bins=30)

In [None]:
# Same distribution with the prices clipped to [0, 7000]
transactions['item_price'].clip(lower=0, upper=7000).hist(bins=30)

In [None]:
# Total sales of shops per month (one value per shop and month)
monthly_shop_sales['item_cnt_day'].hist(bins=60)

In [None]:
# Total sales of shops over the whole period.
# Only item_cnt_day is needed, so select it before summing instead of aggregating
# every column (including the string `date` column) and discarding the rest.
transactions.groupby('shop_id')['item_cnt_day'].sum().hist(bins=60)

#### Take-Away
- Item_cnt_month and item_price have some extreme outliers that need to be clipped

In [None]:
# Note: apparently this one item with very high sales is a shopping bag
# see: https://www.google.com/search?q=%D0%A4%D0%B8%D1%80%D0%BC%D0%B5%D0%BD%D0%BD%D1%8B%D0%B9+%D0%BF%D0%B0%D0%BA%D0%B5%D1%82+%D0%BC%D0%B0%D0%B9%D0%BA%D0%B0+1%D0%A1+%D0%98%D0%BD%D1%82%D0%B5%D1%80%D0%B5%D1%81+%D0%B1%D0%B5%D0%BB%D1%8B%D0%B9&client=firefox-b-d&source=lnms&tbm=isch&sa=X&ved=2ahUKEwitvdeVhd7yAhV5RvEDHaDSAqUQ_AUoAXoECAEQAw&biw=1920&bih=927
# Total number of units sold of item 20949, followed by its name
print(transactions.loc[transactions['item_id']==20949, 'item_cnt_day'].sum())
print(items.loc[items['item_id']==20949, 'item_name'])

### Quick look at the test set 

In [None]:
# Number of distinct items / shops in the training transactions ...
print(transactions['item_id'].nunique())
print(transactions['shop_id'].nunique())
print('====================================')
# ... and in the test set
print(test['item_id'].nunique())
print(test['shop_id'].nunique())
# Largest number of distinct values any remaining column takes within one shop/item pair
test.groupby(['shop_id','item_id']).nunique().max()

### Take-Aways
- Not all 60 shops and all 21807 items are present in the test set
- Every combination of these 5100 items at these 42 shops will be asked for in the test set
- Not all shops will sell every item -> high percentage of zeros (that are not present in the training data)

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw transactions
transactions.info()

## Preprocessing of items and shops as suggested by Gordon Henderson

In [None]:
# Code written by Gordon Henderson
# https://www.kaggle.com/gordotron85/future-sales-xgboost-top-3#Cleaning-Shop-Data
# Three shop ids are rewritten to another id in both the training transactions and the
# test set (presumably the same physical shop listed under two ids - confirm against shops.csv).
# Yakutsk, Ordzhonikidze 56
transactions.loc[transactions.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping centre
transactions.loc[transactions.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39m²
transactions.loc[transactions.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

# Remove the space inside this city name so that the first word of every shop name is the whole city
shops.loc[ shops.shop_name == 'Сергиев Посад ТЦ "7Я"',"shop_name" ] = 'СергиевПосад ТЦ "7Я"'
# First word of the shop name -> city, second word -> shop category
shops["city"] = shops.shop_name.str.split(" ").map( lambda x: x[0] )
shops["category"] = shops.shop_name.str.split(" ").map( lambda x: x[1] )
# Strip the stray leading "!" from this city name
shops.loc[shops.city == "!Якутск", "city"] = "Якутск"

# Keep only shop categories that occur at least 5 times; everything else becomes "other"
category = []
for cat in shops.category.unique():
    if len(shops[shops.category == cat]) >= 5:
        category.append(cat)
shops.category = shops.category.apply( lambda x: x if (x in category) else "other" )

# Label-encode category and city, then keep only the id and the two encoded columns
shops["shop_category"] = LabelEncoder().fit_transform( shops.category )
shops["shop_city"] = LabelEncoder().fit_transform( shops.city )
shops = shops[["shop_id", "shop_category", "shop_city"]]

#Cleaning item category data
# The first word of the category name is used as the coarse item type
item_categories["type_code"] = item_categories.item_category_name.apply( lambda x: x.split(" ")[0] ).astype(str)
# Fold the "Игровые" (gaming) and "Аксессуары" (accessories) types into "Игры" (games).
# The merged label has to be written to `type_code`: writing it to a separate `category`
# column had no effect, because that column is discarded by the column selection below.
item_categories.loc[ (item_categories.type_code == "Игровые")| (item_categories.type_code == "Аксессуары"), "type_code" ] = "Игры"

# Keep only types that occur at least 5 times; everything else becomes "etc"
category = []
for cat in item_categories.type_code.unique():
    if len(item_categories[item_categories.type_code == cat]) >= 5:
        category.append( cat )
item_categories.type_code = item_categories.type_code.apply(lambda x: x if (x in category) else "etc")

item_categories.type_code = LabelEncoder().fit_transform(item_categories.type_code)
# The subtype is the text after the first "-" of the category name (or the whole name if there is no "-")
item_categories["split"] = item_categories.item_category_name.apply(lambda x: x.split("-"))
item_categories["subtype"] = item_categories.split.apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_categories["subtype_code"] = LabelEncoder().fit_transform( item_categories["subtype"] )
item_categories = item_categories[["item_category_id", "subtype_code", "type_code"]]

# Cleaning item data
import re
def name_correction(x):
    """Normalize an item name.

    The name is lower-cased, cut off at the first '[' and at the first '(',
    every run of characters other than latin/cyrillic letters and digits is
    collapsed into a single space, and surrounding whitespace is stripped.
    """
    # keep only the text in front of the first square bracket and the first round bracket
    head = x.lower().split('[', 1)[0].split('(', 1)[0]
    # remove special characters
    head = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    # replace double spaces with single spaces, then trim both ends
    return head.replace('  ', ' ').strip()

# split item names by first bracket:
#   name1 = text in front of the first "(",
#   name2 = text after the first "[",
#   name3 = text after the first "("
# The split parts are picked with `.str[i]` (NaN when the bracket is missing) and `n` is
# passed by keyword: unpacking the `.str` accessor and the positional `n` argument are
# not supported by recent pandas releases.
items["name1"] = items.item_name.str.split("(", n=1).str[0]
items["name2"] = items.item_name.str.split("[", n=1).str[1]
items["name3"] = items.item_name.str.split("(", n=1).str[1]

# replace special characters and turn to lower case
# (regex=True is spelled out because the pattern is a regular expression and the default
#  of `Series.str.replace` is no longer regex matching in recent pandas releases)
items["name2"] = items.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items["name3"] = items.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()

# fill nulls with '0' (name2/name3 are NaN for items whose name has no bracket)
items = items.fillna('0')

# Normalize the item name itself with name_correction
items["item_name"] = items["item_name"].apply(lambda x: name_correction(x))

# return all characters except the last if name 2 is not "0" - the closing bracket
# (the regex replacement above has already turned that closing bracket into a space)
items.name2 = items.name2.apply( lambda x: x[:-1] if x !="0" else "0")

# Clean item type: first word of name2, or its first 8 characters when that word is "xbox"
items["type"] = items.name2.apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0] )
# Unify the different spellings of the Xbox 360 platform
items.loc[(items.type == "x360") | (items.type == "xbox360") | (items.type == "xbox 360") ,"type"] = "xbox 360"
# An empty type is relabelled as "mac"
items.loc[ items.type == "", "type"] = "mac"
items.type = items.type.apply( lambda x: x.replace(" ", "") )
# NOTE(review): the literals in the next two statements look alike but are presumably spelled with
# look-alike (cyrillic) letters so that mixed-alphabet spellings are mapped to one label - do not retype them.
items.loc[ (items.type == 'pc' )| (items.type == 'pс') | (items.type == "pc"), "type" ] = "pc"
items.loc[ items.type == 'рs3' , "type"] = "ps3"

# Count the items per type and collect the types that have fewer than 40 items
group_sum = items.groupby(["type"]).agg({"item_id": "count"})
group_sum = group_sum.reset_index()
drop_cols = []
for cat in group_sum.type.unique():
    if group_sum.loc[(group_sum.type == cat), "item_id"].values[0] <40:
        drop_cols.append(cat)
# NOTE(review): drop_cols holds `type` values, but the lookup is done on `name2`, so only name2
# values that are exactly equal to a rare type become "other" - confirm this is intended.
items.name2 = items.name2.apply( lambda x: "other" if (x in drop_cols) else x )
items = items.drop(["type"], axis = 1)

# Label-encode the two bracket texts
items.name2 = LabelEncoder().fit_transform(items.name2)
items.name3 = LabelEncoder().fit_transform(items.name3)

# The text columns are no longer needed once the encoded columns exist
items.drop(["item_name", "name1"],axis = 1, inplace= True)

# Attach subtype_code and type_code of the item's category
items = items.merge(item_categories,on='item_category_id')

items.head()

## Feature extraction, encoding of categorical features, preprocessing and scaling

In [None]:
load_data_frame = True
if not load_data_frame:
    # include month 34 so it will have the aggregated statistics such as lag features
    # assign() builds a new frame, so the month column is not written onto a slice of `test`
    test_subset = test[['shop_id','item_id']].assign(date_block_num=34)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    transactions = pd.concat([transactions, test_subset], ignore_index=True)
    del test_subset

    # aggregate the monthly transaction statistic for each item in each shop;
    # since it's summed for the entire month, item_cnt_day is now actually item_cnt_month
    monthly_trans = (
        transactions
        .groupby(['date_block_num','shop_id','item_id'])[["item_cnt_day"]]
        .sum()
        .reset_index()
        .rename(columns={'item_cnt_day':'item_cnt_month'})
    )

    print(monthly_trans.shape)

    # Fill missing item/shop transactions with item_cnt_month = 0
    # Code adapted from user'c14103'
    # https://www.kaggle.com/c/competitive-data-science-predict-future-sales/discussion/57123
    # Instead of looking through ALL possible shop/item pairs I include only the ones present in the test set.
    # This makes the model more vulnerable to changes in the test set and worse at generalizing, but this is necessary due to very limited RAM
    extend_frame_with_0_sales = True
    if extend_frame_with_0_sales:
        # The shop and item lists do not depend on the month, so they are built once
        # instead of once per loop iteration.
        shop = test['shop_id'].drop_duplicates()
        item = test['item_id'].drop_duplicates()
        grid = []
        for month in monthly_trans['date_block_num'].drop_duplicates():
            # every (shop, item) combination of the test set for this month
            grid.append(np.asarray(list(product(shop, item, [month]))))
        cols = ['shop_id','item_id','date_block_num']
        grid = pd.DataFrame(np.vstack(grid), columns = cols, dtype=np.int32)
        # rows without a matching transaction get item_cnt_month = 0
        monthly_trans = pd.merge(grid,monthly_trans, on = cols, how = 'left').fillna(0)
        del grid
        del item
        del shop

    print('filled missing:')
    print(monthly_trans.shape)

    #include mean item price for a given shop/item pair per month
    # (months without a transaction for the pair get price 0 through the fillna)
    monthly_trans = monthly_trans.merge(
        transactions.groupby(['date_block_num','shop_id','item_id'])[["item_price"]].mean().reset_index(),
        how='left',
        on=['date_block_num','shop_id','item_id'],
    ).fillna(0)
    print('item_price:')
    print(monthly_trans.shape)
    print(monthly_trans.item_price.max())


    # clip extreme outliers in the prices at the 95% quantile
    q_hi  = monthly_trans['item_price'].quantile(0.95)
    monthly_trans['item_price'] = monthly_trans['item_price'].clip(lower=0, upper=q_hi)

    # clip item_sales close to the range used in the competition
    monthly_trans['item_cnt_month'] = monthly_trans['item_cnt_month'].clip(lower=0, upper=20)

    # Extend the unknown item prices for month 0 with the first known price
    # first_price: one row per shop/item pair, taken from the earliest month in which the pair
    # appears in `transactions`; missing values become 0 and the month is relabelled as 0.
    first_price = transactions.sort_values('date_block_num', ascending=True).drop_duplicates(['shop_id','item_id'])[['shop_id','item_id','item_price','date_block_num']].fillna(0)
    first_price['date_block_num']=0
    # NOTE(review): this selects every month-0 row, not only the rows with a zero price, so after
    # the merges below month-0 prices that were already known are replaced by the first recorded
    # price as well (and that price has not gone through the quantile clip) - confirm intended.
    unknown_prices_month_0 = monthly_trans.loc[((monthly_trans.date_block_num==0))][['shop_id','item_id','item_price']]
    # suffixes: the month-0 price becomes item_price_DROP, the first recorded price keeps the name item_price
    unknown_prices_month_0 = unknown_prices_month_0.merge(first_price,on=['shop_id','item_id'],how='outer',suffixes=['_DROP',''])
    # Same suffix trick on the full frame: the old price column becomes item_price_DROP, the new
    # item_price is only filled for month 0 and NaN everywhere else ...
    monthly_trans = monthly_trans.merge(unknown_prices_month_0[['shop_id','item_id','item_price','date_block_num']],on=['shop_id','item_id','date_block_num'],suffixes=['_DROP',''],how='left') 
    # ... so the NaN rows fall back to the old price before the helper column is removed
    monthly_trans['item_price']=monthly_trans['item_price'].fillna(monthly_trans['item_price_DROP'])
    monthly_trans = monthly_trans.drop(['item_price_DROP'],axis=1)

    # Extend the item prices for months 0-34 with the last known item price at that shop
    for i in monthly_trans.date_block_num.unique():
        # most recent non-zero price of every shop/item pair up to and including month i
        last_known_price = monthly_trans.loc[((monthly_trans.date_block_num<=(i))&(monthly_trans.item_price!=0))].sort_values('date_block_num', ascending=False).drop_duplicates(['shop_id','item_id'])[['shop_id','item_id','item_price','date_block_num']]
        # rows of month i that still have price 0
        unknown_prices = monthly_trans.loc[((monthly_trans.date_block_num==(i))&(monthly_trans.item_price==0))][['shop_id','item_id','item_price']]
        unknown_prices = unknown_prices.merge(last_known_price,on=['shop_id','item_id'],how='outer',suffixes=['_DROP',''])
        unknown_prices = unknown_prices.drop('item_price_DROP',axis=1)
        # relabel the carried-over prices as belonging to month i so they merge onto that month only
        unknown_prices['date_block_num'] = (i)
        monthly_trans = monthly_trans.merge(unknown_prices[['shop_id','item_id','item_price','date_block_num']],on=['shop_id','item_id','date_block_num'],suffixes=['_DROP',''],how='left')
        # rows without a carried-over price keep their previous value
        monthly_trans['item_price']=monthly_trans['item_price'].fillna(monthly_trans['item_price_DROP'])
        monthly_trans = monthly_trans.drop(['item_price_DROP'],axis=1)
    # release the helper frames
    del unknown_prices
    del last_known_price
    del unknown_prices_month_0

    # calculate monthly revenue for item at shop, then keep only the lag values (revenue will be wrong for month 34)
    monthly_trans["revenue"] = monthly_trans["item_cnt_month"].mul(monthly_trans["item_price"])
    monthly_trans = (
        lag_feature(monthly_trans, [1,2,3], 'revenue')
        .fillna(0)
        .drop(columns='revenue')
    )

    print('extended_price:')
    print(monthly_trans.shape)

    # include more data about the items (name codes, category, type/subtype),
    # then the additional shop info (shop category and city)
    monthly_trans = monthly_trans.merge(items, on='item_id').merge(shops, on='shop_id')

    print('item data')
    print(monthly_trans.shape)

    # rolling average of item sales at shop over windows of 2, 3 and 4 months,
    # shifted by one month so that a row only sees the months before it
    for window in (2, 3, 4):
        item_shop_group = monthly_trans.groupby(['shop_id','item_id'])
        rolling = item_shop_group[['date_block_num','item_cnt_month']].rolling(window, min_periods=1,on='date_block_num').mean()
        # The shift has to happen inside each shop/item group (the first two index levels of
        # the rolling result). Shifting the whole column at once would hand the last value of
        # one shop/item pair to the first month of the next pair.
        rolling['rolling_average_' + str(window)] = rolling.groupby(level=[0, 1])['item_cnt_month'].shift(1, fill_value=0)
        monthly_trans = monthly_trans.merge(rolling.drop('item_cnt_month',axis=1),on=['shop_id','item_id','date_block_num'])

    del rolling

    print('rolling_average:')
    print(monthly_trans.shape)


    # Month 0-11
    monthly_trans['month_id'] = monthly_trans['date_block_num'].mod(12)
    # Year 0-2
    monthly_trans['year_id'] = monthly_trans['date_block_num'] // 12
    # How many days in that month (index = month_id; February is always counted with 28 days)
    days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
    monthly_trans["days"] = monthly_trans["month_id"].map(days).astype(np.int8)

    # Compare the monthly sales total (over all shops and items) to the yearly average
    # Months whose total is 0 are treated as unknown (NaN)
    encodings = monthly_trans.groupby(['month_id','year_id'])['item_cnt_month'].sum().rename('monthly_total_sales_all_shops_all_items').replace(0, np.nan)

    # Make a prediction of the total sales for November and December of year 2 (month_id 10 and 11)
    # based on the previous years' trends: the average October->November growth of years 0 and 1
    # is applied to October of year 2, and likewise November->December.
    oct_to_nov_scale_year_0 = encodings.loc[(10, 0)]/encodings.loc[(9, 0)]
    oct_to_nov_scale_year_1 = encodings.loc[(10, 1)]/encodings.loc[(9, 1)]
    nov_year_2_estimate = encodings.loc[(9, 2)]* 0.5*(oct_to_nov_scale_year_0+oct_to_nov_scale_year_1)
    # Write the estimate with a single .loc call; the chained form `encodings.loc[10][2] = ...`
    # may assign to a temporary object and leave `encodings` unchanged.
    encodings.loc[(10, 2)] = nov_year_2_estimate
    dec_year_2_estimate = nov_year_2_estimate* 0.5*(encodings.loc[(11, 0)]/encodings.loc[(10, 0)]+encodings.loc[(11, 1)]/encodings.loc[(10, 1)])

    # Average monthly sales per year; the total for year 2 also includes the estimated values of November and December
    encodings_year = (monthly_trans.groupby('year_id')['item_cnt_month'].sum()/12).rename('yearly_total_sales_all_shops_all_items')
    encodings_year.loc[2] += ((nov_year_2_estimate+dec_year_2_estimate)/12)

    monthly_trans = monthly_trans.merge(encodings_year,on='year_id')
    monthly_trans = monthly_trans.merge(encodings,on=['month_id','year_id']).sort_values(['date_block_num','shop_id','item_id'])

    # By how much do sales of a given month differ compared to the yearly average
    monthly_trans['dif_of_month_to_yearly_average'] = monthly_trans['monthly_total_sales_all_shops_all_items']-monthly_trans['yearly_total_sales_all_shops_all_items']
    monthly_trans = monthly_trans.drop(['monthly_total_sales_all_shops_all_items','yearly_total_sales_all_shops_all_items'],axis=1)

    del encodings_year
    del encodings

    print('Monthly difference to yearly average:')
    print(monthly_trans.shape)

    # If a shop sells items for the first time it probably just opened that month (this will be wrong for month 0, so I really should drop the data from month 0)
    encodings = monthly_trans.loc[monthly_trans.item_cnt_month>0].groupby('shop_id')['date_block_num'].min()
    # A shop without any sale maps to NaN. NaN never equals a month number, so such a shop is not
    # flagged at all (filling the NaN with 0 first would flag it as opening in month 0).
    monthly_trans['opening_month'] = (monthly_trans['shop_id'].map(encodings) == monthly_trans['date_block_num']).astype(np.int8)

    # If a shop sells an item for the first time it is assumed that the item is new in this store
    encodings = monthly_trans.loc[monthly_trans.item_cnt_month>0].groupby(['shop_id','item_id'])['date_block_num'].min().rename('item_new_in_store')
    monthly_trans =  monthly_trans.merge(encodings.reset_index(),on=['shop_id','item_id'],how='left')
    # Compare before filling NaN: shop/item pairs that never sold have no first-sale month and
    # must not be marked as "new in store" in month 0.
    monthly_trans['item_new_in_store'] = (monthly_trans['item_new_in_store'] == monthly_trans['date_block_num']).astype(np.int8)
    # Remaining NaN values in any other column are still replaced by 0, as before
    monthly_trans = monthly_trans.fillna(0)

    # calculates the monthly mean sales for each item.
    encodings = (
        monthly_trans.groupby(['item_id','date_block_num'])['item_cnt_month'].mean()
        .rename('monthly_mean_sales_of_item')
        .reset_index()
    )
    monthly_trans = monthly_trans.merge(encodings, on=['date_block_num','item_id'])

    # calculates the monthly mean sales for each shop
    encodings = (
        monthly_trans.groupby(['shop_id','date_block_num'])['item_cnt_month'].mean()
        .rename('monthly_mean_sales_at_shop')
        .reset_index()
    )
    monthly_trans = monthly_trans.merge(encodings, on=['date_block_num','shop_id'])

    # calculates the monthly total sales for each item.
    encodings = (
        monthly_trans.groupby(['item_id','date_block_num'])['item_cnt_month'].sum()
        .rename('monthly_total_sales_of_item')
        .reset_index()
    )
    monthly_trans = monthly_trans.merge(encodings, on=['date_block_num','item_id'])

    # calculates the monthly total sales for each shop
    encodings = (
        monthly_trans.groupby(['shop_id','date_block_num'])['item_cnt_month'].sum()
        .rename('monthly_total_sales_at_shop')
        .reset_index()
    )
    monthly_trans = monthly_trans.merge(encodings, on=['date_block_num','shop_id'])

    del encodings


    #Create lag features (1, 2 and 3 months back) for the different sales attributes
    #monthly_trans = lag_feature(monthly_trans, [1,2,3], 'opening_month').fillna(0)
    for lagged_column in (
        'item_new_in_store',
        'item_cnt_month',
        'item_price',
        'monthly_mean_sales_of_item',
        'monthly_mean_sales_at_shop',
        'monthly_total_sales_of_item',
        'monthly_total_sales_at_shop',
    ):
        # rows without a value that many months back get 0
        monthly_trans = lag_feature(monthly_trans, [1,2,3], lagged_column).fillna(0)

    print('lag feature:')
    print(monthly_trans.shape)

    # shop/item sales trend: change of the pair's sales between lag 1 and lag 2 / lag 3
    monthly_trans['sale_trend_1_month'] = monthly_trans['item_cnt_month_lag_1'].sub(monthly_trans['item_cnt_month_lag_2'])
    monthly_trans['sale_trend_2_month'] = monthly_trans['item_cnt_month_lag_1'].sub(monthly_trans['item_cnt_month_lag_3'])

    # shop price trend: change of the price compared to the previous month
    monthly_trans['price_trend_1_month'] = monthly_trans['item_price'].sub(monthly_trans['item_price_lag_1'])

    # Item sales trend
    monthly_trans['item_trend_1_month'] = monthly_trans['monthly_total_sales_of_item_lag_1'].sub(monthly_trans['monthly_total_sales_of_item_lag_2'])
    monthly_trans['item_trend_2_month'] = monthly_trans['monthly_total_sales_of_item_lag_1'].sub(monthly_trans['monthly_total_sales_of_item_lag_3'])

    # Shop sales trend
    monthly_trans['shop_trend_1_month'] = monthly_trans['monthly_total_sales_at_shop_lag_1'].sub(monthly_trans['monthly_total_sales_at_shop_lag_2'])
    monthly_trans['shop_trend_2_month'] = monthly_trans['monthly_total_sales_at_shop_lag_1'].sub(monthly_trans['monthly_total_sales_at_shop_lag_3'])


    # Lag trends
    for trend_column in (
        'sale_trend_1_month',
        'sale_trend_2_month',
        'price_trend_1_month',
        'item_trend_1_month',
        'item_trend_2_month',
        'shop_trend_1_month',
        'shop_trend_2_month',
    ):
        monthly_trans = lag_feature(monthly_trans, [1,2,3], trend_column).fillna(0)

    # Columns for current month have to be dropped, since all these values are unknown for month 34 (except for item_cnt_month, this is the y needed for training/evaluation)
    # Lag features are not as important now since we have trends instead (but keep most item_cnt_month_lag features, since this is what we are trying to predict)
    # As long as we are not out of memory I'll keep most of the lag features for now
    cols_to_drop = [
        #'item_price_lag_1','item_price_lag_2','item_price_lag_3'
        'monthly_mean_sales_of_item','monthly_mean_sales_at_shop','monthly_total_sales_of_item','monthly_total_sales_at_shop',
        'monthly_total_sales_of_item_lag_1','monthly_total_sales_of_item_lag_2','monthly_total_sales_of_item_lag_3',
        'monthly_total_sales_at_shop_lag_1','monthly_total_sales_at_shop_lag_2','monthly_total_sales_at_shop_lag_3'
    ]
    monthly_trans = monthly_trans.drop(columns=cols_to_drop)

    # the raw tables are not needed any more
    del transactions, shops, items

    # Use frequency encoding for the item id, the item category id and the shop id:
    # each id is replaced by the share of rows that carry it
    for key in ('item_id', 'item_category_id', 'shop_id'):
        monthly_trans['freq_encoded_' + key] = monthly_trans[key].map(monthly_trans.groupby(key).size()/len(monthly_trans))

    print('frequency encodings done')

    # Use mean encoding for several categories: each id is replaced by the mean item_cnt_month of its rows
    for key in ('item_id', 'shop_id'):
        mean_encodings = monthly_trans.groupby(key)['item_cnt_month'].mean().to_dict()
        monthly_trans['mean_encoded_' + key] = monthly_trans[key].map(mean_encodings)

    # Mean sales of the month before (the first month has no predecessor and gets 0)
    mean_encodings = monthly_trans.groupby('date_block_num')['item_cnt_month'].mean()
    monthly_trans['mean_encoded_prev_month'] = monthly_trans['date_block_num'].map(
        mean_encodings.shift().fillna(0).to_dict()
    )

    mean_encodings = monthly_trans.groupby('item_category_id')['item_cnt_month'].mean().to_dict()
    monthly_trans['mean_encoded_item_category_id'] = monthly_trans['item_category_id'].map(mean_encodings)

    # Lag ONLY mean_encoded_prev_month here, that's the only one that depends on date_block_month
    monthly_trans = lag_feature(monthly_trans, [1,2,3], 'mean_encoded_prev_month').fillna(0)

    #Potentially irrelevant features
    #######################################################################################
    # Mean encoding (mean item_cnt_month per value) of the remaining categorical columns.
    # mean_encoded_item_category_id is not recomputed here: it already exists from the
    # mean-encoding step above and a second pass would produce the identical column.
    for key in ('name2', 'name3', 'type_code', 'subtype_code', 'shop_city', 'shop_category'):
        mean_encodings = monthly_trans.groupby(key)['item_cnt_month'].mean().to_dict()
        monthly_trans['mean_encoded_' + key] = monthly_trans[key].map(mean_encodings)
    #######################################################################################
    del mean_encodings

    print('mean encodings done')

    #Encodings for item_at_shop
    #############################
    # Important: These seem to have a negative impact on the validation. These values, especially mean and var, are very important features for the training prediction,
    # but the regressor focusses way too much on them and doesn't learn more general features.
    # Removing them improves the prediction result
    #############################
    #mean_encodings = monthly_trans.groupby(['item_id','shop_id'])['item_cnt_month'].mean().rename('mean_encoded_item_at_shop',inplace=True)
    #monthly_trans =  monthly_trans.merge(mean_encodings,on=['item_id','shop_id'])
    #mean_encodings = monthly_trans.groupby(['item_id','shop_id'])['item_cnt_month'].median().rename('median_encoded_item_at_shop',inplace=True)
    #monthly_trans =  monthly_trans.merge(mean_encodings,on=['item_id','shop_id'])
    #mean_encodings = monthly_trans.groupby(['item_id','shop_id'])['item_cnt_month'].var().rename('var_encoded_item_at_shop',inplace=True)
    #monthly_trans =  monthly_trans.merge(mean_encodings,on=['item_id','shop_id'])
    #mean_encodings = monthly_trans.groupby(['item_id','shop_id'])['item_cnt_month'].std().rename('std_encoded_item_at_shop',inplace=True)
    #monthly_trans =  monthly_trans.merge(mean_encodings,on=['item_id','shop_id'])

    # Don't lag these features, they are not month dependent
    
    #del mean_encodings
    #print('encodings for item_at_shop done')

    #Variance encoding for the same categories
    variance_encoded_item_id = monthly_trans.groupby(['item_id'])['item_cnt_month'].var().to_dict()
    monthly_trans['variance_encoded_item_id'] =  monthly_trans['item_id'].map(variance_encoded_item_id).fillna(0)
    del variance_encoded_item_id

    variance_encoded_shop_id = monthly_trans.groupby(['shop_id'])['item_cnt_month'].var().to_dict()
    monthly_trans['variance_encoded_shop_id'] =  monthly_trans['shop_id'].map(variance_encoded_shop_id).fillna(0)
    del variance_encoded_shop_id

    var_encoded_city_cat = monthly_trans.groupby(['shop_city'])['item_cnt_month'].var().to_dict()
    monthly_trans['variance_encoded_city'] =  monthly_trans['shop_city'].map(var_encoded_city_cat)
    del var_encoded_city_cat

    var_encoded_shopcat_cat = monthly_trans.groupby(['shop_category'])['item_cnt_month'].var().to_dict()
    monthly_trans['variance_encoded_shop_category'] =  monthly_trans['shop_category'].map(var_encoded_shopcat_cat)
    del var_encoded_shopcat_cat

    #Drop the original (not encoded) categorical features
    monthly_trans = monthly_trans.drop(["shop_category", "shop_city","item_category_id", "subtype_code", "type_code",'name2','name3'],axis=1)

    print('variance encodings done')
    
    # Remove the first three months from the dataframe (since lag features and trends will be wrong for them)
    monthly_trans = monthly_trans.loc[monthly_trans.date_block_num>2]


    #df_for_nn, df_for_predictions = assemble_df_for_nn_tanh(monthly_trans)
    #df_for_nn, df_for_predictions = assemble_df_for_nn(monthly_trans)
    df_for_nn, df_for_predictions = assemble_df_tanh_standard(monthly_trans)
    df_for_nn.to_pickle('df_for_nn.pkl')
    df_for_predictions.to_pickle('df_for_predictions.pkl')

    print(monthly_trans.shape)
    monthly_trans.head()
else:
    df_for_nn = pd.read_pickle(path + "/df_for_nn.pkl")
    df_for_predictions = pd.read_pickle(path + "/df_for_predictions.pkl")
    #df_for_nn = pd.read_pickle("df_for_nn.pkl")
    #df_for_predictions = pd.read_pickle("df_for_predictions.pkl")

    #df_for_nn = df_for_nn.drop(['item_id','mean_encoded_item_id','variance_encoded_item_id'],axis=1)
    #df_for_predictions = df_for_predictions.drop(['mean_encoded_item_id','variance_encoded_item_id'],axis=1)
    print(list(set(df_for_predictions.columns)-set(df_for_nn.columns)))

In [None]:
#Sanity checks - no nans, no +- inf
#[monthly_trans[column].isna().max() for column in monthly_trans.columns]
#[monthly_trans[column].max() for column in monthly_trans.select_dtypes(include=np.number).columns]
#[monthly_trans[column].min() for column in monthly_trans.select_dtypes(include=np.number).columns]

In [None]:
# Show the first rows transposed, so every column of the frame is listed as a row
df_for_nn.head().transpose()

## Neural network approach

In [None]:
# Set load_model_bool to True to skip training; the prediction cell then loads a saved checkpoint.
load_model_bool = False
if not load_model_bool:
    # Setting up a train-test split according to date_block_num
    # (train on everything before month 33, validate on 33, test on 34)
    df_for_nn_train = df_for_nn.loc[(df_for_nn['date_block_num']<33)]
    df_for_nn_test = df_for_nn.loc[df_for_nn['date_block_num']==33]

    df_for_nn_train_x = df_for_nn_train.drop('item_cnt_month', axis=1)
    df_for_nn_train_y = df_for_nn_train['item_cnt_month']
    df_for_nn_test_x = df_for_nn_test.drop('item_cnt_month', axis=1)
    df_for_nn_test_y = df_for_nn_test['item_cnt_month']

    # The unsplit slices are no longer needed once features and target are separated
    del df_for_nn_train
    del df_for_nn_test

    # Keras is fed plain numpy arrays
    train_x = df_for_nn_train_x.values
    train_y = df_for_nn_train_y.values
    test_x = df_for_nn_test_x.values
    test_y = df_for_nn_test_y.values

    # Include callbacks for model checkpoints and early stopping.
    # Both watch the validation RMSE; the checkpoint keeps only the best epoch on disk.
    best_model_name = 'best_model_foldno_' + str(0)+'.h5'
    es = EarlyStopping(monitor='val_root_mean_squared_error', mode='min', patience=100,min_delta=0.001, verbose=1)
    mc = ModelCheckpoint(best_model_name, monitor='val_root_mean_squared_error', mode='min', save_best_only=True, verbose=1)
    cb_list = [es, mc]

    # Create a multilayer perceptron neural network
    model = Sequential()
    # `shape` must be a tuple (one entry per feature axis), not a bare integer
    model.add(Input(shape=(train_x.shape[1],)))

    model.add(Dense(140, activation=None,
                    #kernel_regularizer=regularizers.l1_l2(l1=0.000001, l2=0.000001),
                    #bias_regularizer=regularizers.l2(0.000001),
                    #activity_regularizer=regularizers.l2(0.000001)
                   ))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dropout(0.1))

    model.add(Dense(70, activation=None,
                    #kernel_regularizer=regularizers.l1_l2(l1=0.000001, l2=0.000001),
                    #bias_regularizer=regularizers.l2(0.000001),
                    #activity_regularizer=regularizers.l2(0.000001)
                   ))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.1))

    # Single linear output unit for the regression target
    model.add(Dense(1, activation=None))
    model.compile(loss=root_mean_squared_error, optimizer=keras.optimizers.Adam(learning_rate=0.001), metrics=[RootMeanSquaredError()])

    # Train the model with high batchsize and high learning rate
    print('Starting training')
    history = model.fit(x=train_x,y=train_y,batch_size=32000,validation_data=(test_x,test_y), epochs=2000, callbacks=cb_list, verbose = 0)
    score=model.evaluate(test_x,test_y)
    # This saves the model as of the LAST epoch; the best epoch is in `best_model_name`
    model.save('nn_model')

    # summarize history for root_mean_squared_error
    # (the first 3 epochs are skipped in both plots)
    plt.close()
    plt.plot(history.history['root_mean_squared_error'][3:])
    plt.plot(history.history['val_root_mean_squared_error'][3:])
    plt.title('model RMSE')
    plt.ylabel('root_mean_squared_error')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'][3:])
    plt.plot(history.history['val_loss'][3:])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    print('_____________________________________________')

#### Fine plots:

In [None]:
if not load_model_bool:
    # Re-plot the training curves with more of the early epochs cut off.
    # Each entry: (history key, plot title, number of leading epochs to skip)
    plt.close()
    for metric, title, skipped_epochs in (
        ('root_mean_squared_error', 'model RMSE', 13),
        ('loss', 'model loss', 128),
    ):
        plt.plot(history.history[metric][skipped_epochs:])
        plt.plot(history.history['val_' + metric][skipped_epochs:])
        plt.title(title)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

### Fine training with different batch size
Usually not necessary, but occasionally it brings a significant improvement

In [None]:
# Optional second training pass on the in-memory `model` with a smaller batch size.
# NOTE(review): `model`, `train_x`/`train_y`, `test_x`/`test_y` and `best_model_name` are only
# defined by the training cell when load_model_bool is False — confirm before enabling this.
continue_training = False
if continue_training:
    # Loading the model overrides the optimizer status
    #model = load_model('best_model_foldno_0.h5')
    #model = load_model('nn_model')
    # `K` is the keras backend imported in the import cell at the top of the notebook
    K.set_value(model.optimizer.learning_rate, 0.001)

    es = EarlyStopping(monitor='val_root_mean_squared_error', mode='min', verbose=1, patience=50,min_delta=0.001)
    # The fine-tuned checkpoint goes to its own 'fine_' prefixed file.
    # NOTE(review): the prediction cell loads 'best_model_foldno_0.h5', not this file.
    mc = ModelCheckpoint('fine_'+best_model_name, monitor='val_root_mean_squared_error', mode='min', save_best_only=True, verbose=1)
    cb_list = [es, mc]

    history = model.fit(x=train_x,y=train_y,batch_size=1000,validation_data=(test_x,test_y), epochs=2000, callbacks=cb_list, verbose = 0)
    score=model.evaluate(test_x,test_y)

    # summarize history for root_mean_squared_error
    plt.close()
    plt.plot(history.history['root_mean_squared_error'])
    plt.plot(history.history['val_root_mean_squared_error'])
    plt.title('model RMSE')
    plt.ylabel('root_mean_squared_error')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    print('_____________________________________________')

### Make predictions using the model with the best evaluation score

In [None]:
# Load the checkpoint with the best validation score and predict the target month.
# When load_model_bool is set, the checkpoint is read from `path` (joined with '/', the same
# way the pickled dataframes are located); otherwise from the working directory, where the
# training cell's ModelCheckpoint wrote it.
best_model_file = 'best_model_foldno_0.h5'
if load_model_bool:
    best_model_file = path + '/' + best_model_file
# The model was compiled with the custom loss function, so it has to be passed for deserialization
best_model = load_model(best_model_file, custom_objects={'root_mean_squared_error': root_mean_squared_error})

# 'ID' and the (still empty) target column are not model inputs
predictions = best_model.predict(df_for_predictions.drop(['ID','item_cnt_month'],axis=1).values)
df_for_predictions['item_cnt_month'] = predictions

In [None]:
# Prepare and save the dataframe for submission
df_for_submission = test.copy()
df_for_submission = df_for_submission.merge(df_for_predictions, how='left', suffixes=('','_'),on=['item_id','shop_id']).fillna(0)[['ID','item_cnt_month']]

print('Mean: ',df_for_submission['item_cnt_month'].mean(),'|| Min: ',df_for_submission['item_cnt_month'].min(),'|| Max: ', df_for_submission['item_cnt_month'].max(),'|| Sum: ', df_for_submission['item_cnt_month'].sum())
print('Percentage of predicted values: ', predictions.size/df_for_submission['item_cnt_month'].size) 

df_for_submission['item_cnt_month'] = df_for_submission['item_cnt_month'].clip(0,20).round(3)

df_for_submission.to_csv('df_for_submission.csv', index=False)

# Rounding is usually detrimental for the result, I recommend using the unrounded results
df_for_submission_rounded = df_for_submission.copy()
df_for_submission_rounded['item_cnt_month'] = df_for_submission_rounded['item_cnt_month'].round()

df_for_submission_rounded.to_csv('df_for_submission_rounded.csv', index=False)

print('===========================================================================')
print('Mean: ',df_for_submission['item_cnt_month'].mean(),'|| Min: ',df_for_submission['item_cnt_month'].min(),'|| Max: ', df_for_submission['item_cnt_month'].max(),'|| Sum: ', df_for_submission['item_cnt_month'].sum())
print('Percentage of zeros: ',df_for_submission['item_cnt_month'].round().value_counts()[0]/df_for_submission['item_cnt_month'].size) 
print('Percentage of predicted values: ', predictions.size/df_for_submission['item_cnt_month'].size) 

### Check for high memory consumption and delete large objects before the next step
It is probably better to run the xgboost version only if you did not run the neural network version: due to memory constraints I can only run either the feature extraction, or the NN, or the xgboost model in a single session.
This was no problem when these were 3 different notebooks, but I combined them into one for the Coursera submission, and now I cannot save the dataframes, the NN and the xgboost model all as output.

In [None]:
import sys

# Names that IPython itself puts into the namespace (plus this list) are left out of the report
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, sys.getsizeof result in bytes) for every remaining public global, largest first
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

In [None]:
# Drop large objects that the xgboost section does not use.
# Several of these names only exist on some execution paths (e.g. the df_for_nn_* splits are
# created only when load_model_bool is False), so missing names are skipped instead of letting
# a bare `del` raise NameError.
for _name in (
    'df_for_nn_train_x',
    'df_for_nn_train_y',
    'transactions',
    'df_for_nn_test_x',
    'df_for_nn_test_y',
    'df_for_submission',
    'df_for_submission_rounded',
    'item_categories',
    'shops',
):
    globals().pop(_name, None)

## XGB based approach - very fast with good (but not best) results, has feature importance plots

In [None]:
def _month_dmatrix(frame, month_mask):
    """Build a labelled DMatrix from the rows of `frame` selected by `month_mask`.

    The 'item_cnt_month' column becomes the label; all other columns are the features.
    """
    rows = frame.loc[month_mask]
    return xgb.DMatrix(data=rows.drop('item_cnt_month', axis=1), label=rows['item_cnt_month'])


# Optionally replace the engineered features by their first principal components
use_pca = False
if use_pca:
    from sklearn.decomposition import PCA
    n_components = 10
    # Target and key columns are kept out of the PCA and re-attached afterwards
    df = df_for_nn.drop(['item_cnt_month','date_block_num','shop_id','item_id','item_category_id'],axis=1)

    pca = PCA(n_components)
    pca.fit(df)

    plt.plot(pca.explained_variance_ratio_)
    plt.ylabel('Explained Variance')
    plt.xlabel('Components')
    plt.show()

    # Project the training/validation frame
    df_pca = pd.DataFrame(
        pca.transform(df),
        columns=['PCA%i' % i for i in range(n_components)],
        index=df.index,
    )
    df_pca[['item_cnt_month','date_block_num','shop_id','item_id','item_category_id']] = df_for_nn[['item_cnt_month','date_block_num','shop_id','item_id','item_category_id']]
    df_pca.to_pickle('df_for_pca.pkl')

    # Project the prediction frame with the PCA fitted above
    df = df_for_predictions.drop(['ID','item_cnt_month','date_block_num','shop_id','item_id','item_category_id'], axis=1)
    df_pca_test = pd.DataFrame(
        pca.transform(df),
        columns=['PCA%i' % i for i in range(n_components)],
        index=df.index,
    )
    df_pca_test[['item_cnt_month','date_block_num','shop_id','item_id','item_category_id']] = df_for_predictions[['item_cnt_month','date_block_num','shop_id','item_id','item_category_id']]
    df_pca_test.to_pickle('df_pca_predictions.pkl')
    df_for_predictions = df_pca_test

    data_pred_dmatrix = xgb.DMatrix(data=df_pca_test.drop(['item_cnt_month'], axis=1),label=None)
    # Train on every month before 33, validate on month 33
    data_train_dmatrix = _month_dmatrix(df_pca, df_pca['date_block_num']<33)
    data_test_dmatrix = _month_dmatrix(df_pca, df_pca['date_block_num']==33)

    del df_pca_test
    del df

else:
    data_pred_dmatrix = xgb.DMatrix(data=df_for_predictions.drop(['item_cnt_month','ID'], axis=1),label=None)
    # Train on every month before 33, validate on month 33
    data_train_dmatrix = _month_dmatrix(df_for_nn, df_for_nn['date_block_num']<33)
    data_test_dmatrix = _month_dmatrix(df_for_nn, df_for_nn['date_block_num']==33)


params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 1,
    'learning_rate': 0.1,
    'max_depth': 11,
    'alpha': 2,
    'lambda': 1,
    'subsample': 1.0,
    'gamma': 1,
    'tree_method': 'gpu_hist',
}

# Metrics for both sets are logged every 20 rounds; early stopping uses the last entry of `evals`
xg_reg = xgb.train(
    params=params,
    dtrain=data_train_dmatrix,
    evals=[(data_train_dmatrix, 'train'), (data_test_dmatrix, 'valid')],
    verbose_eval = 20,
    num_boost_round=2000,
    early_stopping_rounds=50,
)

In [None]:
# Predict the target month with the trained booster and write the submission files
predictions = xg_reg.predict(data_pred_dmatrix)

df_for_submission = test.copy()

df_for_predictions['item_cnt_month'] = predictions
print(df_for_predictions['item_cnt_month'].mean())

# Left join keeps every row of `test`; pairs without a prediction get item_cnt_month = 0 via fillna
df_for_submission = df_for_submission.merge(df_for_predictions, how='left', suffixes=('','_'),on=['item_id','shop_id']).fillna(0)[['ID','item_cnt_month']]

# Statistics of the raw (unclipped) predictions
print('Mean: ',df_for_submission['item_cnt_month'].mean(),'|| Min: ',df_for_submission['item_cnt_month'].min(),'|| Max: ', df_for_submission['item_cnt_month'].max(),'|| Sum: ', df_for_submission['item_cnt_month'].sum())
#print('Percentage of zeros: ',df_for_submission['item_cnt_month'].value_counts()[0]/df_for_submission['item_cnt_month'].size) 
print('Percentage of predicted values: ', predictions.size/df_for_submission['item_cnt_month'].size) 
# Clip the predictions into [0, 20] and keep three decimals
df_for_submission['item_cnt_month'] = df_for_submission['item_cnt_month'].clip(0, 20).round(3)

df_for_submission.to_csv('xgb_df_for_submission.csv', index=False)

# Second variant with the predictions rounded to whole numbers
df_for_submission_rounded = df_for_submission.copy()
df_for_submission_rounded['item_cnt_month'] = df_for_submission_rounded['item_cnt_month'].round()

df_for_submission_rounded.to_csv('xgb_df_for_submission_rounded.csv', index=False)

# Statistics of the clipped values that were written to xgb_df_for_submission.csv
print('Mean: ',df_for_submission['item_cnt_month'].mean(),'|| Min: ',df_for_submission['item_cnt_month'].min(),'|| Max: ', df_for_submission['item_cnt_month'].max(),'|| Sum: ', df_for_submission['item_cnt_month'].sum())
# Boolean mean instead of value_counts()[0]: the latter raises KeyError when no value rounds to 0
print('Percentage of zeros: ',(df_for_submission['item_cnt_month'].round() == 0).mean()) 
print('Percentage of predicted values: ', predictions.size/df_for_submission['item_cnt_month'].size) 

### Plot decision tree and feature importance
... and save them as a PDF because they are not readable inside the notebook

In [None]:
# Enlarge the default figure size before drawing the tree.
# This rcParams change is global, so it also applies to figures created afterwards.
plt.rcParams.update({'figure.figsize': [200, 100]})
xgb.plot_tree(xg_reg)
# Write the current figure (the tree plot) to disk
plt.savefig(fname="temp.pdf")

In [None]:
# Draw the feature importance chart of the trained booster and write the current figure to disk
xgb.plot_importance(booster=xg_reg)
plt.savefig(fname="temp2.pdf")