In [None]:
# Reference link:
# https://www.kaggle.com/gordotron85/future-sales-xgboost-top-3
# https://www.kaggle.com/szhou42/predict-future-sales-top-11-solution
# https://www.kaggle.com/pavansanagapati/14-simple-tips-to-save-ram-memory-for-1-gb-dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import time

from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing
from xgboost import plot_tree
from matplotlib import pyplot

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

import gc

kernel_with_output = False  # toggle meant to control the code; not read by any visible cell

# Input data files are available in the read-only "../input/" directory.
# Walk that directory tree and print the full path of every file found.

import os
for folder, _subfolders, file_names in os.walk('/kaggle/input/'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data Loading**

In [None]:
print("Loading data ... ...")

# Read all six competition tables; the order of the paths below matches the
# order of the names on the left-hand side.
sales_train, items, shops, item_categories, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv',
        '/kaggle/input/competitive-data-science-predict-future-sales/items.csv',
        '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv',
        '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv',
        '/kaggle/input/competitive-data-science-predict-future-sales/test.csv',
        '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv',
    )
)

print('Done.')

# **Preprocessing**

In [None]:
# De-duplicate the daily sales records and filter out implausible rows.
from sklearn.preprocessing import LabelEncoder

print('Data processing ...')

# Rows that agree on all of these columns are considered duplicates;
# the first occurrence is kept.
subset = ['date','date_block_num','shop_id','item_id','item_cnt_day']
duplicate_flags = sales_train.duplicated(subset=subset)
print(duplicate_flags.value_counts())
sales_train = sales_train.loc[~duplicate_flags]

# Outlier / validity filter, applied in one pass:
#   - drop prices above 300000
#   - drop daily counts above 1000
#   - keep only strictly positive prices
too_expensive = sales_train['item_price'] > 300000
too_many_sold = sales_train['item_cnt_day'] > 1000
positive_price = sales_train['item_price'] > 0
sales_train = sales_train[~too_expensive & ~too_many_sold & positive_price].reset_index(drop = True)

# Daily counts below 1 (e.g. negative values) are replaced by 0.
sales_train.loc[sales_train.item_cnt_day < 1, "item_cnt_day"] = 0

# Disabled experiment, kept as an inert string literal (nothing inside the
# quotes is executed): remap shop ids 0 -> 57, 1 -> 58 and 10 -> 11 directly on
# the raw `sales_train` and `test` frames.
# Note that the final print('Done.') is also inside the string, so this cell
# never prints it.
'''
# clean shop data
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

print('Done.')
'''

In [None]:
# drop shops&items not in test data: this doesn't help
# test_shops = test.shop_id.unique()
# test_items = test.item_id.unique()

# sales_train = sales_train[sales_train.shop_id.isin(test_shops)]
# sales_train = sales_train[sales_train.item_id.isin(test_items)]

# print('train:', sales_train.shape)

# **Create grid, aggregate data**

In [None]:
print("Creating grid ... ...")

# For every month, build the cartesian product of the shops and the items
# that have at least one sales record in that month.
index_cols = ['shop_id', 'item_id', 'date_block_num']
monthly_grids = []
for block_num in sales_train['date_block_num'].unique():
    month_sales = sales_train[sales_train['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    monthly_grids.append(
        np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32')
    )
    # Do not keep a reference to the monthly slice around.
    del month_sales

# Stack the per-month arrays into a single (shop, item, month) frame.
grid = pd.DataFrame(np.vstack(monthly_grids), columns = index_cols,dtype=np.int32)
del monthly_grids

print('Done.')

In [None]:
print("Aggregating and merging ... ...")

# Cap every daily count to [0, 20] before summing. 20 is the same upper bound
# this notebook later applies to the monthly target and to the predictions.
sales_train['item_cnt_day'] = sales_train['item_cnt_day'].clip(0,20)

# Group the daily records per (shop, item, month): total units sold and mean price.
groups = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'])

trainset = groups.agg({'item_cnt_day':'sum', 'item_price':'mean'}).reset_index()
# The summed daily count is the monthly count.
trainset = trainset.rename(columns = {'item_cnt_day' : 'item_cnt_month'})

# Cap the monthly total to the same [0, 20] range.
trainset['item_cnt_month'] = trainset['item_cnt_month'].clip(0,20)


# Left-join onto the grid so that shop/item pairs without sales in a month
# are kept; their monthly count becomes 0 (their price stays NaN).
trainset = pd.merge(grid,trainset,how='left',on=index_cols)
trainset.item_cnt_month = trainset.item_cnt_month.fillna(0)

# Attach the item category id.
trainset = pd.merge(trainset, items[['item_id', 'item_category_id']], on = 'item_id')

print("Writing trainset to csv ... ... ")
# index=False: the RangeIndex carries no information and would otherwise be
# written as an extra unnamed column that shows up again when the file is read.
trainset.to_csv('trainset_with_grid.csv', index=False)

print('Done.')

In [None]:
# Release the large intermediates; the aggregated table was written to disk above.
del grid, sales_train, groups, trainset

gc.collect()

In [None]:
# test_size_mb = trainset.memory_usage().sum() / 1024 / 1024
# print("Test memory size: %.2f MB" % test_size_mb)

# **Feature Engineering**

In [None]:
# Seeds and display options.
np.random.seed(10)
for option_name, option_value in (('display.max_rows', 231), ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)

# Names of the engineered feature columns; the feature cells append to this list.
new_features = list()
# enable_feature_idea = [True, True, True, True, True, True, True, True, True, True]

# Default lag horizons in months (1..12). An earlier, shorter choice was [1,2,3,12];
# the original note suggests more periods may improve the score.
lookback_range = list(range(1, 13))

# Register DataFrame.progress_apply and friends.
tqdm.pandas()

In [None]:
# Load the gridded train table and the item/shop metadata, derive shop-level
# attributes (city code, shop category), add revenue, and downcast dtypes.
from sklearn.preprocessing import LabelEncoder

print('Loading train data ...')

# Load data
trainset = pd.read_csv('/kaggle/working/trainset_with_grid.csv')

items = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
shops = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Clean shop data: shop ids 0, 1 and 10 are folded into 57, 58 and 11.

# Yakutsk, Ordzhonikidze 56
trainset.loc[trainset.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping centre
trainset.loc[trainset.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39m²
trainset.loc[trainset.shop_id == 10, 'shop_id'] = 11


# Clean/Add shop city and category
# Join the two-word city name so that splitting on spaces keeps it in one token.
shops.loc[ shops.shop_name == 'Сергиев Посад ТЦ "7Я"',"shop_name" ] = 'СергиевПосад ТЦ "7Я"'

# City = first space-separated token of the shop name with every '!' removed,
# stored as integer category codes. Because '!' is stripped before encoding,
# no later fix-up of '!'-prefixed city names is needed (the original attempted
# one by comparing the integer codes with a string, which can never match).
city = shops.shop_name.apply(lambda name: name.replace('!', '').split(' ')[0])
shops['city'] = pd.Categorical(city).codes

# Shop category = second token of the shop name.
shops["category"] = shops.shop_name.str.split(" ").map( lambda x: x[1] )

# Only keep a shop category if there are 5 or more shops of that category;
# the rest are grouped as "other". Encoding happens once, after the grouping.
category = [
    name for name, n_shops in shops.category.value_counts().items() if n_shops >= 5
]
shops.category = shops.category.apply( lambda x: x if (x in category) else "other" )
shops["shop_category"] = LabelEncoder().fit_transform( shops.category )

shops = shops[["shop_id", "shop_category", "city"]]

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Add revenue to train data (NaN wherever item_price is NaN).
trainset["revenue"] = trainset["item_cnt_month"] * trainset["item_price"]


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Add city info to train data (done after the shop-id remap above, so remapped
# shops receive the city of their new id).
trainset = pd.merge(trainset, shops[['shop_id', 'city']], on = 'shop_id')

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('Converting data type ...')

trainset = trainset.astype({
    'shop_id': 'int16',
    'item_id': 'int16',
    'date_block_num': 'int16',
    'item_cnt_month': 'int16',
    'item_price': 'float32',
    'item_category_id': 'int16',
    'revenue': 'float32',
    'city': 'int16',
})

# Keep only the base feature columns and the months in [start_month, end_month].
start_month = 0
end_month = 33

trainset = trainset[['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_price', 'item_cnt_month', 'revenue', 'city']]

trainset = trainset[(trainset.date_block_num >= start_month) & (trainset.date_block_num <= end_month)]

print('Done.')

In [None]:
# Preview the first five rows of the prepared train table.
trainset.head(n=5)

In [None]:
# Build the prediction frame for month 34 with the same columns and the same
# shop-id cleaning as the train frame.
print('Loading test set...')

# Columns 1 and 2 of test.csv are read as shop_id and item_id.
test_dataset = loadtxt('/kaggle/input/competitive-data-science-predict-future-sales/test.csv', delimiter="," ,skiprows=1, usecols = (1,2), dtype=int)
testset = pd.DataFrame(test_dataset, columns = ['shop_id', 'item_id'])

# Remap the duplicated shop ids FIRST, exactly as is done for the train frame,
# so that the city merged below belongs to the remapped id in both frames.
# Yakutsk, Ordzhonikidze 56
testset.loc[testset.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping centre
testset.loc[testset.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39m²
testset.loc[testset.shop_id == 10, 'shop_id'] = 11


# Merge test data
print('Merging with other datasets...')

# Get item category id into the test frame
testset = testset.merge(items[['item_id', 'item_category_id']], on = 'item_id', how = 'left')

testset['date_block_num'] = 34

# Placeholder target so the test frame has the same column as the train frame
# and the two can be concatenated row-wise.
testset['item_cnt_month'] = -1

# Merge city info (left join keeps the original row order of the test file).
testset = pd.merge(testset,shops[['shop_id','city']],how='left',on='shop_id')

print('Done.')

In [None]:
# Stack the train rows on top of the test rows.
# The test frame has no item_price column, so those rows get NaN there.
train_test_set = pd.concat((trainset, testset), axis='index')

# Row-count check (shown as the cell output): nothing lost, nothing duplicated.
len(train_test_set) == len(trainset) + len(testset)

In [None]:
# The separate train/test frames are no longer needed once they are concatenated.
del trainset, testset, test_dataset

gc.collect()

# **Add New Features**

In [None]:
item_cat = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv'
)

# The category names are grouped into upper-level names in the next cell.
item_cat.head(n=5)

In [None]:
# Replace each item category name by an English upper-level group name.
# Each entry is (first position, one-past-last position, group name); positions
# refer to the row order of item_cat.
category_spans = (
    (0, 1, 'PC Headsets / Headphones'),
    (1, 8, 'Access'),
    (8, 9, 'Tickets (figure)'),
    (9, 10, 'Delivery of goods'),
    (10, 18, 'Consoles'),
    (18, 25, 'Consoles Games'),
    (25, 26, 'Accessories for games'),
    (26, 28, 'phone games'),
    (28, 32, 'CD games'),
    (32, 37, 'Card'),
    (37, 43, 'Movie'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Soft'),
    (79, 81, 'Office'),
    (81, 83, 'Clean'),
    (83, 84, 'Elements of a food'),
)

l_cat = list(item_cat.item_category_name)
for span_start, span_stop, group_name in category_spans:
    for position in range(span_start, span_stop):
        l_cat[position] = group_name


# Encode the group names as integer ids.
lb = preprocessing.LabelEncoder()


# Add the group id and the group name next to the original category columns.
item_cat['item_category_id_fix'] = lb.fit_transform(l_cat)
item_cat['item_category_name_fix'] = l_cat



In [None]:
# Swap the original item_category_id in train/test for the grouped id.
train_test_set = (
    train_test_set
    .merge(item_cat[['item_category_id', 'item_category_id_fix']], on = 'item_category_id', how = 'left')
    .drop(columns=['item_category_id'])
    .rename(columns = {'item_category_id_fix':'item_category_id'})
)

# Reduce item_cat to one row per group: drop the original id/name columns,
# give the grouped columns the plain names, and remove duplicated rows.
item_cat = (
    item_cat
    .drop(columns=['item_category_id', 'item_category_name'])
    .rename(columns = {
        'item_category_id_fix':'item_category_id',
        'item_category_name_fix':'item_category_name',
    })
    .drop_duplicates()
)
item_cat.index = np.arange(0, len(item_cat))

In [None]:
# So far, train and test are in same df: because we did the item category group info, and also
# will add new features (especially lag info)

In [None]:
# Downcast the base columns of the combined frame to save memory.
compact_dtypes = {
    'shop_id': 'int16',
    'item_id': 'int16',
    'date_block_num': 'int16',
    'item_price': 'float32',
    'item_cnt_month': 'int16',
    'revenue': 'float32',
    'city': 'int16',
}
train_test_set = train_test_set.astype(compact_dtypes)
train_test_set.info()

In [None]:
# Preview the de-duplicated category-group table.
item_cat.head(n=5)

In [None]:
# The label encoder is no longer needed once the group ids are stored in item_cat.
del lb
gc.collect()

New feature: test 0

Lagged monthly sales per shop/item pair (lags of 1, 2 and 3 months).

In [None]:

# Add the shop/item sales of the previous months as lag features.
lookback_range = [1,2,3]
lag_keys = ['shop_id', 'item_id', 'date_block_num']

for diff in tqdm(lookback_range):
    feature_name = 'prev_shopitem_sales_' + str(diff)

    # Copy only the join keys and the target instead of the whole frame:
    # nothing else is used by the merge, and the full copy is expensive.
    trainset2 = train_test_set[lag_keys + ['item_cnt_month']].copy()
    # Shifting the month forward by `diff` makes month m line up with month m + diff.
    trainset2['date_block_num'] += diff
    trainset2 = trainset2.rename(columns={'item_cnt_month': feature_name})

    train_test_set = train_test_set.merge(trainset2, on = lag_keys, how = 'left')
    # Pairs without a row `diff` months earlier get 0; int16 saves memory.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0).astype('int16')

    new_features.append(feature_name)

    # free memory
    del trainset2

gc.collect()


In [None]:
# Disabled experiment, kept as an inert string literal (nothing inside the
# quotes is executed): mean / max / min over the lag-feature columns.
# NOTE(review): the reductions are called without axis=1, so they would reduce
# over rows and yield one value per column rather than one per row — fix
# before re-enabling.
'''
# YG: try add mean, max, min of last 12 months sales;

train_test_set['sales_mean'] = train_test_set[new_features].mean()
train_test_set['sales_max'] = train_test_set[new_features].max()
train_test_set['sales_min'] = train_test_set[new_features].min()

train_test_set['sales_mean'] = train_test_set['sales_mean'].fillna(0)
train_test_set['sales_min'] = train_test_set['sales_min'].fillna(0)
train_test_set['sales_max'] = train_test_set['sales_max'].fillna(0)

train_test_set['sales_mean'] = train_test_set['sales_mean'].astype('int16')
train_test_set['sales_min'] = train_test_set['sales_min'].astype('int16')
train_test_set['sales_max'] = train_test_set['sales_max'].astype('int16')

new_features.append('sales_mean')
new_features.append('sales_max')
new_features.append('sales_min')
'''

New feature: test 1

Lagged mean monthly sales per item (lags of 1, 2 and 3 months).

In [None]:

lookback_range = [1,2,3]

# Mean monthly count of every item over all rows of that item in a month.
# This does not depend on the lag, so it is computed once, outside the loop.
item_month_mean = (
    train_test_set
    .groupby(by = ['item_id', 'date_block_num'])
    .agg({'item_cnt_month':'mean'})
    .reset_index()
)

for diff in tqdm(lookback_range):
    feature_name = 'prev_item_sales_' + str(diff)

    # Shift the month forward so that month m is joined onto month m + diff.
    result = (
        item_month_mean
        .assign(date_block_num=item_month_mean['date_block_num'] + diff)
        .rename(columns={'item_cnt_month': feature_name})
    )

    train_test_set = train_test_set.merge(result, on = ['item_id', 'date_block_num'], how = 'left')

    # The feature is a mean, i.e. generally fractional. Casting it to int16
    # (as was done before) truncates most values to 0; float32 keeps the
    # information and matches the dtype used for the lagged item price means.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0).astype('float32')

    new_features.append(feature_name)

    del result

# save some memory
del item_month_mean
gc.collect()


New feature: test 2

Use lagged shop/item price

In [None]:
# Disabled experiment, kept as an inert string literal (nothing inside the
# quotes is executed): group-by variant of the lagged shop/item price features.
# It produces the same 'prev_shopitem_price_<lag>' column names as the active
# "test 3" cell, so the two must not be enabled together.
'''
lookback_range = [1,2,3]

print('Adding shop/item lagged prices features ...')

groups = train_test_set.groupby(by = ['shop_id', 'item_id', 'date_block_num'])

for diff in tqdm(lookback_range):
    feature_name = 'prev_shopitem_price_' + str(diff)
    result = groups.agg({'item_price':'mean'})
    result = result.reset_index()
    result.loc[:, 'date_block_num'] += diff
    result.rename(columns={'item_price': feature_name}, inplace=True)
    
    train_test_set = train_test_set.merge(result, on = ['shop_id', 'item_id', 'date_block_num'], how = 'left')
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
    new_features.append(feature_name)        
    
    del result
    
# Save some memory
del groups
gc.collect()

print('Done.')
'''

New feature: test 3

Lagged shop/item price (lags of 1, 2 and 3 months) -- YG

In [None]:
lookback_range = [1,2,3]
join_keys = ['shop_id', 'item_id', 'date_block_num']

# For every lag, join the price a shop/item pair had `diff` months earlier.
for diff in tqdm(lookback_range):
    feature_name = 'prev_shopitem_price_' + str(diff)

    # Moving the month forward by `diff` aligns month m with month m + diff.
    shifted_prices = (
        train_test_set[join_keys + ['item_price']]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + diff)
        .rename(columns={'item_price': feature_name})
    )

    train_test_set = train_test_set.merge(shifted_prices, on = join_keys, how = 'left')
    # No price available for that earlier month -> 0.
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0)

    new_features.append(feature_name)

    # free memory
    del shifted_prices

gc.collect()


New feature: test 4

Lagged mean item price (lags of 1, 2 and 3 months).

In [None]:
lookback_range = [1,2,3]

print('Adding item lagged price features ... ')

# Mean price of every item per month; the grouping is taken on the frame as it
# is before any of the lag columns below are added.
groups = train_test_set.groupby(by = ['item_id', 'date_block_num'])
item_month_price = groups.agg({'item_price':'mean'}).reset_index()

for diff in tqdm(lookback_range):
    feature_name = 'prev_item_price_' + str(diff)

    # Month m of the aggregate is joined onto month m + diff of the frame.
    result = (
        item_month_price
        .assign(date_block_num=item_month_price['date_block_num'] + diff)
        .rename(columns={'item_price': feature_name})
    )

    train_test_set = train_test_set.merge(result, on = ['item_id', 'date_block_num'], how = 'left')

    # Missing lag -> 0; float32 to save memory (YG).
    train_test_set[feature_name] = train_test_set[feature_name].fillna(0).astype('float32')

    new_features.append(feature_name)

    del result

# Save some memory
del item_month_price
del groups
gc.collect()

print('Done.')


New feature: test 5

Number of months since the last sale of the shop/item pair

In [None]:

print('Adding # of month from last sale feature ...')


def create_last_sale_shop_item(row):
    """Row-wise reference implementation of the "months since last sale" rule.

    Expects `row` to expose '_prev_shopitem_sales_<k>' for k = 1..33 and
    returns the smallest k whose value is non-zero, or NaN if there is none.
    The vectorised code below applies the same rule without materialising
    those helper columns; this function is kept for existing callers.
    """
    for diff in range(1,33+1):
        feature_name = '_prev_shopitem_sales_' + str(diff)
        if row[feature_name] != 0.0:
            return diff
    return np.nan


# Look back up to 33 months.
lookback_range = list(range(1, 33 + 1))
lag_keys = ['shop_id', 'item_id', 'date_block_num']

# Only the join keys and the monthly count are needed for the look-ups.
history = train_test_set[lag_keys + ['item_cnt_month']]

# Result, one entry per row of train_test_set; NaN means "no sale found yet".
last_sale = np.full(len(train_test_set), np.nan)

for diff in tqdm(lookback_range):
    # Month m of the history is joined onto month m + diff.
    shifted = history.assign(date_block_num=history['date_block_num'] + diff)
    lagged_sales = (
        train_test_set[lag_keys]
        .merge(shifted, on = lag_keys, how = 'left')['item_cnt_month']
        .fillna(0)
        .to_numpy()
    )
    # The row-by-row bookkeeping below is positional, so the left join must
    # return exactly one row per input row.
    assert len(lagged_sales) == len(last_sale), 'duplicate shop/item/month keys in train_test_set'

    # Lags are visited from nearest to farthest, so the first non-zero hit is
    # the most recent sale; rows that already have an answer are left alone.
    newly_found = np.isnan(last_sale) & (lagged_sales != 0)
    last_sale[newly_found] = diff

    del shifted, lagged_sales, newly_found

train_test_set['last_sale_shop_item'] = last_sale
new_features.append('last_sale_shop_item')

del history, last_sale
gc.collect()

print('Done.')


New feature: test 6 

Use text info;

In [None]:
# Disabled experiment, kept as an inert string literal (nothing inside the
# quotes is executed): TF-IDF features built from the item names.
# NOTE(review): the int16 cast inside the loop would truncate fractional
# TF-IDF weights — confirm the intended dtype before re-enabling.
'''
print('Add text features ...')

items_subset = items[['item_id', 'item_name']]
feature_count = 25

tfidf = TfidfVectorizer(max_features=feature_count)
items_df_item_name_text_features = pd.DataFrame(tfidf.fit_transform(items_subset['item_name']).toarray())

cols = items_df_item_name_text_features.columns
for i in range(feature_count):

    feature_name = 'item_name_tfidf_' + str(i)
    items_subset[feature_name] = items_df_item_name_text_features[cols[i]]
    
    # YG
    items_subset[feature_name] = items_subset[feature_name].astype('int16')
    
    new_features.append(feature_name)

    
items_subset.drop('item_name', axis = 1, inplace = True)
train_test_set = train_test_set.merge(items_subset, on = 'item_id', how = 'left')

# Save some memory
del items_subset
del items_df_item_name_text_features

gc.collect()

print('Done.')
'''

New feature: test 7

Delta revenue (lagged by one month)

In [None]:
# Total revenue of every shop in every month.
# min_count=1 leaves a month with no known revenue at all (such as the rows of
# the month to predict, whose revenue is NaN) as NaN instead of counting it as 0.
group = (
    train_test_set
    .groupby(["date_block_num", "shop_id"])["revenue"]
    .sum(min_count=1)
    .rename("date_shop_revenue")
    .reset_index()
)

train_test_set = train_test_set.merge( group , on = ["date_block_num", "shop_id"], how = "left" )
train_test_set['date_shop_revenue'] = train_test_set['date_shop_revenue'].astype(np.float32)

# Average monthly revenue of every shop. The mean must be taken over the
# monthly revenue column; averaging date_block_num (as was done before) yields
# the shop's mean month index, not its average revenue.
group = (
    group
    .groupby("shop_id")["date_shop_revenue"]
    .mean()
    .rename("shop_avg_revenue")
    .reset_index()
)

train_test_set = train_test_set.merge( group, on = ["shop_id"], how = "left" )
train_test_set["shop_avg_revenue"] = train_test_set.shop_avg_revenue.astype(np.float32)

# Relative deviation of the month's shop revenue from the shop's average.
# A zero average would give +/-inf; those are turned into NaN so that they do
# not poison the scaling below.
delta_revenue = (
    (train_test_set['date_shop_revenue'] - train_test_set['shop_avg_revenue'])
    / train_test_set['shop_avg_revenue']
)
train_test_set["delta_revenue"] = delta_revenue.replace([np.inf, -np.inf], np.nan).astype(np.float32)
del delta_revenue

# shift 1 month
feature_name = 'delta_revenue_lag_1'
lag_keys = ['shop_id', 'item_id', 'date_block_num']
# Only the join keys and the value to lag are copied, not the whole frame.
trainset2 = train_test_set[lag_keys + ['delta_revenue']].copy()
trainset2['date_block_num'] += 1
trainset2 = trainset2.rename(columns={'delta_revenue': feature_name})

train_test_set = train_test_set.merge(trainset2, on = lag_keys, how = 'left')
train_test_set[feature_name] = train_test_set[feature_name].fillna(0).astype('float32')
train_test_set.drop( ["date_shop_revenue", "shop_avg_revenue", "delta_revenue"] ,axis = 1, inplace = True)

# Scale the lagged feature: centre on the mean and divide by the value range.
# Skipped when the range is zero (constant column) to avoid dividing by zero.
lag_values = train_test_set[feature_name]
value_range = lag_values.max() - lag_values.min()
if value_range > 0:
    train_test_set[feature_name] = (lag_values - lag_values.mean()) / value_range

del lag_values
del trainset2
del group
gc.collect()

new_features.append(feature_name)

Select features and organize data

In [None]:
# Replace every remaining NaN by 0, then evaluate the "any NaN left?" check.
train_test_set = train_test_set.fillna(value=0)
train_test_set.isnull().values.any()

# The last train month is used as validation (see the split further down).

# Column order matters: identifiers first, then the engineered features,
# and the target as the very last column.
baseline_features = [
    'shop_id', 'item_id', 'item_category_id', 'date_block_num', 'city',
    *new_features,
    'item_cnt_month',
]

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~ Test scaling ~~~~~~~~~~~~~~~~~~~~~~
# scaler = StandardScaler()
# train_test_set[new_features] = scaler.fit_transform(train_test_set[new_features])

# **Train/Test Data Setup**

In [None]:

# Split the combined frame into train / validation / test by month.

print('Set up train/val/test data ...')

# Target clipped to the range 0-20 (NaN counted as 0 first).
train_test_set['item_cnt_month'] = train_test_set.item_cnt_month.fillna(0).clip(0,20)

# train: rows with date_block_num from 0 to 32 (inclusive)
train_time_range_lo = train_test_set['date_block_num'].ge(0)
train_time_range_hi = train_test_set['date_block_num'].le(32)

# val: rows with date_block_num equal to 33
validation_time = train_test_set['date_block_num'].eq(33)

# test: rows with date_block_num equal to 34
test_time = train_test_set['date_block_num'].eq(34)


# Select the rows of each split and keep only the model columns.
cv_trainset = train_test_set.loc[train_time_range_lo & train_time_range_hi, baseline_features]
cv_valset = train_test_set.loc[validation_time, baseline_features]
testset = train_test_set.loc[test_time, baseline_features]

print('Done.')


In [None]:
# The combined frame and the boolean masks are not needed after the split.
del train_test_set, train_time_range_lo, train_time_range_hi, validation_time, test_time

gc.collect()

# **Feature Selection**

In [None]:
#https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html#sphx-glr-auto-examples-feature-selection-plot-select-from-model-diabetes-py
#https://scikit-learn.org/stable/modules/feature_selection.html
#https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499

In [None]:
# Disabled, kept as an inert string literal (nothing inside the quotes is
# executed): imports used only by the disabled Lasso feature-selection cell below.
'''
# Load library
# from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn import linear_model
'''

In [None]:
# Take part of data to get most important features; (memory limit)
# cv_trainset = cv_trainset[cv_trainset['date_block_num']>15]
# cv_trainset['date_block_num'].unique()

In [None]:
# Disabled experiment, kept as an inert string literal (nothing inside the
# quotes is executed): rank features by |Lasso coefficient| and select them
# with SelectFromModel. It uses LassoCV, linear_model and SelectFromModel,
# which are imported only inside the other disabled cell, so both cells would
# have to be re-enabled together.
'''
# Construct train set and targets;
cv_trainset_vals = cv_trainset.values
# del cv_trainset

trainx = cv_trainset_vals[:, 0:len(baseline_features) - 1]
trainy = cv_trainset_vals[:, len(baseline_features) - 1]

del cv_trainset_vals


# Method 1
# Rank the importance of feature;
clf = LassoCV(alphas=[.005]).fit(trainx, trainy)
importance = np.abs(clf.coef_)

print(importance)

figure = plt.figure()
plt.plot(importance)
plt.show()

# Method 2
# Use Lasso to select features;
clf = linear_model.Lasso(alpha=0.005).fit(trainx, trainy)

model = SelectFromModel(clf, prefit=True)

# Visualize which features are selected
model.get_support()

# See the names of selected features
# selected_feat = cv_trainset.columns[(model.get_support())]

'''

# **XGBoost Model**

In [None]:
# Turn the train frame into numpy arrays: every column except the last is a
# feature, the last column (the final entry of baseline_features) is the target.
# cv_trainset_vals = cv_trainset.values.astype(int)

n_train_feature_cols = len(baseline_features) - 1
cv_trainset_vals = cv_trainset.to_numpy()
trainx, trainy = (
    cv_trainset_vals[:, :n_train_feature_cols],
    cv_trainset_vals[:, n_train_feature_cols],
)

del cv_trainset, cv_trainset_vals

In [None]:
# Same split for the validation frame: features are all columns but the last,
# the target is the last column.
# cv_valset_vals = cv_valset.values.astype(int)

n_val_feature_cols = len(baseline_features) - 1
cv_valset_vals = cv_valset.to_numpy()
valx, valy = (
    cv_valset_vals[:, :n_val_feature_cols],
    cv_valset_vals[:, n_val_feature_cols],
)

del cv_valset, cv_valset_vals

In [None]:
# Feature matrix of the test frame; its last column (the placeholder target)
# is left out.
# testset_vals = testset.values.astype(int)

n_test_feature_cols = len(baseline_features) - 1
testset_vals = testset.to_numpy()
testx = testset_vals[:, :n_test_feature_cols]

del testset, testset_vals

In [None]:
# Run a garbage-collection pass after the frames above were deleted, before fitting.
gc.collect()

In [None]:
# Fitting the model
print('Fitting...')

# Uses the parameter names of the scikit-learn wrapper:
#   - n_estimators replaces `num_round`, which XGBRegressor does not know, so
#     the intended 1000 boosting rounds were never applied;
#   - learning_rate / random_state / n_jobs replace the aliases eta / seed / nthread.
# eval_metric and early_stopping_rounds are given to the constructor, which
# requires xgboost >= 1.6; passing them to fit() is deprecated and was removed
# in xgboost 2.0.
model = xgb.XGBRegressor(
    max_depth = 11,
    min_child_weight = 0.5,
    subsample = 1,
    learning_rate = 0.3,
    n_estimators = 1000,
    random_state = 1,
    n_jobs = 16,
    eval_metric = 'rmse',
    early_stopping_rounds = 20)

# The validation month is monitored so that training stops once its RMSE has
# not improved for 20 rounds, instead of always running all 1000 rounds.
model.fit(trainx, trainy, eval_set=[(valx, valy)], verbose=50)

Predict

In [None]:
# Score the model on the validation month; predictions are clipped to the
# range 0-20 before the RMSE is computed.
preds = np.clip(model.predict(valx), 0, 20)
val_rmse = sqrt(mean_squared_error(valy, preds))
print('val set rmse: ', val_rmse)

In [None]:
# Predict the test month, clip to the range 0-20, and write the submission
# file with the row position as 'ID'.
preds = np.clip(model.predict(testx), 0, 20)

df = pd.DataFrame({
    'ID': np.arange(len(preds)),
    'item_cnt_month': preds,
}).set_index('ID')

df.to_csv('test_preds.csv')
print('test predictions written to file')

# **Feature Importance Analysis**

In [None]:
from xgboost import plot_importance

def plot_features(booster, figsize):
    """Plot the feature importances of `booster` on a new figure.

    booster: fitted model accepted by xgboost.plot_importance.
    figsize: (width, height) passed to plt.subplots.
    Returns the value returned by plot_importance.
    """
    _figure, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)

plot_features(model, (10,14))