In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import gc
import xgboost as xgb
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
import pickle
from sklearn.model_selection import KFold


In [3]:
transactions    = pd.read_csv('sales_train.csv.gz')
items           = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
test            = pd.read_csv('test.csv.gz')

In [59]:
transactions = pd.merge(transactions, items, on='item_id', how='left')
transactions = transactions.drop('item_name', axis=1)
transactions = transactions[transactions['date_block_num'] > 12]
transactions.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id_x,y,item_cnt_sum,item_cnt_mean,item_price_mean,shop_cnt_sum,shop_cnt_mean,shop_price_mean,category_cnt_sum,category_cnt_mean,category_price_mean,item_category_id_y,item_category_id
0,13,27,15242,699.0,1.0,63,2.0,8.0,1.0,666.75,4208.0,1.298365,952.135647,1281.0,1.051724,780.414491,63,63
1,13,27,15200,299.0,1.0,69,1.0,1.0,1.0,299.0,4208.0,1.298365,952.135647,887.0,1.251058,396.1441,69,69
2,13,27,15279,799.0,1.0,63,2.0,48.0,1.043478,799.0,4208.0,1.298365,952.135647,1281.0,1.051724,780.414491,63,63
3,13,27,15202,299.0,1.0,69,1.0,3.0,1.0,365.666667,4208.0,1.298365,952.135647,887.0,1.251058,396.1441,69,69
4,13,27,14888,549.0,1.0,55,1.0,19.0,1.0,545.526316,4208.0,1.298365,952.135647,9442.0,1.018005,286.466544,55,55


In [5]:
train_item_ids = transactions['item_id'].unique()
train_shop_ids = transactions['shop_id'].unique()
train_blocks = transactions['date_block_num'].unique()

In [6]:
train_combos = [[item, shop, block] for item in train_item_ids for shop in train_shop_ids for block in train_blocks]

In [7]:
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()

In [8]:
test_combos = [[item, shop, block] for item in test_item_ids for shop in test_shop_ids for block in train_blocks]

In [53]:
all_combos = pd.DataFrame(np.unique(np.vstack([train_combos, test_combos]), axis=0),dtype=np.int8, columns=['item_id', 'shop_id', 'date_block_num'])

In [10]:
len(all_combos)

19414332

In [54]:
all_combos.head()

Unnamed: 0,item_id,shop_id,date_block_num
0,0,2,13
1,0,2,14
2,0,2,15
3,0,2,16
4,0,2,17


In [60]:
transactions.drop('date', inplace=True, axis=1)

KeyError: "['date'] not found in axis"

In [61]:
transactions['y'] = transactions.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].transform(np.sum)

In [62]:
gc.collect()

transactions['item_cnt_sum'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['item_cnt_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['item_price_mean'] = transactions.groupby(['item_id', 'date_block_num'])['item_price'].transform(np.mean)

transactions['shop_cnt_sum'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['shop_cnt_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['shop_price_mean'] = transactions.groupby(['shop_id', 'date_block_num'])['item_price'].transform(np.mean)

transactions['category_cnt_sum'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.sum)
transactions['category_cnt_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_cnt_day'].transform(np.mean)
transactions['category_price_mean'] = transactions.groupby(['item_category_id', 'date_block_num'])['item_price'].transform(np.mean)

training = pd.merge(all_combos,transactions,on=['item_id', 'shop_id', 'date_block_num'],how='left').fillna(0)

In [24]:
training = training.sample(frac=1)

In [63]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

training_int = training.select_dtypes(include=['int'])
converted_int = training_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(training_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([training_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

148.99 MB
148.99 MB


Unnamed: 0,before,after


In [64]:
training_float = training.select_dtypes(include=['float'])
converted_float = training_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(training_float))
print(mem_usage(converted_float))

compare_floats = pd.concat([training_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

2383.89 MB
1266.44 MB


Unnamed: 0,before,after
float32,,15.0
float64,15.0,


In [65]:
optimized_training = training.copy()

optimized_training[converted_int.columns] = converted_int
optimized_training[converted_float.columns] = converted_float

print(mem_usage(training))
print(mem_usage(optimized_training))

2439.76 MB
1322.32 MB


In [66]:
training = optimized_training
del optimized_training
gc.collect()

70

In [67]:
len(training)

19528844

In [68]:
lag_columns = training.columns[7:]
lags = [1,2,3,6,12]

In [69]:
gc.collect()
def lagged_name(lag_column, lag):
    return "%s_lag_%d" % (lag_column, lag)

for lag in lags:
    lagged = transactions.copy()
    training['lagged_block'] = training['date_block_num'] - lag
    lagged_names = [lagged_name(c,lag) for c in lag_columns]
    lag_mapping = dict(zip(lag_columns, lagged_names))
    lagged.rename(lag_mapping,inplace=True)
    training = pd.merge(training,lagged,left_on='lagged_block',right_on='date_block_num')
    del lagged
    gc.collect()

MemoryError: 