In [8]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, metrics
import gc; gc.enable()
import random
import time, datetime

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import TheilSenRegressor, BayesianRidge

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor

np.random.seed(1122)


# Util functions
def print_runtime(start, end):
    print("runtime: {}".format( datetime.timedelta(seconds=(end-start)/60)))

def print_dataframe_size(name, df):
    print("size of {}: {:.3f} MB".format(name, df.memory_usage(index=True).sum()/1E6))


# Read datasets
print('Reading datasets...')
start = time.time()

dtypes = {'id':'uint32', 'item_nbr':'uint16', 'store_nbr':'int8', 'onpromotion':str}

print('Reading train and test...')
train = pd.read_csv('../input/train.csv', dtype=dtypes, parse_dates=['date'])
train['onpromotion'] = train['onpromotion'].map({'False': 0, 'True': 1})
test = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['date'])
test['onpromotion'] = test['onpromotion'].map({'False': 0, 'True': 1})
print('Reading others...')
items = pd.read_csv('../input/items.csv', dtype={'item_nbr':'uint16', 'perishable':bool})
stores = pd.read_csv('../input/stores.csv', dtype={'store_nbr':'uint8', 'cluster':'uint8' })
transactions = pd.read_csv('../input/transactions.csv', dtype={'store_nbr':'uint8'}, parse_dates=['date'])
holidays = pd.read_csv('../input/holidays_events.csv', dtype={'transferred':bool}, parse_dates=['date'])
oil = pd.read_csv('../input/oil.csv', parse_dates=['date'])

print_runtime(start, time.time())


# Dataset processing
print('Datasets processing...'); 
start_dp = time.time()

## Reduce training dataset
#train = train[(train['date'].dt.month == 8) & (train['date'].dt.day > 15)]
train = train[(train['date'].dt.month == 8)]

#train2017 = train[(train['date'].dt.year == 2017)]
#train201608 = train[(train['date'].dt.year == 2016) & (train['date'].dt.day > 15)]
#train2016 = train[(train['date'].dt.year == 2016))]
#train = pd.concat([train2017,train2016])
#del train2017, train2016; gc.collect();

#train[(train['date'].dt.year == 2016) | (train['date'].dt.year == 2017)]
#train = train[(train['date'].dt.year == 2017)]

#train = train[(train['date'].dt.month >= 5)]

# Transform target
target = train['unit_sales'].values
target[target < 0.] = 0.
train['unit_sales'] = np.log1p(target)
# Transforma transactions
tx = transactions['transactions'].values
transactions[tx < 0.] = 0.
transactions['transactions'] = np.log1p(tx)


def df_lbl_enc(df):
    for c in df.columns:
        if df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            df[c] = lbl.fit_transform(df[c])
            print(c)
    return df

def df_lbl_enc_2(df, cols):
    for c in cols:
        lbl = preprocessing.LabelEncoder()
        df[c] = lbl.fit_transform(df[c])
    return df

def df_transform_date(df):
    df['date'] = pd.to_datetime(df['date'])
    df['yea'] = df['date'].dt.year
    df['mon'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['date'] = df['date'].dt.dayofweek # Replace!!!
    df = df.fillna(-1) #TODO: WTF???
    return df

# Encode categorical variables
# a)
items = df_lbl_enc_2(items, ['family'])
stores = df_lbl_enc_2(stores, ['type','city','state'])

# b)
# items = pd.get_dummies(items, columns = ['family'] )
# stores = pd.get_dummies(stores, columns = ['type','city','state'] ) #TODO: encode 

items['perishable_w'] = items['perishable'].map({False:1.0, True:1.25}) # Weight for calculating the error metric

# Create national holidays field
holidays['national_holiday'] = False 
holidays.loc[
    lambda df: (df.type=='Holiday') & (df.locale=='National') & ~(df.transferred), 'national_holiday'
    ] = True
holidays.loc[
    lambda df: (df.type=='Transfer') & (df.locale=='National'), 'national_holiday'
    ] = True


# Merge dataframes
train = pd.merge(train, items, how='left', on=['item_nbr'])
train = pd.merge(train, transactions, how='left', on=['date','store_nbr'])
train = pd.merge(train, stores, how='left', on=['store_nbr'])
train = pd.merge(train, holidays[['date','national_holiday']], how='left', on=['date'])
train = pd.merge(train, oil, how='left', on=['date'])
train = df_transform_date(train)

test = pd.merge(test, items, how='left', on=['item_nbr'])
test = pd.merge(test, transactions, how='left', on=['date','store_nbr'])
test = pd.merge(test, stores, how='left', on=['store_nbr'])
test = pd.merge(test, holidays[['date','national_holiday']], how='left', on=['date'])
test = pd.merge(test, oil, how='left', on=['date'])
test = df_transform_date(test)

del items, transactions, stores, holidays, oil; gc.collect();
print_dataframe_size("train", train)
print_dataframe_size("test", test)
print_runtime(start_dp, time.time())                                                                                                                                                                                                                                                                       


### Predict future transactions 

# Error metric
def NWRMSLE(y, pred, w):
    return metrics.mean_squared_error(y, pred, sample_weight=w)**0.5

col = [c for c in train if c not in ['id', 'unit_sales','perishable_w','transactions']]
print(col)

x1 = train[(train['yea'] != 2017)]
x2 = train[(train['yea'] == 2017)]
#x1 = train[(train['mon'] != 8)]
#x2 = train[(train['mon'] == 8)]
y1 = x1['transactions'].values
y2 = x2['transactions'].values
del train; gc.collect();

# debug
x1.to_csv('../output/x1_v2.csv')
x2.to_csv('../output/x2_v2.csv')
#raise Exception('debug')


Reading datasets...
Reading train and test...
Reading others...
runtime: 0:00:01.622278
Datasets processing...
size of train: 1556.867 MB
size of test: 484.822 MB
runtime: 0:00:00.483482
['date', 'store_nbr', 'item_nbr', 'onpromotion', 'family', 'class', 'perishable', 'city', 'state', 'type', 'cluster', 'national_holiday', 'dcoilwtico', 'yea', 'mon', 'day']


In [13]:
x2.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable,perishable_w,transactions,city,state,type,cluster,national_holiday,dcoilwtico,yea,mon,day
9148889,123926072,1,1,37984,1.386294,0.0,12,1028,False,1.0,7.493317,18,12,3,13,-1,49.19,2017,8,1
9148890,123926073,1,1,38129,1.609438,0.0,5,2712,True,1.25,7.493317,18,12,3,13,-1,49.19,2017,8,1
9148891,123926074,1,1,40038,2.197225,0.0,12,1045,False,1.0,7.493317,18,12,3,13,-1,49.19,2017,8,1
9148892,123926075,1,1,40039,2.70805,0.0,12,1045,False,1.0,7.493317,18,12,3,13,-1,49.19,2017,8,1
9148893,123926076,1,1,40157,0.693147,0.0,12,1034,False,1.0,7.493317,18,12,3,13,-1,49.19,2017,8,1


In [2]:
[(c,x1[c].dtype) for c in x1.columns]

[('id', dtype('uint32')),
 ('date', dtype('int64')),
 ('store_nbr', dtype('int64')),
 ('item_nbr', dtype('uint16')),
 ('unit_sales', dtype('float64')),
 ('onpromotion', dtype('float64')),
 ('family', dtype('int64')),
 ('class', dtype('int64')),
 ('perishable', dtype('bool')),
 ('perishable_w', dtype('float64')),
 ('transactions', dtype('float64')),
 ('city', dtype('int64')),
 ('state', dtype('int64')),
 ('type', dtype('int64')),
 ('cluster', dtype('uint8')),
 ('national_holiday', dtype('O')),
 ('dcoilwtico', dtype('float64')),
 ('yea', dtype('int64')),
 ('mon', dtype('int64')),
 ('day', dtype('int64'))]

KeyError: "['family_AUTOMOTIVE' 'type_A'] not in index"

In [17]:
holidays = pd.read_csv('../input/holidays_events.csv', dtype={'transferred':bool}, parse_dates=['date'])

# Create national holidays field
holidays['national_holiday'] = False 
holidays.loc[
    lambda df: (df.type=='Holiday') & (df.locale=='National') & ~(df.transferred), 'national_holiday'
    ] = True
holidays.loc[
    lambda df: (df.type=='Transfer') & (df.locale=='National'), 'national_holiday'
    ] = True
[(c,holidays[c].dtype) for c in holidays.columns]

[('date', dtype('<M8[ns]')),
 ('type', dtype('O')),
 ('locale', dtype('O')),
 ('locale_name', dtype('O')),
 ('description', dtype('O')),
 ('transferred', dtype('bool')),
 ('national_holiday', dtype('bool'))]