In [10]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, metrics
import gc; gc.enable()
import random
import time, datetime

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import TheilSenRegressor, BayesianRidge

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor

np.random.seed(1122)


# Util functions
def print_runtime(start, end):
    print("runtime: {}".format( datetime.timedelta(seconds=(end-start)/60)))

def print_dataframe_size(name, df):
    print("size of {}: {:.3f} MB".format(name, df.memory_usage(index=True).sum()/1E6))


# Read datasets
print('Reading datasets...')
start = time.time()

dtypes = {'id':'int64', 'item_nbr':'int32', 'store_nbr':'int8', 'onpromotion':bool}

train = pd.read_csv('../input/train2017.csv', dtype=dtypes, parse_dates=['date']) #TODO: 2017
test = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['date'])
items = pd.read_csv('../input/items.csv', dtype={'perishable':bool})
stores = pd.read_csv('../input/stores.csv')
transactions = pd.read_csv('../input/transactions.csv', parse_dates=['date'])
holidays = pd.read_csv('../input/holidays_events.csv', dtype={'transferred':bool}, parse_dates=['date'])
oil = pd.read_csv('../input/oil.csv', parse_dates=['date'])

## Reduce training dataset
#train2017 = train[(train['date'].dt.year == 2017)]
#train201608 = train[(train['date'].dt.year == 2016) & (train['date'].dt.day > 15)]
#train2016 = train[(train['date'].dt.year == 2016))]
#train = pd.concat([train2017,train2016])
#del train2017, train2016; gc.collect();

#train[(train['date'].dt.year == 2016) | (train['date'].dt.year == 2017)]
#train = train[(train['date'].dt.year == 2017)]

train = train[(train['date'].dt.month >= 5)]

print_runtime(start, time.time())

# Dataset processing
print('Datasets processing...')

# Transform target
target = train['unit_sales'].values
target[target < 0.] = 0.
train['unit_sales'] = np.log1p(target)

def df_lbl_enc(df):
    for c in df.columns:
        if df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            df[c] = lbl.fit_transform(df[c])
            print(c)
    return df

def df_transform(df):
    #df['date'] = pd.to_datetime(df['date'])
    df['yea'] = df['date'].dt.year
    df['mon'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['date'] = df['date'].dt.dayofweek # Replace!!!
    df = df.fillna(-1)
    return df


items = pd.get_dummies(items, columns = ['family'] )
items['perishable_w'] = items['perishable'].map({False:1.0, True:1.25})

stores = pd.get_dummies(stores, columns = ['type'] ) #TODO: encode 

# Create national holidays field
holidays['national_holiday'] = False 
holidays.loc[
    lambda df: (df.type=='Holiday') & (df.locale=='National') & ~(df.transferred), 'national_holiday'
    ] = True
holidays.loc[
    lambda df: (df.type=='Transfer') & (df.locale=='National'), 'national_holiday'
    ] = True


# Merge dataframes
train = pd.merge(train, items, how='left', on=['item_nbr'])
train = pd.merge(train, transactions, how='left', on=['date','store_nbr'])
train = pd.merge(train, stores, how='left', on=['store_nbr'])
train = pd.merge(train, holidays[['date','national_holiday']], how='left', on=['date'])
train = pd.merge(train, oil, how='left', on=['date'])
train = df_transform(train)

test = pd.merge(test, items, how='left', on=['item_nbr'])
test = pd.merge(test, transactions, how='left', on=['date','store_nbr'])
test = pd.merge(test, stores, how='left', on=['store_nbr'])
test = pd.merge(test, holidays[['date','national_holiday']], how='left', on=['date'])
test = pd.merge(test, oil, how='left', on=['date'])
test = df_transform(test)

del items, transactions, stores, holidays, oil; gc.collect();
print_dataframe_size("train", train)
print_dataframe_size("test", test)
print_runtime(start,time.time())                                                                                                                                                                                                                                                                        


### Predict future transactions 

# Error metric
def NWRMSLE(y, pred, w):
    return metrics.mean_squared_error(y, pred, sample_weight=w)**0.5

col = [c for c in train if c not in ['id', 'unit_sales','perishable_w','transactions']]

#x1 = train[(train['yea'] != 2017)]
#x2 = train[(train['yea'] == 2017)]
x1 = train[(train['mon'] != 8)]
x2 = train[(train['yea'] == 8)]
del train; gc.collect();

y1 = x1['transactions'].values
y2 = x2['transactions'].values


Reading datasets...
runtime: 0:00:00.324350
Datasets processing...
size of train: 1923.041 MB
size of test: 529.163 MB
runtime: 0:00:00.721560


In [11]:
[(x,x1[x].dtype) for x in x1.columns]

[('id', dtype('int64')),
 ('date', dtype('int64')),
 ('store_nbr', dtype('int8')),
 ('item_nbr', dtype('int32')),
 ('unit_sales', dtype('float64')),
 ('onpromotion', dtype('bool')),
 ('class', dtype('int64')),
 ('perishable', dtype('bool')),
 ('family_AUTOMOTIVE', dtype('uint8')),
 ('family_BABY CARE', dtype('uint8')),
 ('family_BEAUTY', dtype('uint8')),
 ('family_BEVERAGES', dtype('uint8')),
 ('family_BOOKS', dtype('uint8')),
 ('family_BREAD/BAKERY', dtype('uint8')),
 ('family_CELEBRATION', dtype('uint8')),
 ('family_CLEANING', dtype('uint8')),
 ('family_DAIRY', dtype('uint8')),
 ('family_DELI', dtype('uint8')),
 ('family_EGGS', dtype('uint8')),
 ('family_FROZEN FOODS', dtype('uint8')),
 ('family_GROCERY I', dtype('uint8')),
 ('family_GROCERY II', dtype('uint8')),
 ('family_HARDWARE', dtype('uint8')),
 ('family_HOME AND KITCHEN I', dtype('uint8')),
 ('family_HOME AND KITCHEN II', dtype('uint8')),
 ('family_HOME APPLIANCES', dtype('uint8')),
 ('family_HOME CARE', dtype('uint8')),
 ('fa

In [13]:
x1[x1.onpromotion][['id',
 'date',
 'store_nbr',
 'item_nbr',
 'unit_sales',
 'onpromotion',
 'class',
 'perishable',
 'family_AUTOMOTIVE',
 'perishable_w',
 'transactions',
 'city',
 'state',
 'cluster',
 'type_A',
 'national_holiday',
 'dcoilwtico',
 'yea',
 'mon',
 'day']].head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,class,perishable,family_AUTOMOTIVE,perishable_w,transactions,city,state,cluster,type_A,national_holiday,dcoilwtico,yea,mon,day
16,114176266,0,1,119026,2.833213,True,3026,False,0,1.0,481,Quito,Pichincha,13,0,True,48.83,2017,5,1
24,114176274,0,1,129758,0.693147,True,1032,False,0,1.0,481,Quito,Pichincha,13,0,True,48.83,2017,5,1
25,114176275,0,1,153078,0.693147,True,3008,False,0,1.0,481,Quito,Pichincha,13,0,True,48.83,2017,5,1
46,114176296,0,1,173111,1.098612,True,1058,False,0,1.0,481,Quito,Pichincha,13,0,True,48.83,2017,5,1
51,114176301,0,1,207857,0.693147,True,1010,False,0,1.0,481,Quito,Pichincha,13,0,True,48.83,2017,5,1


In [17]:
holidays = pd.read_csv('../input/holidays_events.csv', dtype={'transferred':bool}, parse_dates=['date'])

# Create national holidays field
holidays['national_holiday'] = False 
holidays.loc[
    lambda df: (df.type=='Holiday') & (df.locale=='National') & ~(df.transferred), 'national_holiday'
    ] = True
holidays.loc[
    lambda df: (df.type=='Transfer') & (df.locale=='National'), 'national_holiday'
    ] = True
[(c,holidays[c].dtype) for c in holidays.columns]

[('date', dtype('<M8[ns]')),
 ('type', dtype('O')),
 ('locale', dtype('O')),
 ('locale_name', dtype('O')),
 ('description', dtype('O')),
 ('transferred', dtype('bool')),
 ('national_holiday', dtype('bool'))]