In [7]:
import csv
import os
import pickle
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing

sys.setrecursionlimit(10000)
np.random.seed(42)

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation,Reshape
from keras.layers import Input, concatenate
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint


## Leitura dos arquivos csv e salvando em pickle

In [2]:
def csv2dicts(csvfile):
    """Convert an iterable of CSV rows into a list of dicts keyed by the header.

    The first row is taken as the header and echoed with ``print``. Every
    following row is zipped with it, so a row that is longer or shorter than
    the header is cut to the shorter of the two.

    Args:
        csvfile: iterable yielding rows as sequences of values, for example a
            ``csv.reader``.

    Returns:
        A list with one dict per data row; empty when there are no rows.
    """
    rows = iter(csvfile)
    try:
        header = next(rows)
    except StopIteration:
        # No header row at all: nothing to print and nothing to return.
        return []
    print(header)
    return [dict(zip(header, row)) for row in rows]


def set_nan_as_string(data, replace_str='0'):
    """Replace empty-string values in every record of ``data``, in place.

    Args:
        data: mutable sequence of dicts; each dict is modified directly.
        replace_str: value written wherever a field currently holds ``''``.

    Returns:
        None. ``data`` and its dicts are updated in place.
    """
    for position, record in enumerate(data):
        empty_fields = [name for name in record if record[name] == '']
        record.update(dict.fromkeys(empty_fields, replace_str))
        data[position] = record

- Artigo: [Cheng Guo, Felix Berkhahn, "Entity Embeddings of Categorical Variables", arXiv:1604.06737, 2016](https://arxiv.org/abs/1604.06737)
- Dados (Kaggle, Rossmann Store Sales): https://www.kaggle.com/c/rossmann-store-sales/data
    

In [3]:
# Directory holding the Rossmann CSV files. Set the ROSSMANN_DATA_DIR
# environment variable to point somewhere else; the fallback is the folder
# this notebook was originally written against.
datapath = os.environ.get('ROSSMANN_DATA_DIR', '/Users/robertoalotufo/mylocaldatasets/')
train_data_file = os.path.join(datapath, "rossmann_train.csv")
store_data_file = os.path.join(datapath, "rossmann_store.csv")
store_states_file = os.path.join(datapath, 'rossmann_store_states.csv')

In [4]:
# Load the training table into a DataFrame and preview its first rows.
train_csv = pd.read_csv(filepath_or_buffer=train_data_file)
train_csv.head(n=5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [5]:
# Load the per-store attribute table into a DataFrame and preview its first rows.
store_csv = pd.read_csv(filepath_or_buffer=store_data_file)
store_csv.head(n=5)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [6]:
# Load the store-to-state table into a DataFrame and preview its first rows.
states_csv = pd.read_csv(filepath_or_buffer=store_states_file)
states_csv.head(n=5)

Unnamed: 0,Store,State
0,1,HE
1,2,TH
2,3,NW
3,4,BE
4,5,SN


In [8]:
# Read the training CSV into a list of dicts (one per row, all values strings).
# newline='' leaves newline handling to the csv module, as its documentation
# asks for any file object handed to csv.reader.
with open(train_data_file, newline='') as csvfile:
    data = csv.reader(csvfile, delimiter=',')
    train_data = csv2dicts(data)
num_records = len(train_data)
print(num_records)

['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
1017209


In [9]:
# Show the record type and the first three parsed records, one item per line.
print(type(train_data[0]), train_data[:3], sep='\n')

<class 'dict'>
[{'Store': '1', 'DayOfWeek': '5', 'Date': '2015-07-31', 'Sales': '5263', 'Customers': '555', 'Open': '1', 'Promo': '1', 'StateHoliday': '0', 'SchoolHoliday': '1'}, {'Store': '2', 'DayOfWeek': '5', 'Date': '2015-07-31', 'Sales': '6064', 'Customers': '625', 'Open': '1', 'Promo': '1', 'StateHoliday': '0', 'SchoolHoliday': '1'}, {'Store': '3', 'DayOfWeek': '5', 'Date': '2015-07-31', 'Sales': '8314', 'Customers': '821', 'Open': '1', 'Promo': '1', 'StateHoliday': '0', 'SchoolHoliday': '1'}]


In [11]:
# Read the store attributes and the store -> state table, then attach each
# store's state to its record.
with open(store_data_file, newline='') as csvfile, open(store_states_file, newline='') as csvfile2:
    store_data_in = csv.reader(csvfile, delimiter=',')
    state_data_in = csv.reader(csvfile2, delimiter=',')
    store_data = csv2dicts(store_data_in)
    state_data = csv2dicts(state_data_in)

# Blank store attributes become the string '0'.
set_nan_as_string(store_data)

# Join on the 'Store' key instead of on list position, so the result does not
# depend on both files listing the stores in the same order.
state_by_store = {row['Store']: row['State'] for row in state_data}
for store in store_data:
    store['State'] = state_by_store[store['Store']]
print(store_data[:2])

['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
['Store', 'State']
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'TH'}]


## Preparando as features

In [12]:
def feature_list(record):
    """Turn one raw training record into a feature row.

    Args:
        record: dict of strings with the keys 'Date' (``YYYY-MM-DD``),
            'Store', 'DayOfWeek', 'Promo' and, optionally, 'Open'.

    Returns:
        list ``[open, store, day_of_week, promo, year, month, day, state]``.
        The first seven entries are ints; the last is the 'State' value read
        from the module-level ``store_data``.

    Raises:
        ValueError: if 'Date', 'Store', 'DayOfWeek' or 'Promo' cannot be parsed.
        KeyError: if one of those four keys is missing.
    """
    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    day_of_week = int(record['DayOfWeek'])
    try:
        store_open = int(record['Open'])
    except (KeyError, TypeError, ValueError):
        # A missing, None or non-numeric 'Open' falls back to "open". Only
        # these parse failures are absorbed; anything else propagates.
        store_open = 1

    promo = int(record['Promo'])

    # store_data is looked up by position, which relies on stores being
    # numbered 1..N in list order.
    state = store_data[store_index - 1]['State']

    return [store_open,
            store_index,
            day_of_week,
            promo,
            dt.year,
            dt.month,
            dt.day,
            state,
            ]

In [13]:
# Build the feature rows and sales targets, keeping only records that have
# non-zero sales and a non-blank 'Open' field.
train_data_X, train_data_y = [], []

for record in train_data:
    if record['Sales'] == '0' or record['Open'] == '':
        continue
    train_data_X.append(feature_list(record))
    train_data_y.append(int(record['Sales']))
print("Number of train datapoints: ", len(train_data_y))

print(min(train_data_y), max(train_data_y))

Number of train datapoints:  844338
46 41551


In [14]:
# Peek at the first five feature rows as built by feature_list.
print(train_data_X[:5])

[[1, 1, 5, 1, 2015, 7, 31, 'HE'], [1, 2, 5, 1, 2015, 7, 31, 'TH'], [1, 3, 5, 1, 2015, 7, 31, 'NW'], [1, 4, 5, 1, 2015, 7, 31, 'BE'], [1, 5, 5, 1, 2015, 7, 31, 'SN']]


In [15]:
# Label-encode every feature column so each one holds integer category ids.
# full_X keeps an untouched copy of the raw values; the encoders are fitted on it.
# NOTE(review): the rows mix ints with the state string, so NumPy presumably
# stores every cell as text here, hence the final astype(int). Confirm.
full_X = np.array(train_data_X)
train_data_X = np.array(train_data_X)
n_features = train_data_X.shape[1]
for col in range(n_features):
    encoder = preprocessing.LabelEncoder().fit(full_X[:, col])
    train_data_X[:, col] = encoder.transform(train_data_X[:, col])

train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)



In [16]:
# Per-column minimum and maximum of the encoded features.
print(train_data_X.min(axis=0))
print(train_data_X.max(axis=0))

[0 0 0 0 0 0 0 0]
[   0 1114    6    1    2   11   30   11]


## Modelo

## Carregando os dados

In [17]:
# Switches read by the cells below.
shuffle_data = False          # permute the rows of X and y before splitting
one_hot_as_input = False      # replace X by its one-hot encoding
embeddings_as_input = False   # replace X by features from previously saved embeddings
save_embeddings = False       # write the embedding matrices to saved_embeddings_fname
save_models = False           # pickle the trained model(s)
saved_embeddings_fname = "../data/rossmann_embeddings.pickle"  # set save_embeddings to True to create this file

In [20]:
# Use the features and targets built earlier in this notebook.
(X, y) = (train_data_X, train_data_y)

if shuffle_data:
    print("Using shuffled data")
    # NumPy is imported as `np` in this notebook; the bare name `numpy` is
    # not defined and used to raise NameError here.
    sh = np.arange(X.shape[0])
    np.random.shuffle(sh)
    X = X[sh]
    y = y[sh]

if embeddings_as_input:
    print("Using learned embeddings as input")
    # NOTE(review): embed_features is not defined in the visible part of this
    # notebook; define or import it before enabling embeddings_as_input.
    X = embed_features(X, saved_embeddings_fname)

if one_hot_as_input:
    print("Using one-hot encoding as input")
    # OneHotEncoder is reached through the already-imported
    # sklearn.preprocessing module; it was never imported by bare name.
    enc = preprocessing.OneHotEncoder(sparse=False)
    enc.fit(X)
    X = enc.transform(X)

## Normalizando os valores de vendas

$$ Sales_n = \frac{\log(Sales)}{\max(\log(Sales))} $$
$$ Sales = \exp\left(Sales_n \cdot \max(\log(Sales))\right) $$

In [21]:
class log_norm():
    """Scale values by their natural log divided by the largest log seen at construction.

    Attributes:
        MaxLog: maximum of ``log(y)`` over the values passed to the constructor.
    """

    def __init__(self, y):
        """Record the largest natural log among ``y``."""
        logs = np.log(y)
        self.MaxLog = logs.max()

    def lognorm(self, val):
        """Return ``log(val) / MaxLog``."""
        scaled = np.log(val) / self.MaxLog
        return scaled

    def lognorm_back(self, val):
        """Invert ``lognorm``: return ``exp(val * MaxLog)``."""
        restored = np.exp(val * self.MaxLog)
        return restored

# Fit the normaliser on the raw sales and build the normalised target.
classnorm = log_norm(y)
y_norm = classnorm.lognorm(y)
# Round-trip check: undoing the normalisation must give back the original sales.
y_back = classnorm.lognorm_back(y_norm)
assert np.allclose(y_back, y), "log normalisation does not round-trip"


## Dividindo em Dados de treinamento e de validação

In [22]:
# Split by position: the first 90% of the rows train the model and the
# remaining rows are held out for validation. Targets are the normalised sales.
train_ratio = 0.9
num_records = len(X)
train_size = int(train_ratio * num_records)

X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y_norm[:train_size], y_norm[train_size:]
print('Shapes:',X_train.shape,y_train.shape,X_val.shape,y_val.shape)
print('X_train[:2]\n',X_train[:2])
print('y_train[:2]\n',y_train[:2])

Shapes: (759904, 8) (759904,) (84434, 8) (84434,)
X_train[:2]
 [[  0   0   4   1   2  11  17   6]
 [  0 258   4   1   2  11  17   3]]
y_train[:2]
 [ 0.80570915  0.81903052]


In [23]:
def sample(X, y, n):
    """Draw ``n`` row indices uniformly at random and return the matching rows.

    Indices come from ``np.random.randint``, so the same row can be picked
    more than once.

    Args:
        X: 2-D array of features.
        y: array of targets with one entry per row of ``X``.
        n: number of rows to draw.

    Returns:
        Tuple ``(X_sample, y_sample)`` selected with the same indices.
    """
    picks = np.random.randint(X.shape[0], size=n)
    return X[picks, :], y[picks]

# Replace the training split by 200000 rows drawn from it at random.
# Re-running this cell resamples from the already-sampled set.
X_train, y_train = sample(X_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: ",y_train.shape[0])

Number of samples used for training:  200000


## Criando o modelo

### Modelo com vários embeddings


|Característica|tipo do dado|tamanho|dimensão embedding|
|--------------|------------|-------|------------------|
| loja | nominal | 1115 | 10 |
| dia da semana | ordinal | 7 | 6 |
| promoção | binário | 2 | 1 (camada densa, sem embedding) |
| ano | ordinal | 3 (2013-2015) | 2 |
| mês | ordinal | 12 | 6 |
| dia | ordinal | 31 | 10 |
| estado | nominal | 12 | 6 |
 


In [24]:
def build_keras_model():
    """Build the multi-input entity-embedding regression network.

    Seven single-value inputs are created. Store, day of week, year, month,
    day and state each go through their own Embedding layer followed by a
    Reshape that drops the length-1 sequence axis. The promo input goes
    through a 1-unit Dense layer instead. The seven branches are concatenated
    and fed to Dense(1000, relu) -> Dense(500, relu) -> Dense(1, sigmoid).

    Returns:
        An uncompiled keras ``Model`` whose inputs are, in order: store,
        day of week, promo, year, month, day, state.
    """
    # Store: 1115 categories -> 10-dimensional embedding.
    store_in = Input(shape=(1,), dtype='int64', name='store_in')
    x = Embedding(1115, 10, input_length=1, name='store_embedding')(store_in)
    store_emb = Reshape(target_shape=(10,), name='store_reshape')(x)

    # Day of week: 7 categories -> 6 dimensions.
    # NOTE(review): the input is named 'down_in'; it looks like a typo for
    # 'dow_in', but renaming it would change the layer name.
    dow_in = Input(shape=(1,), dtype='int64', name='down_in')
    x = Embedding(7, 6, input_length=1, name='dow_embedding')(dow_in)
    dow_emb = Reshape(target_shape=(6,), name='dow_reshape')(x)

    # Promo flag: no embedding, just a single dense unit.
    promo_in = Input(shape=(1,), name='promo_in')
    promo_out = Dense(1, input_dim=1)(promo_in)

    # Year: 3 categories -> 2 dimensions.
    year_in = Input(shape=(1,), dtype='int64', name='year_in')
    x = Embedding(3, 2, input_length=1, name='year_embedding')(year_in)
    year_emb = Reshape(target_shape=(2,))(x)

    # Month: 12 categories -> 6 dimensions.
    month_in = Input(shape=(1,), dtype='int64', name='month_in')
    x = Embedding(12, 6, input_length=1, name='month_embedding')(month_in)
    month_emb = Reshape(target_shape=(6,))(x)

    # Day of month: 31 categories -> 10 dimensions.
    day_in = Input(shape=(1,), dtype='int64', name='day_in')
    x = Embedding(31, 10, input_length=1, name='day_embedding')(day_in)
    day_emb = Reshape(target_shape=(10,))(x)

    # State: 12 categories -> 6 dimensions.
    germanstate_in = Input(shape=(1,), dtype='int64', name='germanstate_in')
    x = Embedding(12, 6, input_length=1, name='germanstate_embedding')(germanstate_in)
    germanstate_emb = Reshape(target_shape=(6,))(x)

    # Join all branches and regress a single value through two hidden layers.
    xin = concatenate([store_emb, dow_emb, promo_out, year_emb, month_emb, day_emb, germanstate_emb])
    x = Dense(1000, kernel_initializer='uniform', activation='relu')(xin)
    x = Dense(500, kernel_initializer='uniform', activation='relu')(x)
    # Sigmoid keeps the prediction between 0 and 1.
    x_out = Dense(1, activation='sigmoid')(x)
    
    return Model([store_in,dow_in,promo_in,year_in,month_in,day_in,germanstate_in], x_out)

model_rs = build_keras_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model_rs.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
store_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
down_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
year_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
month_in (InputLayer)            (None, 1)             0                                            
___________________________________________________________________________________________

### Treinando

In [25]:
def split_features(X):
    """Split the feature matrix into the seven single-column model inputs.

    Column 0 is skipped. Columns 1 to 7 are returned in order, each keeping
    a trailing axis of length 1.

    Args:
        X: array whose last axis has at least 8 entries.

    Returns:
        list of 7 arrays, one per column from 1 to 7.
    """
    return [X[..., [column]] for column in range(1, 8)]
 
# Turn each split into the list of per-feature inputs the model expects.
X_train_p = split_features(X_train)
X_val_p = split_features(X_val)
print(X_train.shape, X_val.shape)
# Column 3 is the promo flag; count how many training rows fall in each value.
print('Número de promoções (sem promoção, com promoção):',np.bincount(X_train[:,3]))

(200000, 8) (84434, 8)
Número de promoções (sem promoção, com promoção): [110508  89492]


In [26]:
# Train for a single epoch with mean absolute error on the normalised sales
# target; the validation split is passed so it is scored alongside training.
model_rs.compile(loss='mean_absolute_error', optimizer='adam')
model_rs.fit(X_train_p, y_train,
             validation_data=(X_val_p, y_val),
             epochs=1, 
             batch_size=128,
             shuffle=False,
             # callbacks=[self.checkpointer],
             )

Train on 200000 samples, validate on 84434 samples
Epoch 1/1


<keras.callbacks.History at 0x147ac07b8>

### Avaliando

In [27]:
def guess(model,features):
    """Predict with ``model`` on a raw feature matrix.

    Args:
        model: object exposing ``predict`` that accepts the output of
            ``split_features``.
        features: feature matrix in the layout ``split_features`` expects.

    Returns:
        The model predictions flattened to one dimension.
    """
    return model.predict(split_features(features)).flatten()

## Avaliando o modelo

In [28]:
def evaluate_model(model, X, y):
    """Mean relative error between predicted and actual values after de-normalising.

    Predictions and targets are both mapped back with the module-level
    ``classnorm.lognorm_back`` before the error is computed.

    Args:
        model: model passed on to ``guess``.
        X: feature matrix passed on to ``guess``.
        y: normalised targets; every entry must be greater than 0.

    Returns:
        Mean of ``|actual - predicted| / actual`` over all entries.

    Raises:
        AssertionError: if the smallest target is not greater than 0.
    """
    assert(min(y) > 0)
    # The predictions are wrapped in a one-row array and averaged over that
    # axis, which leaves the single set of predictions unchanged.
    predictions = np.array([guess(model,X)])
    predicted_sales = classnorm.lognorm_back(predictions.mean(axis=0))
    actual_sales = classnorm.lognorm_back(y)
    relative_err = np.absolute((actual_sales - predicted_sales) / actual_sales)
    return np.sum(relative_err) / len(actual_sales)

# Report the mean relative error on the sampled training rows and on the
# held-out validation rows.
print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_model(model_rs, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_model(model_rs, X_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
0.111074412279
Validation error...
0.143284377634


In [None]:
if save_embeddings:
    # Fetch each embedding matrix by layer name rather than by its position
    # in get_weights(), so the result does not depend on weight ordering.
    # `models` was never defined in this notebook; the trained model is model_rs.
    embedding_layers = ['store_embedding', 'dow_embedding', 'year_embedding',
                        'month_embedding', 'day_embedding', 'germanstate_embedding']
    embeddings = [model_rs.get_layer(layer_name).get_weights()[0]
                  for layer_name in embedding_layers]
    # Saved order: store, day of week, year, month, day, state.
    with open(saved_embeddings_fname, 'wb') as f:
        pickle.dump(embeddings, f, -1)

if save_models:
    # A one-element list is written so the file still holds a list of models.
    # NOTE(review): pickling a Keras model may not be supported by every Keras
    # version; consider model_rs.save(...) if this fails.
    with open('../data/rossmann_models.pickle', 'wb') as f:
        pickle.dump([model_rs], f)