In [1]:
import pickle
import csv
from datetime import datetime
import sklearn
from sklearn import preprocessing
import numpy as np

import sys
sys.setrecursionlimit(10000)
np.random.seed(42)

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation,Reshape
from keras.layers import Input, concatenate
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


## Leitura dos arquivos csv e salvando em pickle

In [2]:
def csv2dicts(csvfile):
    """Convert an iterable of CSV rows into a list of dicts keyed by the header.

    The first row is treated as the header: it is printed and is not part of
    the result. Every following row is zipped with the header, so cells
    without a matching header name (or header names without a cell) are
    dropped for that row.

    Args:
        csvfile: iterable yielding rows (sequences of cell values), e.g. a
            ``csv.reader``.

    Returns:
        list of dicts, one per data row, in input order. Empty when the input
        has no rows or only a header.
    """
    rows = iter(csvfile)
    try:
        header = next(rows)
    except StopIteration:
        # No rows at all: nothing to print and nothing to return.
        return []
    print(header)
    return [dict(zip(header, cells)) for cells in rows]


def set_nan_as_string(data, replace_str='0'):
    """Replace every empty-string value in a list of dicts, in place.

    Args:
        data: indexable sequence of dicts; each dict is modified directly.
        replace_str: value written wherever a dict value equals ``''``.

    Returns:
        None. The change is visible through ``data`` itself.
    """
    for position, record in enumerate(data):
        # Collect the blank keys first, then overwrite them.
        blank_keys = [key for key, value in record.items() if value == '']
        for key in blank_keys:
            record[key] = replace_str
        # Same object goes back into the same slot, as before.
        data[position] = record

Fonte dos dados (Kaggle, Rossmann Store Sales): https://www.kaggle.com/c/rossmann-store-sales/data
    

In [3]:
import os

# Directory holding the raw Rossmann CSV files. Set the ROSSMANN_DATA_DIR
# environment variable to point the notebook at another location without
# editing it; the fallback is the original author-specific directory.
datapath = os.environ.get('ROSSMANN_DATA_DIR',
                          '/Users/robertoalotufo/mylocaldatasets/')

# os.path.join tolerates a directory given with or without a trailing
# separator, which plain string concatenation does not.
train_data = os.path.join(datapath, "rossmann_train.csv")
store_data = os.path.join(datapath, "rossmann_store.csv")
store_states = os.path.join(datapath, 'rossmann_store_states.csv')

In [4]:
# Parse the training CSV into a list of dicts and cache it as a pickle.
# newline='' is how the csv module expects its input file to be opened, so
# that newlines embedded in quoted fields are handled by the reader itself.
with open(train_data, newline='') as csvfile:
    data = csv2dicts(csv.reader(csvfile, delimiter=','))

# Reverse the row order of the file before caching.
data = data[::-1]

# Open the output only once the data is ready, so a parsing failure does not
# leave an empty pickle behind.
with open('../data/rossmann_train_data.pickle', 'wb') as f:
    pickle.dump(data, f, -1)
print(data[:3])

['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1113', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}]


In [5]:
# Parse the store table and the store -> state table (newline='' as the csv
# module requires for files handed to csv.reader).
with open(store_data, newline='') as csvfile, open(store_states, newline='') as csvfile2:
    data = csv2dicts(csv.reader(csvfile, delimiter=','))
    state_data = csv2dicts(csv.reader(csvfile2, delimiter=','))

# Missing values arrive as '' from the CSV; normalise them to '0'.
set_nan_as_string(data)

# Attach each store's state by joining on the 'Store' id rather than on list
# position, so the result does not depend on both files sharing a row order.
# A store with no entry in the states file raises KeyError.
state_by_store = {row['Store']: row['State'] for row in state_data}
for val in data:
    val['State'] = state_by_store[val['Store']]

# Write only after the merged table is complete, so a failure above does not
# leave an empty pickle behind.
with open('../data/rossmann_store_data.pickle', 'wb') as f:
    pickle.dump(data, f, -1)
print(data[:2])

['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
['Store', 'State']
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'TH'}]


## Preparando as features

In [6]:
# Reload the cached tables written by the cells above. Note that these names
# previously held the CSV paths and now hold the parsed records.
with open('../data/rossmann_train_data.pickle', 'rb') as f:
    train_data = pickle.load(f)
num_records = len(train_data)

with open('../data/rossmann_store_data.pickle', 'rb') as f:
    store_data = pickle.load(f)

In [7]:
def feature_list(record, stores=None):
    """Build the raw feature vector for one sales record.

    Args:
        record: dict with string values for 'Date' (``YYYY-MM-DD``), 'Store',
            'DayOfWeek', 'Promo' and, optionally, 'Open'.
        stores: sequence of store dicts carrying a 'State' entry, indexed by
            ``store id - 1``. Defaults to the module-level ``store_data``.

    Returns:
        list ``[store_open, store_index, day_of_week, promo, year, month,
        day, state]``; every entry is an int except ``state``, which is
        whatever is stored under 'State'.

    Raises:
        ValueError / KeyError: if a mandatory field is missing or malformed.
        IndexError: if the store id has no entry in ``stores``.
    """
    if stores is None:
        stores = store_data

    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    day_of_week = int(record['DayOfWeek'])

    # A missing or unparsable 'Open' flag is treated as "open". Only the
    # errors that int()/dict lookup can raise here are caught, so unrelated
    # failures (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        store_open = int(record['Open'])
    except (KeyError, ValueError, TypeError):
        store_open = 1

    promo = int(record['Promo'])

    return [store_open,
            store_index,
            day_of_week,
            promo,
            dt.year,
            dt.month,
            dt.day,
            # NOTE(review): positional lookup assumes stores[k] describes
            # store id k + 1 - confirm against how the store table is built.
            stores[store_index - 1]['State']
            ]

In [8]:
# Keep only records with non-zero sales and a non-empty 'Open' field, then
# derive the feature rows and the integer sales targets from them.
usable_records = [
    record for record in train_data
    if not (record['Sales'] == '0' or record['Open'] == '')
]
train_data_X = [feature_list(record) for record in usable_records]
train_data_y = [int(record['Sales']) for record in usable_records]

print("Number of train datapoints: ", len(train_data_y))

print(min(train_data_y), max(train_data_y))

Number of train datapoints:  844338
46 41551


In [9]:
# The feature rows mix ints with the state string, so the array built from
# them is handled column by column with one LabelEncoder per column.
full_X = np.array(train_data_X)

les = []
encoded_columns = []
for column in full_X.T:
    encoder = preprocessing.LabelEncoder().fit(column)
    les.append(encoder)
    encoded_columns.append(encoder.transform(column))

# Persist the fitted encoders, one per feature column, in column order.
with open('../data/rossmann_les.pickle', 'wb') as f:
    pickle.dump(les, f, -1)

train_data_X = np.column_stack(encoded_columns).astype(int)
train_data_y = np.array(train_data_y)

# Cache the encoded design matrix together with the targets.
with open('../data/rossmann_feature_train_data.pickle', 'wb') as f:
    pickle.dump((train_data_X, train_data_y), f, -1)
print(train_data_X[0], train_data_y[0])

[  0 109   1   0   0   0   0   7] 5961


## Modelo

In [10]:
def split_features(X):
    """Split a feature matrix into one single-column array per model input.

    Columns 1..7 of the last axis are returned, in order: store index, day of
    week, promo, year, month, day, state. Column 0 is not returned.

    Args:
        X: array whose last axis has at least 8 entries.

    Returns:
        list of 7 arrays; each keeps X's leading dimensions and has a last
        axis of length 1.
    """
    feature_columns = (1, 2, 3, 4, 5, 6, 7)
    # Indexing with a one-element list keeps the trailing axis.
    return [X[..., [column]] for column in feature_columns]


## Treinando e Testando o Modelo

#### Carregando os dados

In [11]:
# Run configuration for the load / train / save cells below.
# Fraction of the records used for training; the remainder is validation.
train_ratio = 0.9
# When True, rows are randomly permuted before the train/validation split.
shuffle_data = False
# When True, the feature matrix is replaced by its one-hot encoding.
one_hot_as_input = False
# When True, features are replaced by embeddings read from saved_embeddings_fname.
embeddings_as_input = False
# When True, the trained embedding matrices are pickled to saved_embeddings_fname.
save_embeddings = False
# When True, the trained model(s) are pickled to disk.
save_models = False
saved_embeddings_fname = "../data/rossmann_embeddings.pickle"  # set save_embeddings to True to create this file

In [12]:
# Load the encoded features and targets; the context manager closes the file
# handle, which the bare open() call never did.
with open('../data/rossmann_feature_train_data.pickle', 'rb') as f:
    (X, y) = pickle.load(f)

num_records = len(X)
train_size = int(train_ratio * num_records)

if shuffle_data:
    print("Using shuffled data")
    # numpy is imported as `np` in this notebook; the bare name `numpy`
    # would raise NameError as soon as this branch ran.
    sh = np.arange(X.shape[0])
    np.random.shuffle(sh)
    X = X[sh]
    y = y[sh]

if embeddings_as_input:
    print("Using learned embeddings as input")
    # NOTE(review): embed_features is not defined anywhere in the visible
    # notebook - it must be provided before enabling embeddings_as_input.
    X = embed_features(X, saved_embeddings_fname)

if one_hot_as_input:
    print("Using one-hot encoding as input")
    # OneHotEncoder is reached through the already imported
    # sklearn.preprocessing module; the bare name was never imported.
    enc = preprocessing.OneHotEncoder(sparse=False)
    enc.fit(X)
    X = enc.transform(X)

# Split by position: the first train_size rows train, the rest validate.
X_train = X[:train_size]
X_val   = X[train_size:]
y_train = y[:train_size]
y_val   = y[train_size:]

In [13]:
def sample(X, y, n):
    """Draw n random rows of X together with the matching entries of y.

    Row indices come from ``np.random.randint``, so they are drawn
    independently and the same row may be picked more than once.

    Args:
        X: 2-D array of features.
        y: array of targets, indexable by the same row indices as X.
        n: number of rows to draw.

    Returns:
        tuple ``(X_sampled, y_sampled)``.
    """
    picked = np.random.randint(X.shape[0], size=n)
    features, targets = X[picked, :], y[picked]
    return features, targets

# Draw from the untouched training slice of X / y instead of from X_train
# itself, so re-running this cell does not sample from a set that an earlier
# run had already reduced.
X_train, y_train = sample(X[:train_size], y[:train_size], 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))

Number of samples used for training: 200000


## Criando o modelo

### Modelo com vários embeddings

In [14]:
def build_keras_model():
    """Assemble the entity-embedding network for sales prediction.

    Six categorical inputs (store, day of week, year, month, day, state) each
    go through their own Embedding layer; promo goes through a 1-unit Dense
    layer. The seven resulting vectors are concatenated and passed through
    Dense(1000, relu) -> Dense(500, relu) -> Dense(1, sigmoid).

    Returns:
        Uncompiled keras ``Model`` taking inputs in the order
        [store, day of week, promo, year, month, day, state].
    """
    def embedded_input(input_name, embedding_name, vocab_size, emb_dim):
        """Create an int64 scalar input and its flattened embedding."""
        source = Input(shape=(1,), dtype='int64', name=input_name)
        looked_up = Embedding(vocab_size, emb_dim, input_length=1,
                              name=embedding_name)(source)
        return source, Reshape(target_shape=(emb_dim,))(looked_up)

    # Layers are created in the same sequence as the model's input list.
    store_in, store_emb = embedded_input('store_in', 'store_embedding', 1115, 10)
    dow_in, dow_emb = embedded_input('down_in', 'dow_embedding', 7, 6)

    # Promo is not embedded: it passes through a single linear unit.
    promo_in = Input(shape=(1,), name='promo_in')
    promo_out = Dense(1, input_dim=1)(promo_in)

    year_in, year_emb = embedded_input('year_in', 'year_embedding', 3, 2)
    month_in, month_emb = embedded_input('month_in', 'month_embedding', 12, 6)
    day_in, day_emb = embedded_input('day_in', 'day_embedding', 31, 10)
    germanstate_in, germanstate_emb = embedded_input(
        'germanstate_in', 'germanstate_embedding', 12, 6)

    model_inputs = [store_in, dow_in, promo_in, year_in, month_in, day_in,
                    germanstate_in]
    merged = concatenate([store_emb, dow_emb, promo_out, year_emb, month_emb,
                          day_emb, germanstate_emb])

    hidden = Dense(1000, kernel_initializer='uniform', activation='relu')(merged)
    hidden = Dense(500, kernel_initializer='uniform', activation='relu')(hidden)
    x_out = Dense(1, activation='sigmoid')(hidden)

    return Model(model_inputs, x_out)

model_rs = build_keras_model()
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" to the output.
model_rs.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
store_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
down_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
year_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
month_in (InputLayer)            (None, 1)             0                                            
___________________________________________________________________________________________

### Treinando

In [15]:
def val_for_fit(val, y_train, y_val):
    """Scale targets for training: natural log divided by the largest log target.

    Args:
        val: values to transform.
        y_train, y_val: target arrays; the larger of their maximum logs is
            the divisor.

    Returns:
        ``log(val) / max(max(log(y_train)), max(log(y_val)))``.
    """
    log_ceiling = max(np.log(y_train).max(), np.log(y_val).max())
    return np.log(val) / log_ceiling
    
# Model inputs: one single-column array per input layer.
X_train_p, X_val_p = split_features(X_train), split_features(X_val)
# Targets: log-scaled with the same divisor for both splits.
y_train_p, y_val_p = (val_for_fit(target, y_train, y_val)
                      for target in (y_train, y_val))

model_rs.compile(optimizer='adam', loss='mean_absolute_error')
model_rs.fit(
    X_train_p, y_train_p,
    validation_data=(X_val_p, y_val_p),
    batch_size=128, epochs=1,
)


Train on 200000 samples, validate on 84434 samples
Epoch 1/1


<keras.callbacks.History at 0x1514364a8>

### Avaliando

In [16]:
def val_for_pred(val, y_train, y_val):
    """Map scaled model outputs back to the original target scale.

    Args:
        val: values in the scaled log space.
        y_train, y_val: target arrays; the larger of their maximum logs is
            the multiplier.

    Returns:
        ``exp(val * max(max(log(y_train)), max(log(y_val))))``.
    """
    log_ceiling = max(np.log(y_train).max(), np.log(y_val).max())
    return np.exp(log_ceiling * val)

def guess(model,features):
    """Predict for a feature matrix and undo the target scaling.

    The matrix is split into per-input columns, fed to ``model.predict``,
    flattened, and mapped back with ``val_for_pred`` using the module-level
    ``y_train`` / ``y_val``.

    Args:
        model: object exposing ``predict``.
        features: feature matrix accepted by ``split_features``.

    Returns:
        1-D array of predictions on the original target scale.
    """
    model_inputs = split_features(features)
    scaled = model.predict(model_inputs).flatten()
    return val_for_pred(scaled, y_train, y_val)


## Avaliando o modelo

In [17]:
def evaluate_model(model, X, y):
    """Return the mean absolute relative error of the model's predictions.

    Args:
        model: model passed on to ``guess``.
        X: feature matrix.
        y: true targets; all must be strictly positive (asserted).

    Returns:
        ``sum(|y - prediction| / y) / len(y)``.
    """
    assert min(y) > 0
    # The predictions are stacked as a single row and averaged over axis 0,
    # which for one model yields that model's own predictions.
    stacked = np.array([guess(model, X)])
    averaged = stacked.mean(axis=0)
    relative_err = np.absolute((y - averaged) / y)
    return np.sum(relative_err) / len(y)

print("Evaluate combined models...")

# Score the training sample first, then the validation split.
scores = []
for banner, features, targets in (("Training error...", X_train, y_train),
                                  ("Validation error...", X_val, y_val)):
    print(banner)
    score = evaluate_model(model_rs, features, targets)
    print(score)
    scores.append(score)

r_train, r_val = scores

Evaluate combined models...
Training error...
0.105761952822
Validation error...
0.11963804431


In [18]:
if save_embeddings:
    # Fetch each embedding matrix by layer name from the trained model_rs.
    # The notebook never defines `models`, and positional indices into
    # get_weights() depend on how Keras orders the layers of the model.
    embedding_layer_names = ['store_embedding', 'dow_embedding', 'year_embedding',
                             'month_embedding', 'day_embedding',
                             'germanstate_embedding']
    embeddings = [model_rs.get_layer(layer_name).get_weights()[0]
                  for layer_name in embedding_layer_names]
    # Saved as a list in the order store, day of week, year, month, day, state.
    with open(saved_embeddings_fname, 'wb') as f:
        pickle.dump(embeddings, f, -1)

if save_models:
    # Kept as a one-element list so the pickle still holds a list of models.
    # NOTE(review): whether a Keras model pickles cleanly depends on the
    # installed Keras version - confirm before relying on this file.
    with open('../data/rossmann_models.pickle', 'wb') as f:
        pickle.dump([model_rs], f)