In [1]:
import numpy as np
import pandas as pd
import time
import importlib

import scripts.local_environment as local
importlib.reload(local);

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
%%time
df = pd.read_csv('data/clean/train_clean.csv')
df_targets = pd.read_csv('data/clean/train_labels.csv')
df_test = pd.read_csv('data/clean/test_clean.csv')

CPU times: user 49.9 s, sys: 4.48 s, total: 54.4 s
Wall time: 54.3 s


### Experimento 3
Entrenamiento con el mes de Mayo/2016 con sólo clientes antiguos

In [None]:
# Keep only the rows whose snapshot date is 2016-05-28, then pick the label
# rows that share the same index labels.
# NOTE(review): this relies on df and df_targets being row-aligned -- confirm.
x_train = df.loc[df['fecha_dato'].eq('2016-05-28')]
y_train = df_targets.loc[x_train.index]

In [5]:
# Customer ids (ncodpers) that occur both in the selected training snapshot
# and in the test set.
train_ids = set(x_train['ncodpers'].values)
test_ids = set(df_test['ncodpers'].values)
ncodpers_both_month = list(train_ids & test_ids)
len(ncodpers_both_month)

699690

In [6]:
# Restrict the training snapshot to customers that also appear in the test
# set and drop the two date columns before building the model inputs.
x = x_train[x_train['ncodpers'].isin(ncodpers_both_month)].drop(['fecha_dato', 'fecha_alta'], axis=1)
# The labels are re-selected with the filtered index so both arrays keep the same rows.
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas
# 0.23 and removed in 1.0; with no arguments both return the same ndarray.
y = y_train.loc[x.index].values
x = x.values

In [7]:
%%time
rf = local.model(x, y, RandomForestClassifier(n_jobs=4))

CPU times: user 2min 49s, sys: 8.8 s, total: 2min 57s
Wall time: 53.2 s


In [8]:
# Test matrix with the same two date columns removed as in the training data.
# `.values` replaces the removed DataFrame.as_matrix() and returns the same array.
x_test = df_test.drop(['fecha_dato', 'fecha_alta'], axis=1).values

In [9]:
%%time
# Pass the test matrix and the object returned by local.model to the project
# prediction helper. Its two return values are unpacked as (probs, preds);
# what each contains is defined in scripts/local_environment, not in this notebook.
probs, preds = local.calculatePredsProbs(x_test, rf)

CPU times: user 2min 17s, sys: 4.58 s, total: 2min 22s
Wall time: 1min 5s


In [10]:
%%time
subm = local.processPredictions(probs=probs, preds=preds, df_train=x_train, df_test=df_test,
                                df_targets=y_train, env='submit')

change
results/submissions/2018-07-13-h12-05-59_submission.csv
CPU times: user 16.7 s, sys: 836 ms, total: 17.5 s
Wall time: 18.7 s


### Experimento 4
Entrenamiento con Mayo 2016 y todos los clientes

In [17]:
# Select the 2016-05-28 snapshot and keep every customer in it.
x_train = df[df['fecha_dato'] == '2016-05-28']
y_train = df_targets.loc[x_train.index]

# Dropping columns does not change the row index, so y_train is already
# aligned with x and needs no further re-selection.
# `.values` replaces DataFrame.as_matrix(), deprecated in pandas 0.23 and
# removed in 1.0; both return the same ndarray.
x = x_train.drop(['fecha_dato', 'fecha_alta'], axis=1).values
y = y_train.values

In [18]:
%%time
rf = local.model(x, y, RandomForestClassifier(n_jobs=4))

CPU times: user 3min 26s, sys: 14.6 s, total: 3min 40s
Wall time: 1min 17s


In [19]:
# Test matrix without the two date columns. `.values` replaces the removed
# DataFrame.as_matrix() and returns the same array.
x_test = df_test.drop(['fecha_dato', 'fecha_alta'], axis=1).values

In [20]:
%%time
# Pass the test matrix and the object returned by local.model to the project
# prediction helper; its two return values are unpacked as (probs, preds).
probs, preds = local.calculatePredsProbs(x_test, rf)

CPU times: user 3min 9s, sys: 6.43 s, total: 3min 15s
Wall time: 1min 34s


In [21]:
%%time
# Post-process the model output in 'submit' mode and unpack the returned pair.
# NOTE(review): the path printed in the output suggests the helper also
# writes the submission CSV -- confirm in scripts/local_environment.
subm, name_file = local.processPredictions(probs=probs, preds=preds, df_train=x_train, df_test=df_test,
                                df_targets=y_train, env='submit')

change
results/submissions/2018-07-13-h13-52-03_submission.csv
CPU times: user 18.3 s, sys: 1.04 s, total: 19.3 s
Wall time: 20.4 s


### Experimento 5
Entrenamiento con Junio 2015 - todos los clientes

In [11]:
# Keep only the rows whose snapshot date is 2015-06-28, then pick the label
# rows that share the same index labels.
# NOTE(review): this relies on df and df_targets being row-aligned -- confirm.
x_train = df.loc[df['fecha_dato'].eq('2015-06-28')]
y_train = df_targets.loc[x_train.index]

In [12]:
# Build the model inputs from the full snapshot (no customer filter).
# Dropping columns keeps the row index intact, so y_train is already aligned
# with x and can be converted directly.
# `.values` replaces DataFrame.as_matrix(), deprecated in pandas 0.23 and
# removed in 1.0; both return the same ndarray.
x = x_train.drop(['fecha_dato', 'fecha_alta'], axis=1).values
y = y_train.values

In [13]:
%%time
rf = local.model(x, y, RandomForestClassifier(n_jobs=4))

CPU times: user 2min, sys: 7.36 s, total: 2min 7s
Wall time: 39 s


In [14]:
# Test matrix without the two date columns. `.values` replaces the removed
# DataFrame.as_matrix() and returns the same array.
x_test = df_test.drop(['fecha_dato', 'fecha_alta'], axis=1).values

In [15]:
%%time
# Pass the test matrix and the object returned by local.model to the project
# prediction helper; its two return values are unpacked as (probs, preds).
probs, preds = local.calculatePredsProbs(x_test, rf)

CPU times: user 2min 16s, sys: 4.77 s, total: 2min 20s
Wall time: 1min 4s


In [16]:
%%time
# Post-process the model output in 'submit' mode and unpack the returned pair.
# NOTE(review): the path printed in the output suggests the helper also
# writes the submission CSV -- confirm in scripts/local_environment.
subm, name_file = local.processPredictions(probs=probs, preds=preds, df_train=x_train, df_test=df_test,
                                df_targets=y_train, env='submit')

change
results/submissions/2018-07-13-h12-11-56_submission.csv
CPU times: user 16.5 s, sys: 428 ms, total: 17 s
Wall time: 18.1 s


### Experimento 6
Entrenamiento progresivo individual
Esta prueba será *local* para predecir los productos del mes de Mayo/2016

In [8]:
# Train one forest per monthly snapshot and score each one against the last
# snapshot present in the data.
dates = df.fecha_dato.unique()
# unique() preserves order of first appearance, so this is the last date
# encountered in the training file.
date_test = dates[-1]

# The evaluation set does not depend on the training month, so build it once
# instead of re-filtering the full frame on every iteration.
# NOTE(review): assumes local.processPredictions does not modify the frames
# it receives in place -- confirm in scripts/local_environment.
df_x_test = df[df['fecha_dato'] == date_test].drop(['fecha_dato', 'fecha_alta'], axis=1)
df_y_test = df_targets.loc[df_x_test.index]
x_test = df_x_test.values
y_test = df_y_test.values

# Collect one record per month and build the frame once at the end; growing
# a DataFrame row by row with .loc re-allocates on every insert and leaves
# all columns as object dtype.
records = []
for date in dates[:-1]:
    df_x = df[df['fecha_dato'] == date]
    df_y = df_targets.loc[df_x.index]

    # `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
    x_train = df_x.drop(['fecha_dato', 'fecha_alta'], axis=1).values
    y_train = df_y.values

    # A fixed random_state keeps the month-to-month comparison reproducible.
    model = local.model(x_train, y_train, RandomForestClassifier(n_jobs=4, random_state=42))
    probs, preds = local.calculatePredsProbs(x_test, model)

    predicted, actual = local.processPredictions(probs, preds, df_x, df_x_test, df_y, df_y_test)

    # Third argument is the cut-off k passed to the project's mapk helper.
    score = local.mapk(actual, predicted, 7)

    records.append((date, score, x_train.shape[0]))
    print(date)

results = pd.DataFrame(records, columns=['date', 'score', 'amount_data'])

### Experimento 7
Entrenamiento progresivo acumulado.
* **Test:** Mayo 2016
* **Env:** local

In [None]:
# Train on a growing window of snapshots (first month, first two months, ...)
# and score each window against the last snapshot present in the data.
dates = df.fecha_dato.unique()
# unique() preserves order of first appearance, so this is the last date
# encountered in the training file.
date_test = dates[-1]

# The evaluation set is identical for every window, so build it once.
# NOTE(review): assumes local.processPredictions does not modify the frames
# it receives in place -- confirm in scripts/local_environment.
df_x_test = df[df['fecha_dato'] == date_test].drop(['fecha_dato', 'fecha_alta'], axis=1)
df_y_test = df_targets.loc[df_x_test.index]
x_test = df_x_test.values
y_test = df_y_test.values

# One record per window; the frame is assembled once after the loop instead
# of being grown row by row.
records = []
for i in range(1, len(dates)):
    # Window covers every snapshot before the test date, up to position i.
    date_range = dates[:i]
    df_x = df[df['fecha_dato'].isin(date_range)]
    df_y = df_targets.loc[df_x.index]

    # `.values` replaces DataFrame.as_matrix() (removed in pandas 1.0).
    x_train = df_x.drop(['fecha_dato', 'fecha_alta'], axis=1).values
    y_train = df_y.values

    # A fixed random_state keeps the window-to-window comparison reproducible.
    model = local.model(x_train, y_train, RandomForestClassifier(n_jobs=4, random_state=42))
    probs, preds = local.calculatePredsProbs(x_test, model)

    predicted, actual = local.processPredictions(probs, preds, df_x, df_x_test, df_y, df_y_test)

    # Third argument is the cut-off k passed to the project's mapk helper.
    score = local.mapk(actual, predicted, 7)

    records.append((date_range[0], date_range[-1], score, x_train.shape[0]))

# Keep the 1-based row labels the original loop produced (label = window size).
results = pd.DataFrame(
    records,
    columns=['date_start', 'date_end', 'score', 'amount_data'],
    index=range(1, len(records) + 1),
)