In [226]:
import numpy as np
import pandas as pd
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
%%time
df = pd.read_csv('data/clean/train_clean.csv')
df_targets = pd.read_csv('data/clean/train_labels.csv')
df_test = pd.read_csv('data/clean/test_clean.csv')

CPU times: user 50.6 s, sys: 4.29 s, total: 54.8 s
Wall time: 54.8 s


### Experimento 1
Entrenando sólo con el mes de Junio 2015 y con un modelo para todos los productos

In [221]:
x_train = df[df['fecha_dato'] == '2015-06-28']
y_train = df_targets.loc[x_train.index]

x = x_train.drop(['fecha_dato', 'fecha_alta', 'ncodpers'], axis=1).as_matrix()
y = y_train.as_matrix()

x_test = df_test.drop(['fecha_dato', 'fecha_alta', 'ncodpers'], axis=1).as_matrix()

In [4]:
%%time
rf = RandomForestClassifier(n_jobs=4)
rf.fit(x, y)

CPU times: user 1min 42s, sys: 6.37 s, total: 1min 48s
Wall time: 32.3 s


In [149]:
%%time
probs = rf.predict_proba(x_test)
preds = rf.predict(x_test)

pred_probs = np.array([pr.max(axis=1) for pr in probs]).T
pred_probs = (preds * pred_probs).argsort(axis=1)
pred_probs = np.fliplr(pred_probs)[:, :7]

targets = np.array(df_targets.columns.tolist())

CPU times: user 2min 14s, sys: 5.05 s, total: 2min 19s
Wall time: 1min 4s


In [150]:
%%time
final_pred = [" ".join(list(targets[p])) for p in pred_probs]

df_subm = pd.DataFrame({'ncodpers': df_test.ncodpers.values, 'added_products': final_pred})
df_subm.head()

CPU times: user 7.13 s, sys: 56 ms, total: 7.19 s
Wall time: 7.18 s


In [151]:
df_subm.head()

Unnamed: 0,added_products,ncodpers
0,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,15889
1,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,1170544
2,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,1170545
3,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,1170547
4,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,1170548


In [None]:
%%time
df_subm.to_csv('scripts/results/submissions/2018_07_09.csv', index=False)

### Experimento 1a
Modificación del experimento anterior, solo que la *submission* se va a hacer sólo con los productos agregados

In [152]:
ncodpers_last_month = df_test['ncodpers'].values
df_prev_month = df[df['fecha_dato'] == '2016-05-28']
ncodpers_prev_month = df_prev_month.loc[:, 'ncodpers'].values

In [153]:
df_prev_month.shape

(700444, 28)

In [154]:
ncodpers_both = list(set(ncodpers_last_month) & set(ncodpers_prev_month))

In [155]:
index_prev = df_prev_month[df_prev_month['ncodpers'].isin(ncodpers_both)].index
index_last = df_test[df_test['ncodpers'].isin(ncodpers_both)].index

In [156]:
prev_prods = df_targets.loc[index_prev].as_matrix()
pred_prods = preds[index_last, :]

In [157]:
both_prods = pred_prods - prev_prods
both_prods = (both_prods < 0) * 1

In [158]:
preds[index_last] = both_prods

In [159]:
pred_probs = np.array([pr.max(axis=1) for pr in probs]).T
pred_probs = (preds * pred_probs).argsort(axis=1)
pred_probs = np.fliplr(pred_probs)[:, :7]

In [160]:
final_pred = [" ".join(list(targets[p])) for p in pred_probs]

df_subm = pd.DataFrame({'ncodpers': df_test.ncodpers.values, 'added_products': final_pred})

In [161]:
df_subm.head()

Unnamed: 0,added_products,ncodpers
0,ind_recibo_ult1 ind_nom_pens_ult1 ind_aval_fin...,15889
1,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,1170544
2,ind_cco_fin_ult1 ind_recibo_ult1 ind_nom_pens_...,1170545
3,ind_recibo_ult1 ind_nom_pens_ult1 ind_aval_fin...,1170547
4,ind_recibo_ult1 ind_nom_pens_ult1 ind_aval_fin...,1170548


In [162]:
%%time
df_subm.to_csv('scripts/results/submissions/2018_07_10_prev_prods.csv', index=False)

CPU times: user 7.49 s, sys: 116 ms, total: 7.6 s
Wall time: 9.01 s


---
### Experimento 2
Entrenamiento de un modelo para cada uno de los productos

**Nota:** Se puede hacer una comparación entre 1 y 2


In [177]:
%%time
preds = []
probs = []
for i, col in enumerate(targets):
    
    rf.fit(x, y[:, i])
    
    pd = rf.predict(x_test)
    pb = rf.predict_proba(x_test).max(axis=1)
        
    preds.append(pd)
    probs.append(pb)
    print(col)

ind_ahor_fin_ult1
ind_aval_fin_ult1
ind_cco_fin_ult1
ind_cder_fin_ult1
ind_cno_fin_ult1
ind_ctju_fin_ult1
ind_ctma_fin_ult1
ind_ctop_fin_ult1
ind_ctpp_fin_ult1
ind_deco_fin_ult1
ind_deme_fin_ult1
ind_dela_fin_ult1
ind_ecue_fin_ult1
ind_fond_fin_ult1
ind_hip_fin_ult1
ind_plan_fin_ult1
ind_pres_fin_ult1
ind_reca_fin_ult1
ind_tjcr_fin_ult1
ind_valo_fin_ult1
ind_viv_fin_ult1
ind_nomina_ult1
ind_nom_pens_ult1
ind_recibo_ult1
CPU times: user 7min 5s, sys: 8.62 s, total: 7min 14s
Wall time: 2min 15s


In [184]:
preds = np.array(preds).T
probs = np.array(probs).T

pred_probs = (preds * probs).argsort(axis=1)
pred_probs = np.fliplr(pred_probs)[:, :7]

final_pred = [" ".join(list(targets[p])) for p in pred_probs]

df_subm = pd.DataFrame({'ncodpers': df_test.ncodpers.values, 'added_products': final_pred})
df_subm.head()

In [195]:
path_submission = 'scripts/results/submissions/'

In [196]:
name_file = path_submission + time.strftime("%Y-%m-%d-h%H-%M-%S_") + "submission.csv"
print(name_file)
df_subm.to_csv(name_file, index=False)

scripts/results/submissions/2018-07-10-h10-45-09_submission.csv


### Experimento 3
Entrenamiento con Julio 2015, un solo modelo GaussianNB para cada producto

In [210]:
%%time
gb = GaussianNB()
preds = []
probs = []
for i, col in enumerate(targets):
    
    gb.fit(x, y[:, i])
    
    prd = rf.predict(x_test)
    pb = rf.predict_proba(x_test).max(axis=1)
        
    preds.append(prd)
    probs.append(pb)

CPU times: user 2min 43s, sys: 6.04 s, total: 2min 49s
Wall time: 1min 6s


In [217]:
preds = np.array(preds).T
probs = np.array(probs).T

In [227]:
%%time
pred_probs = (preds * probs).argsort(axis=1)
pred_probs = np.fliplr(pred_probs)[:, :7]

final_pred = [" ".join(list(targets[p])) for p in pred_probs]

df_subm = pd.DataFrame({'ncodpers': df_test.ncodpers.values, 'added_products': final_pred})
df_subm.head()

CPU times: user 7.55 s, sys: 196 ms, total: 7.74 s
Wall time: 7.74 s


In [228]:
%%time
name_file = path_submission + time.strftime("%Y-%m-%d-h%H-%M-%S_") + "submission.csv"
print(name_file)
df_subm.to_csv(name_file, index=False)

scripts/results/submissions/2018-07-10-h11-18-47_submission.csv
CPU times: user 7.49 s, sys: 80 ms, total: 7.57 s
Wall time: 8.88 s
