In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [1]:
from sklearn.ensemble import RandomForestClassifier

### Carga de datos de entrenamiento y de test

In [3]:
# Training features and their labels, read from the already-cleaned CSV files.
df = pd.read_csv('data/clean/train_clean_v2.csv')
df_targets = pd.read_csv('data/clean/train_labels.csv')

# Raw test set. The columns listed in `dtype` are read as plain strings and
# cast explicitly in later cells; the two date columns are parsed on load.
# The dtype key is 'ult_fec_cli_1t' (digit one), matching the column name
# dropped further down; the previous spelling with a letter "l" matched no
# column, so the override was never applied.
df_test = pd.read_csv(
    'data/test_ver2.csv',
    dtype={
        'sexo': str,
        'age': str,
        'ind_nuevo': str,
        'indrel_1mes': str,
        'antiguedad': str,
        'ult_fec_cli_1t': str,
        'indext': str,
        'conyuemp': str,
    },
    parse_dates=['fecha_dato', 'fecha_alta'],
)
df_test.head()

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,nomprov,ind_actividad_cliente,renta,segmento
0,2016-06-28,15889,F,ES,V,56,1995-01-16,0,256,1,...,N,N,KAT,N,1,28.0,MADRID,1,326124.9,01 - TOP
1,2016-06-28,1170544,N,ES,H,36,2013-08-28,0,34,1,...,N,,KAT,N,1,3.0,ALICANTE,0,,02 - PARTICULARES
2,2016-06-28,1170545,N,ES,V,22,2013-08-28,0,34,1,...,N,,KHE,N,1,15.0,"CORUÑA, A",1,,03 - UNIVERSITARIO
3,2016-06-28,1170547,N,ES,H,22,2013-08-28,0,34,1,...,N,,KHE,N,1,8.0,BARCELONA,0,148402.98,03 - UNIVERSITARIO
4,2016-06-28,1170548,N,ES,H,22,2013-08-28,0,34,1,...,N,,KHE,N,1,7.0,"BALEARS, ILLES",0,106885.8,03 - UNIVERSITARIO


In [4]:
# Work on an independent (deep) copy so the loaded training frame stays intact.
df_copy = df.copy(deep=True)

In [8]:
# Training features without the two date columns and `ncodpers`.
# Derived from `df` instead of mutating `df_copy` in place, so re-running this
# cell gives the same result rather than raising KeyError on the second run.
# NOTE(review): assumes nothing else modified `df_copy` between the copy and
# this cell -- confirm.
df_copy = df.drop(['fecha_dato', 'fecha_alta', 'ncodpers'], axis=1)

### Preprocesado de los datos de test

In [5]:
# Drop columns that are not used from the test set
df_test.drop(['conyuemp', 'ult_fec_cli_1t'], axis='columns', inplace=True)

In [12]:
# Drop every row that has at least one missing value.
# NOTE(review): rows removed here never reach the cleaned CSV, so they get no
# entry in the results table built from it later.
has_nan = df_test.isnull().any(axis=1)
missing = df_test.loc[has_nan]
missing_index = missing.index
df_test.drop(missing_index, axis='index', inplace=True)

In [23]:
# Cast the columns that were read as strings to numeric dtypes
df_test = df_test.astype({
    'age': 'int64',
    'ind_nuevo': 'int64',
    'antiguedad': 'int64',
    'indrel_1mes': 'float64',
})

In [55]:
# Replace the padded 'NA' placeholder in `renta` with 0 so the column can be
# cast to float afterwards. The result is assigned back to the column:
# calling replace(..., inplace=True) on `df_test.renta` operates on an
# intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
df_test['renta'] = df_test['renta'].replace('         NA', 0)

In [58]:
# `renta` is numeric-only after the placeholder replacement; store it as float.
df_test['renta'] = df_test['renta'].astype('float64')

In [None]:
%%time
# Convert categorical (object) columns to integer codes.
# Within each column the most frequent value gets code 1, the next one 2, etc.
# NOTE(review): the codes come from value frequencies in the test set; confirm
# they agree with the encoding used to build the training features.
cols = df_test.select_dtypes(['object']).columns

for col in cols:
    attribute_vals = df_test[col].value_counts().index
    attribute_counts = np.arange(1, len(attribute_vals) + 1)
    # Map this column only. A frame-wide replace would also rewrite equal
    # values in every other column (including ones already encoded or
    # numeric), and is far slower.
    df_test[col] = df_test[col].map(dict(zip(attribute_vals, attribute_counts)))
    # Report the column once its encoding has actually been applied.
    print(col, '¡LISTO!')

ind_empleado ¡LISTO!
pais_residencia ¡LISTO!


In [72]:
# Split each date column into separate year / month / day columns.
# New columns are appended in the order: fecha_dato parts, then fecha_alta parts.
for date_col in ('fecha_dato', 'fecha_alta'):
    for part in ('year', 'month', 'day'):
        df_test['{}_{}'.format(date_col, part)] = getattr(df_test[date_col].dt, part)

In [97]:
# Build the new column order by position: the first column, then positions
# 22-24, then 1-6, then everything from 25 on, then 7-21.
# NOTE(review): these positions presumably place the derived date parts next
# to their source columns; they rely on the current column layout -- confirm
# if the schema changes.
cols = list(df_test.columns)
leading = cols[:1]
block_22_24 = cols[22:25]
block_1_6 = cols[1:7]
block_25_on = cols[25:]
block_7_21 = cols[7:22]
cols_ordered = leading + block_22_24 + block_1_6 + block_25_on + block_7_21

In [98]:
# Apply the column order computed above.
df_test = df_test.loc[:, cols_ordered]

In [100]:
%%time
# Save the preprocessed test set (without the index column)
df_test.to_csv(path_or_buf='data/clean/test_clean.csv', index=False)

CPU times: user 49.7 s, sys: 60 ms, total: 49.8 s
Wall time: 50.5 s


### Entrenamiento y testeo

Carga de datos de test

In [15]:
# Reload the preprocessed test set from disk and show its dimensions.
test_clean_path = 'data/clean/test_clean.csv'
df_test = pd.read_csv(test_clean_path)
df_test.shape

(923309, 28)

In [16]:
# Results table: `ncodpers`, one column per target, and a final
# `added_products` column that is filled after prediction.
result_columns = ['ncodpers'] + list(df_targets.columns) + ['added_products']
resultados = pd.DataFrame(columns=result_columns)
resultados['ncodpers'] = df_test['ncodpers']

Datos de entrenamiento

In [13]:
# Training inputs and targets as NumPy arrays.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x = df_copy.values
y = df_targets.values

In [17]:
# Drop the same three columns that were removed from the training features,
# then take the remaining test columns as a NumPy array.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
df_test.drop(['fecha_dato', 'fecha_alta', 'ncodpers'], axis=1, inplace=True)
x_test = df_test.values

In [None]:
# Fit one random forest per target column and predict that target for the
# whole test matrix. A fixed random_state makes repeated runs reproducible.
rf = RandomForestClassifier(random_state=0)
for i, col in enumerate(df_targets.columns.tolist()):
    rf.fit(x, y[:, i])
    # One vectorized call instead of one predict() per test row; this also
    # avoids reusing the name `x` (the training matrix) as a loop variable.
    resultados[col] = rf.predict(x_test)

# Fill the last column with the space-separated names of the targets that
# were predicted as 1 for each row.
rs = resultados.columns.tolist()[1:]
# Boolean matrix over the target columns only (first and last column excluded).
predicted_flags = resultados.iloc[:, 1:-1].values == 1
resultados[resultados.columns[-1]] = [
    " ".join(name for name, flagged in zip(rs, row) if flagged)
    for row in predicted_flags
]

resultados.to_csv('resultados_notebook.csv', index=False)