## Ambiente "local" de pruebas
En este notebook se hará el desarrollo del ambiente local de pruebas, en el que se incluirán las funciones necesarias y poder hacer un script final

In [1]:
import numpy as np
import pandas as pd
import time
import scripts.local_environment as local

from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/clean/train_clean.csv')
df_targets = pd.read_csv('data/clean/train_labels.csv')

### Pruebas

In [3]:
x_train = df[df['fecha_dato'] == '2015-01-28']
y_train = df_targets.loc[x_train.index]

x_test = df[df['fecha_dato'] == '2015-02-28']
y_test = df_targets.loc[x_test.index]

In [4]:
x = x_train.drop(['fecha_dato', 'fecha_alta'], axis=1).as_matrix()
y = y_train.as_matrix()

In [451]:
xtst = x_test.drop(['fecha_dato', 'fecha_alta'], axis=1).as_matrix()

In [14]:
rf = RandomForestClassifier(n_jobs=4)
rf = model(x, y, rf)

In [145]:
probs = rf.predict_proba(x_test)
preds = rf.predict(x_test)

In [146]:
probs = pred_probs = np.array([pr.max(axis=1) for pr in probs]).T
probs.shape

(516199, 24)

In [333]:
%%time
predicted, actual = processPredictions(probs, preds, x_train, x_test, y_train, y_test)

CPU times: user 15.7 s, sys: 108 ms, total: 15.8 s
Wall time: 15.8 s


In [223]:
%%time
subm = processPredictions(probs, preds, x_train, x_test, y_train, y_test, env='submit')

CPU times: user 9.91 s, sys: 236 ms, total: 10.1 s
Wall time: 10.8 s


In [334]:
%%time
mapk(actual, predicted, 7)

CPU times: user 2.44 s, sys: 4 ms, total: 2.45 s
Wall time: 2.44 s


0.025412497922884664

### Perfect score
Se hace una prueba de validación con un score casi perfecto (como si hubiese *overfitting*). La prueba se realiza con los meses de Enero y Febrero de 2015

In [57]:
x_train = df[df['fecha_dato'] == '2015-01-28']
y_train = df_targets.loc[x_train.index]

x_test = df[df['fecha_dato'] == '2015-02-28']
y_test = df_targets.loc[x_test.index]

### Entrenamiento del modelo

In [30]:
def model(x_train, y_train, model):
    """
    Parameters
    ----------
    x_train: Array
        Datos de entrenamiento previamente procesados
    y_train: Array
        Targets de entrenamiento
    model: Objeto (de sklearn)
        Algoritmo para entrenar con los paramétros configurados
    
    Returns
    -------
    model: Objeto
        Algortimo entrenado
    """
    return model.fit(x_train, y_train)

### Predicciones

In [1]:
def calculatePredsProbs(x_test, clf):
    """
    Función para calcular las probabilidades de predicción y predicciones hechas por el modelo entrenado
    
    Parameters
    ----------
    x_test: Array
        Datos de test
    clf: Objeto
        El modelo previamente entrenado
        
    Results
    -------
    probs: Array
        Array de las probabilidades de predicción
    preds: Array
        Array de predicciones

    """
    preds = clf.predict(x_test)
    
    probs = clf.predict_proba(x_test)
    probs = np.array([pr.max(axis=1) for pr in probs]).T
    
    return probs, preds

In [113]:
def processPredictions(probs=None, preds=None, df_train=None, df_test=None,
                       df_targets=None, y_test=None, env='local', path='results/submissions/'):
    """
    Procesamiento de las predicciones hechas para generar el archivo de submission
    o los datos necesarios para hacer una validación local
    Parameters
    ----------
    probs: Array
        Probabilidades de elegir un producto - Generado por el modelo
    preds: Array
        Predicciones hechas por el modelo entrenado
    df_train: DataFrame
        Datos de entrenamiento - último mes de entrenamiento
    df_test: DataFrame
        Datos de test si se va a hacer una validación local, por el contrario es None por default
    df_targets: DataFrame
        Targets - último mes de entrenamiento
    y_test: DataFrame
        Targets del mes de test, sólo si env es 'local'
    env: str (optional)
        Indica el tipo de ejecución que se quiere hacer
            'local': regresa dos listas para hacer la validación local - Default
            'submit': regresa un DataFrame con el archivo de submission csv (el archivo de submission se genera
                    y se guarda en el equipo)
    path: str (optional)
        Dirección de la carpeta donde se va a guardar el archivo
    
    Returns
    -------
    Si env es 'submit'
        df_subm: DataFrame
            DataFrame con el archivo de submission csv (el archivo de submission se genera y se guarda en el equipo)
    Si env es 'local'
        predicted: list
            Una lista de listas de los productos que se predijeron
    
    """
    
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_targets.reset_index(drop=True, inplace=True)
    
    ncodpers_prev_month = df_train.loc[:, 'ncodpers'].values
    ncodpers_last_month = df_test.loc[:, 'ncodpers'].values
    
    ncodpers_both = list(set(ncodpers_last_month) & set(ncodpers_prev_month))
    
    index_prev = df_train[df_train['ncodpers'].isin(ncodpers_both)].sort_values(['ncodpers']).index
    index_last = df_test[df_test['ncodpers'].isin(ncodpers_both)].sort_values(['ncodpers']).index
    
    prev_prods = df_targets.loc[index_prev].as_matrix()
    pred_prods = preds[index_last, :]
    
    #Predicciones de los productos que están en ambos meses - Sólo productos añadidos
    both_prods = pred_prods - prev_prods
    both_prods = (both_prods > 0) * 1
    
    if env == 'submit':
        
        preds[index_last] = both_prods
    
        pred_probs = (preds * probs).argsort(axis=1)
        pred_probs = np.fliplr(pred_probs)[:, :7]
    
        targets = np.array(df_targets.columns.tolist())
        final_pred = [" ".join(list(targets[p])) for p in pred_probs]

        df_subm = pd.DataFrame({'ncodpers': df_test.ncodpers.values, 'added_products': final_pred})
        name_file = path + time.strftime("%Y-%m-%d-h%H-%M-%S_") + "submission.csv"
        df_subm.to_csv(name_file, index=False)
    
        return df_subm
    else:
        y_test.reset_index(drop=True, inplace=True)
        y = y_test.loc[index_last].as_matrix()
        purchases = y - prev_prods
        purchases = (purchases > 0) * 1        
        
        indexs = np.array([[i for i in range(y.shape[1])] * y.shape[0]]).reshape(y.shape)
        actual = purchases * indexs
        actual = list(map(lambda x: list(np.unique(x)[1:]), actual))
        
        preds = both_prods #Estas son las predicciones de los productos que se añaden 
        probs = probs[index_last]
        
        pred_probs = (preds * probs).argsort(axis=1)
        pred_probs = np.fliplr(pred_probs)[:, :7]        
        
        predicted = pred_probs
        
        return predicted, actual

### Perfect score

In [65]:
df_train = x_train.reset_index(drop=True)
df_test = x_test.reset_index(drop=True)
df_labels = y_train.reset_index(drop=True)
y = y_test.reset_index(drop=True)

In [69]:
probs_perfect = np.zeros(y_test.shape)
preds_perfect = np.zeros(y_test.shape)

In [70]:
ncodpers_prev_month = df_train.loc[:, 'ncodpers'].values
ncodpers_last_month = df_test.loc[:, 'ncodpers'].values

In [82]:
ncodpers_both = list(set(ncodpers_last_month) & set(ncodpers_prev_month))
    
index_prev = df_train[df_train['ncodpers'].isin(ncodpers_both)].sort_values(['ncodpers']).index
index_last = df_test[df_test['ncodpers'].isin(ncodpers_both)].sort_values(['ncodpers']).index

512673

In [78]:
prev_prods = df_labels.loc[index_prev].as_matrix()
pred_prods = y.loc[index_last].as_matrix() #Estos son los productos reales

In [102]:
purchases = pred_prods - prev_prods
purchases = (purchases > 0) * 1

In [108]:
preds_perfect[index_last] = purchases

In [109]:
purchases_plus = purchases.sum(axis=1)
purchases_plus = np.array(list(map(lambda x: 1/x if 1/x != np.inf else 0, purchases_plus)))
purchases_plus = purchases_plus.reshape((purchases.shape[0], 1))
purchases_plus = purchases_plus * purchases

(512673, 24)

In [110]:
probs_perfect[index_last] = purchases_plus

In [114]:
%%time
predicted, actual = processPredictions(probs_perfect, preds_perfect, x_train, x_test, y_train, y_test)

CPU times: user 15.2 s, sys: 428 ms, total: 15.7 s
Wall time: 15.6 s


In [115]:
%%time
mapk(actual, predicted, 7)

CPU times: user 2.49 s, sys: 4 ms, total: 2.49 s
Wall time: 2.49 s


0.039543724752425034

In [48]:
both_prods = pred_prods - prev_prods
both_prods = (both_prods < 0) * 1

In [55]:
y.as_matrix()[index_last] - prev_prods

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [56]:
actual[:10]

[[18], [], [], [], [4], [], [], [], [4], []]

### Métrica de desempeño
Para hacer mediciones locales

In [46]:
"""
From github: @benhamner
"""

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])