In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import xgboost

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
def type_change(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    * object / string columns become ``category``;
    * integer columns get the smallest signed integer type that holds
      their min and max (bounds are inclusive);
    * float columns become ``float16`` only when that conversion is exact,
      otherwise ``float32`` when the range fits, otherwise ``float64``;
    * every other dtype (bool, datetime, categorical, nullable extension
      types) is left untouched.

    Prints the memory usage before and after.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    half = np.finfo(np.float16)
    single = np.finfo(np.float32)

    for col in tqdm(list(df.columns)):
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast.  bool is
        # already one byte, and min()/max() on datetimes or unordered
        # categoricals cannot be compared with numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            as_double = df[col].astype(np.float64)
            fits_half = half.min <= c_min and c_max <= half.max
            # float16 keeps only ~3 significant digits, so it is used only
            # when the round trip reproduces every value (NaN included).
            if fits_half and as_double.astype(np.float16).astype(np.float64).equals(as_double):
                df[col] = df[col].astype(np.float16)
            elif single.min <= c_min and c_max <= single.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = as_double

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def zip2csv_large(dirname, chunksize, zipf, csvf):
    """Extract ``csvf`` from the archive ``dirname/zipf`` and load it in chunks.

    The member is extracted into the current working directory, read
    ``chunksize`` rows at a time, each chunk is passed through
    ``type_change`` and the chunks are concatenated.

    Parameters
    ----------
    dirname : str
        Directory containing the zip archive.
    chunksize : int
        Number of rows per chunk handed to ``pandas.read_csv``.
    zipf : str
        File name of the archive inside ``dirname``.
    csvf : str
        Name of the CSV member to extract.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all processed chunks.
    """
    # Close the archive as soon as the member has been written to disk.
    with zipfile.ZipFile(os.path.join(dirname, zipf)) as archive:
        csv_path = archive.extract(csvf)

    reader = pd.read_csv(csv_path, iterator=True, chunksize=chunksize)
    try:
        return pd.concat([type_change(chunk) for chunk in reader])
    finally:
        # Release the file handle even if a chunk fails to process.
        reader.close()

In [None]:
def EDA(df):
    """Print a quick textual overview of ``df``.

    Shows, in order: the first rows, summary statistics, the
    ``DataFrame.info()`` report and the number of missing values per column.
    """
    print(df.head())
    print(df.describe())
    # info() writes its report itself and returns None; wrapping it in
    # print() used to add a stray "None" line to the output.
    df.info()
    print(df.isna().sum())

In [None]:
def visualize_count(df, cols):
    """Print and bar-plot the value counts of each column named in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : iterable of str
        Column names to summarise; one figure is shown per column.
    """
    for c in cols:
        counts = df[c].value_counts()
        print(counts)

        # x and y must be passed by keyword: recent seaborn releases no
        # longer accept them positionally.
        sns.barplot(x=counts.index.astype(df[c].dtype), y=counts.values, alpha=0.8)
        plt.show()

In [None]:
def visualize(df):
    """Plot the main distributions of the customer data.

    Draws the number of rows per ``fecha_dato``, the distribution of
    ``age``, and then one count plot per remaining column via
    ``visualize_count`` (``fecha_dato``, ``ncodpers``, ``age``,
    ``fecha_alta`` and ``renta`` are skipped).

    Side effect: ``df['age']`` is rewritten in place, with ``' NA'``
    replaced by NaN and the column cast to ``float64``.
    """
    date_counts = df['fecha_dato'].value_counts(ascending=True)
    print(df['fecha_dato'].value_counts())
    plt.figure(figsize=(8, 4))
    # Keyword x/y: recent seaborn releases reject positional data vectors.
    sns.barplot(x=date_counts.index.astype('str'), y=date_counts.values, alpha=0.8)
    plt.xlabel('Year and month of observation', fontsize=12)
    plt.ylabel('Number of customers', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.show()

    df['age'] = df['age'].replace(to_replace=[' NA'], value=np.nan)
    df['age'] = df['age'].astype('float64')

    age_series = df.age.value_counts()
    print(age_series)
    plt.figure(figsize=(12, 4))
    sns.barplot(x=age_series.index.astype('int'), y=age_series.values, alpha=0.8)
    plt.ylabel('Number of Occurrences of the customer', fontsize=12)
    plt.xlabel('Age', fontsize=12)
    plt.xticks(rotation='vertical')
    plt.show()

    # Filter instead of list.remove() so a frame that lacks one of these
    # columns does not raise ValueError.
    skipped = {'fecha_dato', 'ncodpers', 'age', 'fecha_alta', 'renta'}
    want_to_show = [c for c in df.columns if c not in skipped]
    visualize_count(df, want_to_show)

In [None]:
def drop_features(df, cols):
    """Remove every column named in ``cols`` from ``df`` in place.

    Returns the same ``df`` object.  A name that is not a column raises
    ``KeyError``; columns listed before it have already been removed.
    """
    for column in cols:
        df.pop(column)
    return df

In [None]:
def _as_number(series, replacements=None):
    """Parse a column of (possibly padded) numeric strings into float64.

    Missing entries and the literal ``'NA'`` become NaN.  ``replacements``
    maps stripped text to other text before parsing (e.g. ``{'P': '5'}``).
    Columns that are already numeric are only cast to float64.
    """
    if pd.api.types.is_numeric_dtype(series.dtype):
        return series.astype(np.float64)
    text = series.astype(str).str.strip()
    if replacements:
        text = text.replace(replacements)
    # astype(str) turns missing values into text such as 'nan'/'None';
    # mask them (and the 'NA' marker) back to real NaN before parsing.
    text = text.where(series.notna() & (text != 'NA'))
    return pd.to_numeric(text).astype(np.float64)


def _fill_na(df, col, value):
    """Replace the missing entries of ``df[col]`` with ``value`` (in place)."""
    column = df[col]
    if isinstance(column.dtype, pd.CategoricalDtype):
        # A categorical rejects fill values that are not already categories.
        column = column.astype('object')
    df[col] = column.fillna(value)


def _impute_renta(renta, province):
    """Fill missing income with the province median, then the global median."""
    values = renta.astype(np.float64)
    values = values.fillna(values.groupby(province).transform('median'))
    values = values.fillna(values.median())
    # Keep a float column's original width (e.g. float32).
    if getattr(renta.dtype, 'kind', 'O') == 'f':
        values = values.astype(renta.dtype)
    return values


def handle_mv(df):
    """Clean and impute the customer columns of the training data.

    * ``age`` and ``antiguedad`` are parsed to numbers and missing values
      are filled with the median; negative ``antiguedad`` values are
      replaced by the column maximum first.
    * Flag / category columns receive fixed default values.
    * ``indrel_1mes`` is normalised to int8 codes (``'P'`` -> 5, missing -> 0).
    * Missing ``renta`` is filled with the median of the customer's
      province (``nomprov``) and, where the province has no income at
      all, with the global median.

    The cleaning steps assign columns on ``df`` itself.  The returned frame
    is a new object, sorted by ``fecha_dato`` (stable sort), with the former
    row labels in an ``index`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training frame; modified in place.

    Returns
    -------
    pandas.DataFrame
    """
    print("\nhandle missing value...")

    age = _as_number(df['age'])
    df['age'] = age.fillna(age.median())

    defaults = (
        ('ind_nuevo', 0),
        ('indrel', 1),
        ('indresi', 'S'),
        ('indext', 'N'),
        ('indfall', 'N'),
        ('ind_actividad_cliente', 0),
    )
    for col, default in defaults:
        _fill_na(df, col, default)

    antiguedad = _as_number(df['antiguedad'])
    # Negative seniority values are replaced by the column maximum
    # (behaviour kept from the original code).
    antiguedad = antiguedad.mask(antiguedad < 0, antiguedad.max())
    df['antiguedad'] = antiguedad.fillna(antiguedad.median())

    _fill_na(df, 'sexo', 'V')

    indrel_1mes = _as_number(df['indrel_1mes'], replacements={'P': '5'})
    df['indrel_1mes'] = indrel_1mes.fillna(0).astype('int8')

    text_defaults = (
        ('tiprel_1mes', 'I'),
        ('nomprov', 'Unknown'),
        ('segmento', '02 - PARTICULARES'),
    )
    for col, default in text_defaults:
        df[col] = df[col].astype('object').fillna(default)

    _fill_na(df, 'ind_nomina_ult1', 0)
    _fill_na(df, 'ind_nom_pens_ult1', 0)

    # NOTE(review): reset_index() stores the old row labels in an "index"
    # column.  It is kept because handle_mv_test produces the same column
    # and the feature matrices are built from all non-product columns;
    # dropping it only here would desynchronise train and test features.
    df = df.reset_index()

    # The previous merge/sort based imputation assigned a re-indexed frame
    # whose labels did not match the target rows, so the province medians
    # were mostly lost and the global median was used instead.
    df['renta'] = _impute_renta(df['renta'], df['nomprov'])

    # Stable sort keeps the row order deterministic within a month.
    df = df.sort_values(by="fecha_dato", kind="mergesort")

    return df

In [None]:
def encoding(df):
    """Turn the cleaned customer columns of the training frame into integer codes.

    ``age`` is bucketed into 0..6 using the edges 18/25/35/45/55/65, the
    categorical columns are mapped through fixed lookup tables and cast to
    int8, ``antiguedad`` becomes int16 and ``fecha_dato`` is cast to str.
    ``df`` is modified in place and returned.
    """
    print("\nencoding...")

    age_edges = [18, 25, 35, 45, 55, 65]
    no_yes = {"N": 0, "S": 1}

    def employee_flag(value):
        # 'N' and 'S' share code 0; every other value is coded 1.
        return 0 if value in ('N', 'S') else 1

    def coded(column, table):
        return df[column].map(table).astype(np.int8)

    df['age'] = df['age'].astype(np.int16)
    # digitize returns the bucket number: <18 -> 0, 18..24 -> 1, ..., >=65 -> 6
    df['age'] = np.digitize(df['age'].to_numpy(), age_edges)
    df['ind_empleado'] = df['ind_empleado'].map(employee_flag).astype(np.int8)
    df['sexo'] = coded('sexo', {"H": 0, "V": 1})
    df['antiguedad'] = df['antiguedad'].astype(np.int16)
    df['tiprel_1mes'] = coded('tiprel_1mes', {"A": 0, "I": 1, "P": 2, "R": 3, "N": 4})
    for flag_column in ('indresi', 'indext', 'indfall'):
        df[flag_column] = coded(flag_column, no_yes)
    df['segmento'] = coded('segmento', {"01 - TOP": 0,
                                        "02 - PARTICULARES": 1,
                                        "03 - UNIVERSITARIO": 2})
    df['fecha_dato'] = df['fecha_dato'].astype('str')
    return df

In [None]:
def makeX(df, thismonth):
    """Build the feature matrix for the customers observed in ``thismonth``.

    The rows are the ``clientfeatures`` columns of ``thismonth``,
    left-joined on ``ncodpers`` with the product columns (``prods``) of
    the five preceding months.  Relies on the notebook globals
    ``months`` (chronologically ordered), ``clientfeatures`` and ``prods``.

    Product history columns are named as the original implementation
    named them: ``<prod>0`` is the previous month, ``<prod>1`` ..
    ``<prod>3`` are two to four months back and the unsuffixed ``<prod>``
    is five months back.  Missing values are filled with 0 after each lag
    merge.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned and encoded training data.
    thismonth : str
        Value of ``fecha_dato`` to build features for.

    Returns
    -------
    pandas.DataFrame
        Features without ``ncodpers``, ``fecha_dato``, ``fecha_alta`` and
        ``nomprov``.

    Raises
    ------
    ValueError
        If ``thismonth`` is not in ``months`` or has fewer than five
        earlier months available.
    """
    print("\nmakeX...")

    history = 5
    month_pos = months.index(thismonth)
    if month_pos < history:
        # months[pos - k] with a negative index would silently wrap around
        # to the end of the list and pull "history" from the future.
        raise ValueError(
            "makeX needs {} months of history before {!r}, found {}".format(
                history, thismonth, month_pos))

    prevmonth = months[month_pos - 1]
    lagmonths = [months[month_pos - back] for back in range(2, history + 1)]
    product_cols = ['ncodpers'] + prods

    thismdata = df.loc[df.fecha_dato == thismonth, clientfeatures]
    prevmdata = df.loc[df.fecha_dato == prevmonth, product_cols]
    X = pd.merge(thismdata, prevmdata, how='left', on='ncodpers')

    for i, lag in enumerate(lagmonths):
        lagmdata = df.loc[df.fecha_dato == lag, product_cols]
        # String suffixes produce the same column names as the former
        # integer ones without relying on the falsy suffix 0.
        X = pd.merge(X, lagmdata, how='left', on='ncodpers',
                     suffixes=(str(i), str(i + 1)))
        X = X.fillna(0)

    want2drop = ['ncodpers', 'fecha_dato', 'fecha_alta', 'nomprov']
    return X.drop(columns=want2drop)

In [None]:
def getAddedProducts(mdata, prevmdata):
    """Flag the products each customer added relative to the previous month.

    Parameters
    ----------
    mdata : pandas.DataFrame
        ``ncodpers`` plus the current month's product columns suffixed ``_x``.
    prevmdata : pandas.DataFrame
        ``ncodpers`` plus the previous month's product columns suffixed ``_y``.

    Returns
    -------
    pandas.DataFrame
        One column per entry of the global ``targetprods`` and one row per
        row of ``mdata``: 1 where the product is held now but was not held
        before, 0 otherwise.  Customers absent from ``prevmdata`` are
        treated as having held nothing.
    """
    mgd = pd.merge(mdata, prevmdata, how='left', on='ncodpers').fillna(0)

    added = {}
    for pr in targetprods:
        # Subtract over ALL rows.  The former `.loc[1:, ...]` slice dropped
        # row 0 of the previous month, so the first customer got NaN (later
        # filled with 0) and could never have an added product.
        change = mgd[pr + '_x'] - mgd[pr + '_y']
        # -1 means the product was dropped, which does not count as added.
        added[pr] = change.where(change != -1, 0)

    return pd.DataFrame(added, index=mgd.index, columns=list(targetprods))

In [None]:
def makeY(df, thismonth):
    """Build the target matrix of products added in ``thismonth``.

    Compares the product columns (``prods``) of ``thismonth`` with those
    of the month that precedes it in the global ``months`` list and
    delegates the comparison to ``getAddedProducts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned and encoded training data.
    thismonth : str
        Value of ``fecha_dato`` to build targets for.

    Returns
    -------
    pandas.DataFrame
        Output of ``getAddedProducts`` with any remaining NaN set to 0.

    Raises
    ------
    ValueError
        If ``thismonth`` is not in ``months`` or is its first entry.
    """
    print("\nmakeY...")

    month_pos = months.index(thismonth)
    if month_pos < 1:
        # months[-1] would silently compare against the LAST month.
        raise ValueError(
            "makeY needs a month before {!r} to compare against".format(thismonth))
    prevmonth = months[month_pos - 1]

    product_cols = ['ncodpers'] + prods
    this_np = (df.loc[df.fecha_dato == thismonth, product_cols]
                 .rename(columns={p: p + "_x" for p in prods}))
    prevmdata = (df.loc[df.fecha_dato == prevmonth, product_cols]
                   .rename(columns={p: p + "_y" for p in prods}))

    y = getAddedProducts(this_np, prevmdata)
    return y.fillna(0)

In [None]:
def make_targetlist(prodict, yval):
    """Convert a 0/1 target frame into per-row lists of product names.

    For every row of ``yval`` the result holds ``prodict[i]`` for each
    column position ``i`` whose value equals 1, in column order.
    """
    return [
        [prodict[position] for position, flag in enumerate(row) if flag == 1]
        for row in yval.values
    ]

In [None]:
def best_hyperparameter(x, y):
    """Grid-search one XGBoost parameter set shared by all target products.

    A 2-fold ``GridSearchCV`` is run for every product in the global
    ``targetprods``.  The mean test scores of each candidate are summed
    over the products and the candidate with the highest total is
    returned.  Previously only the search for the last product decided
    the result, and every earlier fit was discarded.

    Parameters
    ----------
    x : pandas.DataFrame
        Training features.
    y : pandas.DataFrame
        Targets with one column per product in ``targetprods``.

    Returns
    -------
    tuple
        ``(max_depth, learning_rate, n_estimators)``.

    Raises
    ------
    ValueError
        If ``targetprods`` is empty.
    """
    xgb_clf = xgboost.XGBClassifier(use_label_encoder=False)

    xgb_param_grid = {'max_depth': [5, 7],   # 7, 0.1, 150 best
                      "learning_rate": [0.05, 0.1],
                      "n_estimators": [100, 150]}

    hp_grid = GridSearchCV(estimator=xgb_clf,
                           param_grid=xgb_param_grid,
                           n_jobs=-1, cv=2)

    total_score = None
    candidates = None
    for pr in targetprods:
        print(f"{pr}...")
        hp_grid.fit(x, y.loc[:, pr])
        # The candidate order is fixed by the grid, so the scores of
        # different products can be added position by position.  Failed
        # fits (NaN scores) count as 0.
        scores = np.nan_to_num(
            np.asarray(hp_grid.cv_results_['mean_test_score'], dtype=float))
        total_score = scores if total_score is None else total_score + scores
        candidates = hp_grid.cv_results_['params']

    if candidates is None:
        raise ValueError("targetprods is empty; nothing to tune")

    best = candidates[int(np.argmax(total_score))]
    return best['max_depth'], best['learning_rate'], best['n_estimators']

In [None]:
from sklearn.model_selection import GridSearchCV

def train_xgboost(Xtrain, ytrain, Xval, prodict):
    """Train one XGBoost classifier per target product and rank products.

    Parameters
    ----------
    Xtrain : pandas.DataFrame
        Training features.
    ytrain : pandas.DataFrame
        Training targets with one column per product in the global
        ``targetprods``.
    Xval : pandas.DataFrame
        Rows to score.
    prodict : dict
        Maps a position in ``targetprods`` to the product name.

    Returns
    -------
    prlist : list of list of str
        For every row of ``Xval``, up to seven product names ordered by
        decreasing predicted probability.
    clfdict : dict
        Fitted classifier for each product name.
    """
    print('\nTraining...')
    clfdict = {}
    probs = []
    freq = ytrain.sum(axis=0)
    #depth, lr, estimators = best_hyperparameter(Xtrain, ytrain)
    depth, lr, estimators = 7, 0.1, 150
    eps = 1e-6
    for pr in targetprods:
        # The prior is the product's frequency in the training rows.  It is
        # clipped away from 0 and 1 because the logistic objective needs a
        # base_score strictly inside (0, 1).
        prior = float(np.clip(freq[pr] / Xtrain.shape[0], eps, 1 - eps))
        clf = xgboost.XGBClassifier(max_depth=depth, learning_rate=lr,
                                    n_estimators=estimators,
                                    base_score=prior, nthread=2)
        clf.fit(Xtrain, ytrain.loc[:, pr])
        clfdict[pr] = clf
        probs.append(clf.predict_proba(Xval)[:, 1])

    probs = np.array(probs).T

    idsort7 = np.argsort(probs, axis=1)[:, :-8:-1]  # ids of seven greatest probs
    prlist = [[prodict[j] for j in irow] for irow in idsort7]

    return prlist, clfdict

In [None]:
def apk(actual, predicted, k=7):
    """
    Average precision at k between a set of relevant items and a ranking.

    Parameters
    ----------
    actual : list
             Items that should be predicted (order doesn't matter)
    predicted : list
                Predicted items, best first (order does matter)
    k : int, optional
        Only the first k predictions are scored
    Returns
    -------
    score : double
            The average precision at k; 0.0 when ``actual`` is empty.
            A prediction repeated later in the list is counted only once.
    """
    ranked = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(ranked, start=1):
        already_seen = item in ranked[:rank - 1]
        if item in actual and not already_seen:
            hits += 1.0
            precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

In [None]:
def mapk(actual, predicted, k=7):
    """Mean of ``apk`` over the paired entries of ``actual`` and ``predicted``."""
    per_row = []
    for truth, ranking in zip(actual, predicted):
        per_row.append(apk(truth, ranking, k))
    return np.mean(per_row)

In [None]:
# Products predicted by the models: the 12 with nonzero additions (may16).
targetprods = [
    'ind_recibo_ult1',
    'ind_cco_fin_ult1',
    'ind_nom_pens_ult1',
    'ind_nomina_ult1',
    'ind_tjcr_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_valo_fin_ult1',
]

# The 22 products with nonzero additions (jun15).
targetprods22 = [
    'ind_cco_fin_ult1',
    'ind_recibo_ult1',
    'ind_nom_pens_ult1',
    'ind_nomina_ult1',
    'ind_tjcr_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_viv_fin_ult1',
]

# Product columns used as lagged features.
prods = [
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

# Position in targetprods -> product name.
prodict = dict(enumerate(targetprods))


In [None]:
# --- Load and clean the training data ---------------------------------
directory = '../input/santander-product-recommendation'
size = 10 ** 6
train_zip = 'train.csv.zip'
train_csv = 'train.csv'
train = zip2csv_large(directory, size, train_zip, train_csv)

want2del = ['ult_fec_cli_1t', 'conyuemp', 'tipodom', 'pais_residencia', 'canal_entrada', 'cod_prov']
train = drop_features(train, want2del)

train = handle_mv(train)
train = encoding(train)

# makeX / makeY / makeX_test find neighbouring months by position in this
# list, so order it explicitly instead of relying on the row order that
# handle_mv happens to leave behind (ISO dates sort chronologically).
months = sorted(train.fecha_dato.unique())
train_month = '2015-06-28'

# NOTE(review): the iteration order of a set is arbitrary, so the feature
# column order is not guaranteed to match the one rebuilt from the test
# columns later on - confirm that prediction aligns columns with the model.
clientfeatures = list(set(train.columns) - set(prods))

# --- Build features / targets and split -------------------------------
X = makeX(train, train_month)
y = makeY(train, train_month)

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.2, random_state=42)
del X, y

# select only clients with added products
addedprods = np.sum(ytrain, axis=1)
Xtrain = Xtrain[addedprods != 0]
ytrain = ytrain[addedprods != 0]

targlist = make_targetlist(prodict, yval)

# These two product columns are not listed in `prods`, so they end up in
# clientfeatures as current-month values; remove them before training.
# Re-binding (instead of an in-place drop) avoids mutating frames that
# were derived from the split.
unlisted_products = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1']
Xtrain = Xtrain.drop(columns=unlisted_products)
Xval = Xval.drop(columns=unlisted_products)

# --- Train and evaluate -----------------------------------------------
prlist, tclfdict = train_xgboost(Xtrain, ytrain, Xval, prodict)

print('MAP@7 score: %0.5f' % mapk(targlist, prlist, 7))
del Xtrain, Xval, ytrain, yval

In [None]:
def encoding_test(df):
    """Turn the cleaned customer columns of the test frame into integer codes.

    ``age`` is bucketed into 0..6 using the edges 18/25/35/45/55/65 and the
    categorical columns are mapped through fixed lookup tables and cast to
    int8.  ``df`` is modified in place and returned.
    """
    print("\nencoding...")

    bucket_edges = [18, 25, 35, 45, 55, 65]

    def employee_flag(value):
        # 'N' and 'S' share code 0; every other value is coded 1.
        return 0 if value in ('N', 'S') else 1

    df['age'] = df['age'].astype(np.int16)
    # digitize returns the bucket number: <18 -> 0, 18..24 -> 1, ..., >=65 -> 6
    df['age'] = np.digitize(df['age'].to_numpy(), bucket_edges)
    df['ind_empleado'] = df['ind_empleado'].map(employee_flag).astype(np.int8)

    lookup_tables = (
        ('sexo', {"H": 0, "V": 1}),
        ('tiprel_1mes', {"A": 0, "I": 1, "P": 2, "R": 3, "N": 4}),
        ('indresi', {"N": 0, "S": 1}),
        ('indext', {"N": 0, "S": 1}),
        ('indfall', {"N": 0, "S": 1}),
        ('segmento', {"01 - TOP": 0,
                      "02 - PARTICULARES": 1,
                      "03 - UNIVERSITARIO": 2}),
    )
    for column, table in lookup_tables:
        df[column] = df[column].map(table).astype(np.int8)

    return df

In [None]:
def handle_mv_test(df):
    """Clean and impute the customer columns of the test data.

    * missing ``sexo``, ``tiprel_1mes``, ``nomprov`` and ``segmento``
      receive fixed default values;
    * ``indrel_1mes`` is normalised to int8 codes (``'P'`` -> 5, missing -> 0);
    * ``renta`` is parsed to float (``'NA'`` -> NaN) and missing values are
      filled with the median of the customer's province (``nomprov``) and,
      where the province has no income at all, with the global median.

    The cleaning steps assign columns on ``df`` itself.  The returned frame
    is a new object, sorted by ``nomprov`` (stable sort), with the former
    row labels in an ``index`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test frame; modified in place.

    Returns
    -------
    pandas.DataFrame
    """
    print("handle missing value...")

    def as_number(series, replacements=None):
        # Parse padded numeric text to float64; missing / 'NA' become NaN.
        if pd.api.types.is_numeric_dtype(series.dtype):
            return series.astype(np.float64)
        text = series.astype(str).str.strip()
        if replacements:
            text = text.replace(replacements)
        # astype(str) turns missing values into text such as 'nan'/'None';
        # mask them (and the 'NA' marker) back to real NaN before parsing.
        text = text.where(series.notna() & (text != 'NA'))
        return pd.to_numeric(text).astype(np.float64)

    # Going through object lets fillna accept values a categorical column
    # does not list yet.
    df['sexo'] = df['sexo'].astype('object').fillna('V')

    indrel_1mes = as_number(df['indrel_1mes'], replacements={'P': '5'})
    df['indrel_1mes'] = indrel_1mes.fillna(0).astype('int8')

    text_defaults = (
        ('tiprel_1mes', 'I'),
        ('nomprov', 'Unknown'),
        ('segmento', '02 - PARTICULARES'),
    )
    for col, default in text_defaults:
        df[col] = df[col].astype('object').fillna(default)

    # Works whether renta arrives as padded text or as numbers.
    df['renta'] = as_number(df['renta'])

    # NOTE(review): reset_index() stores the old row labels in an "index"
    # column.  It is kept because handle_mv produces the same column and
    # the feature matrices are built from all non-product columns.
    df = df.sort_values("nomprov", kind="mergesort").reset_index()

    # The previous merge/sort based imputation assigned a re-indexed frame
    # whose labels did not match the target rows, so the province medians
    # were mostly lost and the global median was used instead.
    renta = df['renta']
    renta = renta.fillna(renta.groupby(df['nomprov']).transform('median'))
    df['renta'] = renta.fillna(renta.median())

    return df

In [None]:
def makeX_test(df, thismonth, prtmonth='2016-05-28'):
    """Build the feature matrix for the test customers of ``thismonth``.

    The rows are the ``clientfeatures`` columns of ``df``, left-joined on
    ``ncodpers`` with the product columns (``prods``) that the global
    ``train`` frame holds for ``prtmonth`` and the four months before it.
    Relies on the notebook globals ``train``, ``months`` (chronologically
    ordered), ``clientfeatures`` and ``prods``.

    Product history columns carry the same names as in ``makeX``:
    ``<prod>0`` is ``prtmonth``, ``<prod>1`` .. ``<prod>3`` are one to
    three months earlier and the unsuffixed ``<prod>`` is four months
    earlier.  Customers without history get 0, as in ``makeX``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned and encoded test data.
    thismonth : str
        Value of ``fecha_dato`` to build features for.
    prtmonth : str, optional
        Most recent training month used as product history.

    Returns
    -------
    pandas.DataFrame
        Features without ``ncodpers``, ``fecha_dato``, ``fecha_alta`` and
        ``nomprov``.

    Raises
    ------
    ValueError
        If ``prtmonth`` is not in ``months`` or has fewer than four
        earlier months available.
    """
    print("\nmakeX...")

    n_lags = 4
    last_pos = months.index(prtmonth)
    if last_pos < n_lags:
        # months[pos - k] with a negative index would silently wrap around.
        raise ValueError(
            "makeX_test needs {} months before {!r}, found {}".format(
                n_lags, prtmonth, last_pos))
    lagmonths = [months[last_pos - back] for back in range(1, n_lags + 1)]
    product_cols = ['ncodpers'] + prods

    thismdata = df.loc[df.fecha_dato == thismonth, clientfeatures]

    # for test
    prtmdata = train.loc[train.fecha_dato == prtmonth, product_cols]
    X = pd.merge(thismdata, prtmdata, how='left', on='ncodpers')

    for i, lag in enumerate(lagmonths):
        lagmdata = train.loc[train.fecha_dato == lag, product_cols]
        # Downcast the product flags only.  Casting the whole frame to int8
        # also wrapped the customer id, so the merge below matched nothing
        # and every lag feature came out missing.
        lagmdata = lagmdata.fillna(0).astype({p: np.int8 for p in prods})
        X = pd.merge(X, lagmdata, how='left', on='ncodpers',
                     suffixes=(str(i), str(i + 1)))

    # Customers without history get 0 in the product columns, as in makeX.
    history_cols = [c for c in X.columns if c not in thismdata.columns]
    X[history_cols] = X[history_cols].fillna(0)

    want2drop = ['ncodpers', 'fecha_dato', 'fecha_alta', 'nomprov']
    return X.drop(columns=want2drop)

In [None]:
# --- Load and clean the test data -------------------------------------
test_month = '2016-06-28'

test_zip = 'test.csv.zip'
test_csv = 'test.csv'
test = zip2csv_large(directory, size, test_zip, test_csv)

test = drop_features(test, want2del)

test = handle_mv_test(test)
test = encoding_test(test)

# Keep the test frame's own column order; the iteration order of a set is
# arbitrary.
clientfeatures = [c for c in test.columns if c not in prods]

tids = test['ncodpers']

Xtest = makeX_test(test, test_month)

del test

# --- Predict ------------------------------------------------------------
# Present the columns in the order the models were trained on.  A missing
# feature raises a KeyError here instead of failing inside predict_proba.
trained_features = tclfdict[targetprods[0]].get_booster().feature_names
if trained_features is not None:
    Xtest = Xtest[list(trained_features)]

testprobs = []
for pr in targetprods:
    testprobs.append(tclfdict[pr].predict_proba(Xtest)[:, 1])
testprobs = np.array(testprobs).T

idsort7 = np.argsort(testprobs, axis=1)[:, :-8:-1]  # ids of seven greatest probs
predlist = [[prodict[j] for j in irow] for irow in idsort7]

# --- Write the submission -----------------------------------------------
subname = 'submission.csv'
sub = tids.to_frame()
sub['added_products'] = np.array([' '.join(p) for p in predlist])

sub.to_csv(subname, index=False)