In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder

In [18]:
train_path = 'santander-product-recommendation/train_dir/train_ver2.csv'
limit_rows = 200000
train_data = pd.read_csv(train_path, dtype = {'sexo':str, 'ind_nuevo':str,
                                              'ult_fec_cli_1t':str,
                                              'indext':str}, nrows = limit_rows)

In [19]:
# Converting Columns to English
train_data.columns = ['Status_Dt', 'Cust_ID', 'Emp_Status', 'Cust_Ctry', 'Cust_Sex', 'Age', 'Join_Dt',
                'Is_New_Cust', 'Cust_Since', 'Relship_Status','Lst_Dt_Primary_Cust', 'Cust_Type_Mth_Beg', 
                'Cust_Relship_Type_Mth_Beg', 'Residence_Ctry', 'Is_Foreigner', 'Is_Spouse_Emp', 'Join_Channel', 'Deceased_Flg', 
                'Address_Type', 'Cust_Province_Cd', 'Cust_Province_Name', 'Cust_Active_Status', 'Gross_HHLD_Income',
                'Cust_Segment', 'Savings_Acct', 'Guarantees', 'Cur_Acct', 'Derivative_Acct', 'Payroll_Acct',
                'Junior_Acct', 'Mas_Particular_Acct', 'Particular_Acct', 'Particular_Plus_Acct', 'Short_Term_Deposits',
                'Med_Term_Deposits', 'Long_Term_Deposits', 'e-Acct', 'Funds', 'Mortgage', 'Pension1', 'Loans',
                'Taxes', 'Credit_Card', 'Securities', 'Home_Acct', 'Payroll', 'Pension2', 'Direct_Debit']

In [20]:
# Fixing Age and Cust_Since
train_data["Age"]   = pd.to_numeric(train_data["Age"], errors="coerce")
train_data["Cust_Since"]   = pd.to_numeric(train_data["Cust_Since"], errors="coerce")
train_data["Gross_HHLD_Income"]   = pd.to_numeric(train_data["Gross_HHLD_Income"], errors="coerce")
train_data = train_data.drop(['Pension2'], axis = 1)

In [21]:
unimportant_features = ['Status_Dt','Cust_ID','Join_Dt','Is_New_Cust','Relship_Status','Lst_Dt_Primary_Cust',
                       'Cust_Ctry','Relship_Status','Address_Type','Cust_Province_Name','Cust_Active_Status']    

In [22]:
train_data['Payroll'] = train_data['Payroll'].fillna(0)
med = train_data['Gross_HHLD_Income'].median()
train_data['Gross_HHLD_Income'] = train_data['Gross_HHLD_Income'].fillna(med)
train_data.drop(columns=['Is_Spouse_Emp', 'Lst_Dt_Primary_Cust'], inplace=True)
train_data['Cust_Type_Mth_Beg'] = train_data['Cust_Type_Mth_Beg'].apply(lambda x: x[0] if isinstance(x,str) else str(x)[0])
train_data = train_data.dropna(subset = ['Emp_Status','Cust_Sex','Age','Cust_Since','Cust_Type_Mth_Beg', 'Cust_Relship_Type_Mth_Beg' , 'Residence_Ctry', 'Is_Foreigner', 'Join_Channel', 'Deceased_Flg','Cust_Province_Cd','Cust_Segment'])


In [23]:
inputs = ['Age',
'Cust_Since',
'Gross_HHLD_Income',
'Emp_Status',
'Cust_Type_Mth_Beg',
'Cust_Relship_Type_Mth_Beg',
'Join_Channel',
'Cust_Province_Cd',
'Cust_Segment',
 'Residence_Ctry',
'Is_Foreigner',
'Deceased_Flg']
X = train_data.loc[:, inputs]
y = train_data.iloc[:, 22:]

In [24]:
from sklearn.model_selection import train_test_split
X_dev, X_test, y_dev, y_test = train_test_split(X, y.to_numpy(), stratify=y.to_numpy()[:,1], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, stratify = y_dev[:, 1], test_size=0.25, random_state=42)


In [25]:
scalable = ['Age', 'Cust_Since', 'Gross_HHLD_Income']
one_hot = ['Emp_Status',
 'Cust_Type_Mth_Beg',
'Cust_Relship_Type_Mth_Beg',
'Join_Channel',
'Cust_Province_Cd',
'Cust_Segment']
ordinal = ['Residence_Ctry','Is_Foreigner','Deceased_Flg']

In [26]:
from sklearn.compose import make_column_transformer
transformer = make_column_transformer(
    (MinMaxScaler(), scalable),
    (OneHotEncoder(handle_unknown='infrequent_if_exist'), one_hot),
    (OrdinalEncoder(), ordinal)
)

X_train = transformer.fit_transform(X_train).toarray()
X_val = transformer.transform(X_val).toarray()
X_test = transformer.transform(X_test).toarray()
X_dev = transformer.transform(X_dev).toarray()


In [27]:
import timeit
def apk(actual, predicted, k):
    num_actual_products = sum(actual)
    expected_products = np.where(actual==1)[0]
    precision_at_k = []
    for i in range(1,k+1):
        num_hits = sum([recommended_product in expected_products 
             for recommended_product in np.argsort(predicted)[-i:]])
        proportion_of_hits = num_hits/i
        precision_at_k.append(proportion_of_hits*(np.argsort(predicted)[-i:][0] in expected_products))
    return sum(precision_at_k)/min(k,num_actual_products) if min(k,num_actual_products)!= 0 else 0

def mapk(actual, predicted, k):
    average_precisions = []
    i = 0
    start = timeit.default_timer()
    for a,p in zip(actual, predicted):
        average_precisions.append(apk(a,p,k))
        i+=1
        if i%100000 == 0:
            stop = timeit.default_timer()
            print(f"{stop-start} - {i} predictions have been processed with a MAP of {np.mean(average_precisions)}")
            start = timeit.default_timer()
            break
    return np.mean(average_precisions)

In [32]:
# Training Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
best_dt_params = {}
max_score = -1
depth = [10, 15]
splitter = ['best', 'random']
for d in depth:
    for s in splitter:
        dt = DecisionTreeClassifier(max_depth=d, splitter=s)
        dt.fit(X_dev, y_dev)
        pred_dt = dt.predict(X_val)
        score = mapk(y_val, pred_dt, 7)
        if score > max_score:
            max_score = score
            best_dt_params['max_depth'] = d
            best_dt_params['split'] = s

dt_tuned = DecisionTreeClassifier(max_depth = best_dt_params['max_depth'], splitter = best_dt_params['split'])
dt_tuned.fit(X_train, y_train)
dt_tuned_pred = dt_tuned.predict(X_test)
print(mapk(y_test, dt_tuned_pred, 7))

0.0


In [12]:
# Training a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
num_est = [150, 200, 250]
max_depth = [10, 15]
best_params = {}
max_score = -1
for e in num_est:
    for d in max_depth:
        rf = RandomForestClassifier(verbose=1, n_estimators=e, max_depth=d, n_jobs=4)
        rf.fit(X_dev, y_dev)
        pred_rf = rf.predict(X_val)
        score = mapk(y_val, pred_rf, 7)
        if score > max_score:
            max_score = score
            best_params['n_estimators'] = e
            best_params['max_depth'] = d
        print(best_params)
        print(score)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.0s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   43.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    6.6s finished


{'n_estimators': 150, 'max_depth': 10}
0.8709687824129119


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.9s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   54.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    8.9s finished


{'n_estimators': 150, 'max_depth': 15}
0.8775615870067321


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   59.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  1.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   10.0s finished


{'n_estimators': 150, 'max_depth': 15}
0.8717173960918158


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  1.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   10.6s finished


{'n_estimators': 200, 'max_depth': 15}
0.8778335664926161


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   57.7s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  1.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.8s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:   14.1s finished


{'n_estimators': 200, 'max_depth': 15}
0.8715244294653444


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  1.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.7s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:   12.6s finished


{'n_estimators': 250, 'max_depth': 15}
0.877947005410477


In [15]:
# Training the Optimal Model
rf_tuned = RandomForestClassifier(verbose=1, n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], n_jobs=4)
rf_tuned.fit(X_train, y_train)
score_final = mapk(y_test, rf_tuned.predict(X_test), 7)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   48.1s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  1.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.2s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:   13.1s finished


In [16]:
score_final

0.8744820462189447