# 46-final-model

> Compare three models: random forest, MLP, and XGBoost

> Train the final model on data from 2015 through August 2018, test it on September 2018 data, and use data through October 2018 to predict November 2018

In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.neural_network import MLPClassifier
import os

In [5]:
def train_process(file, real_x, real_y, model, test_size=0.25, seed=114514,
                  top_n=100000,
                  fe_dir="/data/p_dsi/teams2022/team_1/fe_data/",
                  label_dir="/data/p_dsi/teams2022/team_1/new_data/"):
    """Fit ``model`` on a class-balanced resample and score its top-``top_n`` ranking.

    Steps:
      1. Read the feature file ``file`` from ``fe_dir`` and replace NaN with -1.
      2. Split it with ``train_test_split``; only the training part is used
         (the split is kept so the training rows match earlier runs).
      3. Resample the training part with replacement so that each class has
         ``2 * (number of response == 1 training rows)`` rows.
      4. Fit ``model`` on every column except ``indiv_id``, ``response`` and
         ``store``.
      5. Read ``real_x`` from ``fe_dir``, rank its rows by the predicted
         probability in column 1 of ``predict_proba`` and keep the first
         ``top_n`` distinct ``indiv_id`` values.
      6. Read ``real_y`` from ``label_dir`` and print the share of those ids
         that appear in rows with ``prod_group_code == 5``.

    Parameters
    ----------
    file : str
        Training feature CSV, relative to ``fe_dir``. Must contain
        ``response`` and ``indiv_id`` columns.
    real_x : str
        Feature CSV to rank, relative to ``fe_dir``. Must contain ``indiv_id``
        and every feature column used for training.
    real_y : str
        CSV relative to ``label_dir`` with ``indiv_id`` and
        ``prod_group_code`` columns.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is fitted
        in place.
    test_size : float, default 0.25
        Fraction passed to ``train_test_split``.
    seed : int, default 114514
        Random state for the split and for both resampling draws.
    top_n : int, default 100000
        Number of distinct ids to keep; also the denominator of the printed
        index, even when fewer ids are available.
    fe_dir, label_dir : str
        Directories holding the feature and label files.

    Returns
    -------
    tuple
        ``(model, top100k_id_new)``: the fitted model and the array of
        selected ids, best-ranked first.

    Raises
    ------
    ValueError
        If ``top_n`` is not positive.
    """
    if top_n <= 0:
        raise ValueError("top_n must be a positive integer")

    # Columns that identify a row or carry the label; everything else is a feature.
    non_feature_cols = ('indiv_id', 'response', 'store')

    new_data = pd.read_csv(os.path.join(fe_dir, file)).fillna(-1)
    X = new_data.drop('response', axis=1)
    Y = new_data['response']
    X_train_old, _, Y_train, _ = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    # Balance the classes: draw the same number of rows (with replacement) from each.
    train = pd.concat([X_train_old, Y_train], axis=1)
    length = 2 * len(train[train['response'] == 1])
    buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=seed)
    nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=seed)
    new_train = pd.concat([buy, nobuy])

    feature_cols = [c for c in new_train.columns if c not in non_feature_cols]
    X_train = new_train[feature_cols]
    Y_train = new_train['response']
    print("data loaded")
    print("model set")

    model.fit(X_train, Y_train)
    print("model fit")

    # Select the scoring columns by name so they match the training features
    # in both content and order.
    Real_X = pd.read_csv(os.path.join(fe_dir, real_x)).fillna(-1)
    proba = model.predict_proba(Real_X[feature_cols])
    print("model predict")

    ranked = pd.DataFrame({
        'indiv_id': Real_X['indiv_id'].to_numpy(),
        'pred_1': proba[:, 1],
    }).sort_values('pred_1', ascending=False)
    # unique() keeps first-appearance order, so the best-ranked row of each id wins.
    top100k_id_new = ranked['indiv_id'].unique()[:top_n]

    Real_Y = pd.read_csv(os.path.join(label_dir, real_y))
    # NOTE(review): product group 5 is presumably the target product - confirm.
    Real_Y_id = Real_Y.loc[Real_Y['prod_group_code'] == 5, 'indiv_id'].unique()

    # Vectorised membership test instead of a Python-level `x in ndarray` per id.
    hits = int(np.isin(top100k_id_new, Real_Y_id).sum())
    index = hits / top_n
    print("index:", index)
    return model, top100k_id_new

## RF

In [30]:
# Random forest: fit on 2015_17.csv, rank 2015_18_02.csv, score against 20180331.csv.
train_process(
    file='2015_17.csv',
    real_x='2015_18_02.csv',
    real_y='20180331.csv',
    model=RandomForestClassifier(n_estimators=100),
)

data loaded
model set
model fit
model predict
index: 0.07308


(RandomForestClassifier(),
 array([  5.32334493e+08,   3.14545170e+08,   3.22494400e+08, ...,
          2.62157041e+08,   2.88510415e+08,   2.62156573e+08]))

In [31]:
# Random forest: fit on 2015_18_02.csv, rank 2015_18_04.csv, score against 20180531.csv.
train_process(
    file='2015_18_02.csv',
    real_x='2015_18_04.csv',
    real_y='20180531.csv',
    model=RandomForestClassifier(n_estimators=100),
)

data loaded
model set
model fit
model predict
index: 0.07415


(RandomForestClassifier(),
 array([  3.18411810e+08,   3.21146366e+08,   4.25423680e+08, ...,
          3.14550182e+08,   2.76752632e+08,   2.72471729e+08]))

In [32]:
# Random forest: fit on 2015_18_04.csv, rank 2015_18_06.csv, score against 20180731.csv.
train_process(
    file='2015_18_04.csv',
    real_x='2015_18_06.csv',
    real_y='20180731.csv',
    model=RandomForestClassifier(n_estimators=100),
)

data loaded
model set
model fit
model predict
index: 0.07898


(RandomForestClassifier(),
 array([  2.53533886e+08,   5.04321878e+08,   2.68629500e+08, ...,
          2.76274493e+08,   2.79193656e+08,   2.91443136e+08]))

In [33]:
# Random forest: fit on 2015_18_06.csv, rank 2015_18_08.csv, score against 20180930.csv.
# n_estimators is spelled out so this run is configured exactly like the
# other three random-forest windows, independent of the library default.
train_process(
    file='2015_18_06.csv',
    real_x='2015_18_08.csv',
    real_y='20180930.csv',
    model=RandomForestClassifier(n_estimators=100),
)

data loaded
model set
model fit
model predict
index: 0.0754


(RandomForestClassifier(),
 array([  5.32631095e+08,   3.06307678e+08,   2.82381241e+08, ...,
          2.64036656e+08,   2.88585451e+08,   3.66168699e+08]))

## MLP

In [22]:
# MLP (default settings): fit on 2015_17.csv, rank 2015_18_02.csv, score against 20180331.csv.
train_process(
    file='2015_17.csv',
    real_x='2015_18_02.csv',
    real_y='20180331.csv',
    model=MLPClassifier(),
)

data loaded
model set
model fit
model predict
index: 0.11445


(MLPClassifier(),
 array([  2.73830959e+08,   2.91967266e+08,   5.42354822e+08, ...,
          3.10656646e+08,   2.67277997e+08,   2.67928362e+08]))

In [23]:
# MLP (default settings): fit on 2015_18_02.csv, rank 2015_18_04.csv, score against 20180531.csv.
train_process(
    file='2015_18_02.csv',
    real_x='2015_18_04.csv',
    real_y='20180531.csv',
    model=MLPClassifier(),
)

data loaded
model set
model fit
model predict
index: 0.11108


(MLPClassifier(),
 array([  2.61233660e+08,   4.24093950e+08,   2.56980288e+08, ...,
          2.78547689e+08,   2.95722273e+08,   2.89846689e+08]))

In [24]:
# MLP (default settings): fit on 2015_18_04.csv, rank 2015_18_06.csv, score against 20180731.csv.
train_process(
    file='2015_18_04.csv',
    real_x='2015_18_06.csv',
    real_y='20180731.csv',
    model=MLPClassifier(),
)

data loaded
model set
model fit
model predict
index: 0.11188


(MLPClassifier(),
 array([  2.81381331e+08,   4.24093950e+08,   2.81842221e+08, ...,
          2.82784478e+08,   2.54052915e+08,   2.54617456e+08]))

In [25]:
# MLP (default settings): fit on 2015_18_06.csv, rank 2015_18_08.csv, score against 20180930.csv.
train_process(
    file='2015_18_06.csv',
    real_x='2015_18_08.csv',
    real_y='20180930.csv',
    model=MLPClassifier(),
)

data loaded
model set
model fit
model predict
index: 0.11039


(MLPClassifier(),
 array([  2.56980288e+08,   2.59698763e+08,   5.32456325e+08, ...,
          2.62964958e+08,   2.58604744e+08,   2.81410353e+08]))

## XGBoost

In [26]:
# XGBoost (default settings): fit on 2015_17.csv, rank 2015_18_02.csv, score against 20180331.csv.
train_process(
    file='2015_17.csv',
    real_x='2015_18_02.csv',
    real_y='20180331.csv',
    model=XGBClassifier(),
)

data loaded
model set




model fit
model predict
index: 0.10365


(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
               gamma=0, gpu_id=-1, importance_type=None,
               interaction_constraints='', learning_rate=0.300000012,
               max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
               monotone_constraints='()', n_estimators=100, n_jobs=8,
               num_parallel_tree=1, predictor='auto', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', validate_parameters=1, verbosity=None),
 array([  2.54315387e+08,   2.80719092e+08,   2.53795219e+08, ...,
          2.67867592e+08,   5.34305983e+08,   2.78207274e+08]))

In [27]:
# XGBoost (default settings): fit on 2015_18_02.csv, rank 2015_18_04.csv, score against 20180531.csv.
train_process(
    file='2015_18_02.csv',
    real_x='2015_18_04.csv',
    real_y='20180531.csv',
    model=XGBClassifier(),
)

data loaded
model set




model fit
model predict
index: 0.10382


(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
               gamma=0, gpu_id=-1, importance_type=None,
               interaction_constraints='', learning_rate=0.300000012,
               max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
               monotone_constraints='()', n_estimators=100, n_jobs=8,
               num_parallel_tree=1, predictor='auto', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', validate_parameters=1, verbosity=None),
 array([  4.45870572e+08,   2.75926385e+08,   2.53849821e+08, ...,
          2.59109931e+08,   2.54113772e+08,   2.85595293e+08]))

In [28]:
# XGBoost (default settings): fit on 2015_18_04.csv, rank 2015_18_06.csv, score against 20180731.csv.
train_process(
    file='2015_18_04.csv',
    real_x='2015_18_06.csv',
    real_y='20180731.csv',
    model=XGBClassifier(),
)

data loaded
model set




model fit
model predict
index: 0.1016


(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
               gamma=0, gpu_id=-1, importance_type=None,
               interaction_constraints='', learning_rate=0.300000012,
               max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
               monotone_constraints='()', n_estimators=100, n_jobs=8,
               num_parallel_tree=1, predictor='auto', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', validate_parameters=1, verbosity=None),
 array([  2.63360742e+08,   2.61233660e+08,   2.89588974e+08, ...,
          2.58404726e+08,   2.86031740e+08,   2.77227202e+08]))

In [29]:
# XGBoost (default settings): fit on 2015_18_06.csv, rank 2015_18_08.csv, score against 20180930.csv.
train_process(
    file='2015_18_06.csv',
    real_x='2015_18_08.csv',
    real_y='20180930.csv',
    model=XGBClassifier(),
)

data loaded
model set




model fit
model predict
index: 0.103


(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
               gamma=0, gpu_id=-1, importance_type=None,
               interaction_constraints='', learning_rate=0.300000012,
               max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
               monotone_constraints='()', n_estimators=100, n_jobs=8,
               num_parallel_tree=1, predictor='auto', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
               tree_method='exact', validate_parameters=1, verbosity=None),
 array([  6.11852313e+08,   2.77364529e+08,   2.93169414e+08, ...,
          2.80528365e+08,   2.59828104e+08,   2.66650740e+08]))

## Gridsearch

In [14]:
# Build the class-balanced training set and the held-out set used by the grid search.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_06.csv").fillna(-1)
Y = new_data['response']
X = new_data.drop(columns='response')
X_train_old, X_test_old, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=114514
)

# Re-attach the label so both classes can be resampled from a single frame.
train = pd.concat([X_train_old, Y_train], axis=1)
is_buyer = train['response'] == 1
length = 2 * int(is_buyer.sum())  # rows to draw from each class
buy = train[is_buyer].sample(n=length, replace=True, random_state=114514)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=114514)
new_train = pd.concat([buy, nobuy])

# Identifier and label columns are not model inputs.
Y_train = new_train['response']
X_train = new_train.drop(columns=['indiv_id', 'response', 'store'])
X_test = X_test_old.drop(columns=['indiv_id', 'store'])

In [15]:
# Exhaustive 5-fold grid search over MLP hyper-parameters (2**5 = 32 combinations).
parameter_space = dict(
    hidden_layer_sizes=[(10, 30, 10), (20,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)
mlp_gs = MLPClassifier(max_iter=100)
clf = GridSearchCV(estimator=mlp_gs, param_grid=parameter_space, cv=5, n_jobs=-1)
# NOTE(review): X_train was drawn with replacement, so copies of one row can
# fall into different folds and the CV scores may be optimistic - confirm.
clf.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=100), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(10, 30, 10), (20,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [16]:
# Show the hyper-parameter combination selected by the grid search.
clf.best_params_

{'activation': 'relu',
 'alpha': 0.0001,
 'hidden_layer_sizes': (10, 30, 10),
 'learning_rate': 'adaptive',
 'solver': 'adam'}

In [18]:
# Held-out predictions from the grid-search estimator (kept for inspection;
# they are not used in the index computed below).
Y_prob = pd.DataFrame(clf.predict_proba(X_test), columns = ['pred_0', 'pred_1'])
Y_pred = clf.predict(X_test)
print("model predict")

# Rank every row of the next feature window by predicted probability in column 1.
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(clf.predict_proba(Real_X.drop(['indiv_id', 'response','store'], axis=1)), columns = ['pred_0', 'pred_1'])
# unique() keeps first-appearance order, so this is the 100,000 best-ranked distinct ids.
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]

# Ids with at least one row in product group 5 in the label file.
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code']==5]['indiv_id'].unique()

# Vectorised membership test; the previous `x in ndarray` inside a list
# comprehension rescanned the whole label array for each of the 100,000 ids.
index = int(np.isin(top100k_id_new, Real_Y_id).sum())/100000
print("index:", index)

model predict
index: 0.11051


## Final train and predict

In [19]:
# Settings read by the final training cell: random state for the split and
# resampling, and the fraction of rows held out by train_test_split.
seed=114514
test_size=0.25

In [21]:
# Inspect the columns of the current resampled training frame.
# NOTE(review): the recorded output has no 'store' column even though earlier
# cells drop it - the feature files may have changed between runs; confirm.
new_train.columns

Index(['indiv_id', 'total_transaction', 'sales_total', 'tire_purchases',
       'service_purchases', 'other_purchases', 'days_since_first_transaction',
       'days_since_last_transaction', 'days_since_first_tire_purchase',
       'days_since_last_tire_purchase', 'vehicle_count', 'model_year_avg',
       'region', 'tire_purchase_freq', 'response'],
      dtype='object')

In [22]:
# --- Load the final training window and split it --------------------------
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
new_data = new_data.fillna(-1)
X = new_data.drop('response',axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size = test_size, random_state = seed)

# --- Balance the classes by resampling with replacement -------------------
train = pd.concat([X_train_old, Y_train], axis=1)
length = 2*len(train[train['response']==1])
buy = train[train['response']==1].sample(n=length, replace=True, random_state = seed)
nobuy = train[train['response']==0].sample(n=length, replace=True, random_state = seed)
new_train = pd.concat([buy,nobuy])
X_train = new_train.drop(['indiv_id','response'], axis=1)
Y_train = new_train['response']
X_test = X_test_old.drop(['indiv_id'], axis = 1)
print("data loaded")

# --- Fit the MLP with the grid-search winner ------------------------------
# random_state makes the weight initialisation and batch shuffling repeatable,
# so the delivered ranking can be regenerated.
# (learning_rate only takes effect with solver='sgd'; it is kept so the call
# mirrors the selected parameter set.)
model = MLPClassifier(activation= 'relu', alpha=0.0001, hidden_layer_sizes= (10, 30, 10), learning_rate='adaptive',solver='adam', random_state=seed)
print("model set")
model.fit(X_train, Y_train)
print("model fit")

# Held-out predictions, kept for inspection; not used in the index below.
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns = ['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
print("model predict")

# --- Rank the next window and score the top 100,000 ids -------------------
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_09.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response'], axis=1)), columns = ['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20181031.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code']==5]['indiv_id'].unique()
# Vectorised membership test instead of a per-id `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum())/100000
print("index:", index)

data loaded
model set
model fit
model predict
index: 0.10855


In [24]:
# Score the 2015_18_10 feature file with the fitted model and export the
# 100,000 best-ranked distinct ids.
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_10.csv").fillna(-1)
scoring_features = Real_X.drop(columns=['indiv_id'])
Real_Y_prob = pd.DataFrame(
    model.predict_proba(scoring_features),
    columns=['pred_0', 'pred_1'],
)
ranked = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)
top100k_id_new = ranked['indiv_id'].unique()[:100000]
# NOTE(review): the ids display as floats, so the CSV will hold float-formatted ids - confirm consumers accept that.
pd.DataFrame(top100k_id_new).to_csv('/data/p_dsi/teams2022/team_1/top100000.csv')

In [25]:
# Display the selected ids, best-ranked first (NumPy abbreviates the repr).
top100k_id_new

array([  2.61233660e+08,   5.32456325e+08,   2.56980288e+08, ...,
         2.67684336e+08,   2.66689889e+08,   2.86305973e+08])