# 44-model-new

> train model for guessing Sep 2018


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import os

In [2]:
# Load the 2015_18_08 feature CSV into a DataFrame.
data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")

In [3]:
# Replace every missing value in the table with 0.
data = data.fillna(0)

In [4]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, random_state = 42, shuffle = True)

In [5]:
# Materialise the (train, test) row positions of every fold: kf.split yields
# one pair of index arrays per fold, so transposing the pairs gives one list
# of train arrays and one list of test arrays.
train_list, test_list = (list(part) for part in zip(*kf.split(data)))

In [6]:
data.columns

Index(['indiv_id', 'total_transaction', 'sales_total', 'tire_purchases',
       'service_purchases', 'other_purchases', 'days_since_first_transaction',
       'days_since_last_transaction', 'days_since_first_tire_purchase',
       'days_since_last_tire_purchase', 'vehicle_count', 'model_year_avg',
       'store', 'tire_purchase_freq', 'response'],
      dtype='object')

In [7]:
# Count the rows for each value of the 'response' column.
data.groupby(["response"]).size()

response
0    5541516
1     103382
dtype: int64

In [8]:
# Slice the table into the five (train, test) frame pairs, one per fold,
# using the positional indices collected from the splitter.
(train1, test1), (train2, test2), (train3, test3), (train4, test4), (train5, test5) = (
    (data.iloc[train_list[fold]], data.iloc[test_list[fold]])
    for fold in range(5)
)

In [9]:
def model_fold(train, test, model):
    """Fit ``model`` on one fold and return the test rows with predictions.

    The model is fitted on every column of ``train`` except 'indiv_id',
    'response' and 'store', with 'response' as the target. The returned
    frame holds the columns of ``test`` without 'response' (so 'indiv_id'
    and 'store' are kept for reference) plus four appended columns:
    'actual' (the true response), 'pred' (predicted class), and 'prob_0' /
    'prob_1' (the first and second columns of ``predict_proba``).

    ``test`` itself is not modified; ``model`` is fitted in place.
    """
    id_columns = ['indiv_id', 'store']

    features_train = train.drop(columns=id_columns + ['response'])
    holdout = test.drop(columns=['response'])
    features_test = holdout.drop(columns=id_columns)

    model.fit(features_train, train["response"])
    labels = model.predict(features_test)
    probabilities = model.predict_proba(features_test)

    # assign() works on a copy and appends the new columns in keyword order.
    return holdout.assign(
        actual=test['response'],
        pred=labels,
        prob_0=list(probabilities[:, 0]),
        prob_1=list(probabilities[:, 1]),
    )

In [10]:
# Random forest with scikit-learn's default hyper-parameters (no random_state is fixed here).
model = RandomForestClassifier()

In [11]:
# Fit and score each fold in order (the same estimator object is refitted
# every time), then stack the per-fold prediction frames row-wise.
x1, x2, x3, x4, x5 = (
    model_fold(fold_train, fold_test, model)
    for fold_train, fold_test in (
        (train1, test1),
        (train2, test2),
        (train3, test3),
        (train4, test4),
        (train5, test5),
    )
)
total_x = pd.concat([x1, x2, x3, x4, x5], axis = 0)

KeyboardInterrupt: 

In [None]:
# Show only the rows whose predicted class is 1.
total_x[total_x['pred']==1]

In [None]:
# Rank rows by predicted probability of class 1 (highest first) and keep the
# first 100,000 distinct indiv_id values in that order.
top100k_id = total_x.sort_values('prob_1', ascending=False)['indiv_id'].unique()[:100000]

In [None]:
# Share of the top-100k ranked ids that appear in 20180930.csv with
# prod_group_code == 5.
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code']==5]['indiv_id'].unique()
# Vectorised membership test: `x in ndarray` inside a comprehension rescans
# the whole id array for every one of the 100k candidates.
index = int(np.isin(top100k_id, Real_Y_id).sum()) / 100000

In [None]:
index

### Shuyang's Approach (w/ some tweaks)

#### 2018_06 to predict 2018_08 and test with September 2018

In [None]:
# Fraction of rows held out by train_test_split in the cells below.
test_size = 0.25

### Random forest classifier

In [None]:
# Train a random forest on the 2015_18_06 snapshot, score the 2015_18_08
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180930.csv with prod_group_code == 5.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_06.csv")
new_data = new_data.fillna(-1)
# Fix: split the snapshot loaded above. The original split the global `data`
# left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

### XGBoost classifier

In [None]:
# Train an XGBoost classifier on the 2015_18_06 snapshot, score the
# 2015_18_08 snapshot, and compute the share of the top-100k scored ids that
# appear in 20180930.csv with prod_group_code == 5.
# NOTE(review): XGBClassifier is not imported in any visible cell; run
# `from xgboost import XGBClassifier` before executing this cell.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_06.csv")
new_data = new_data.fillna(-1)
# Fix: split the snapshot loaded above. The original split the global `data`
# left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the evaluation set used for early stopping has
# the same columns as the training features.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = XGBClassifier()
eval_set = [(X_test, Y_test)]
model.fit(X_train, Y_train, early_stopping_rounds = 10, eval_metric = 'error', eval_set = eval_set, verbose=True)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000

### Neural Network

In [None]:
# Train a multi-layer perceptron on the 2015_18_06 snapshot, score the
# 2015_18_08 snapshot, and compute the share of the top-100k scored ids that
# appear in 20180930.csv with prod_group_code == 5.
# Fix: MLPClassifier was never imported in the notebook.
from sklearn.neural_network import MLPClassifier

new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_06.csv")
new_data = new_data.fillna(-1)
# Fix: split the snapshot loaded above. The original split the global `data`
# left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
# Fix: instantiate the estimator. The original bound the class itself, so
# `model.fit(X_train, Y_train)` was called without an instance.
model = MLPClassifier()
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000

#### Stacked Approach

In [None]:
# Train a random forest on four stacked snapshots, score the 2015_18_08
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180930.csv with prod_group_code == 5.
files = ['2015_18_06.csv','2015_18_04.csv','2015_17.csv','2015_18_02.csv']
# ignore_index gives the stacked frame unique row labels; each file otherwise
# contributes its own 0..n-1 index. The duplicated fillna call was removed.
new_data = pd.concat(
    [pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/" + x) for x in files],
    ignore_index=True,
).fillna(-1)
# Fix: split the stacked frame built above. The original split the global
# `data` left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

#### 2017 to predict 2018_02 and compare with March 2018

In [None]:
# Train a random forest on the 2015_17 snapshot, score the 2015_18_02
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180331.csv with prod_group_code == 5.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_17.csv")
new_data = new_data.fillna(-1)
# Fix: split the snapshot loaded above. The original split the global `data`
# left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_02.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180331.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

#### 2018_02 to predict 2018_04 and compare with May 2018

In [None]:
# Train a random forest on the 2015_18_02 snapshot, score the 2015_18_04
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180531.csv with prod_group_code == 5.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_02.csv")
new_data = new_data.fillna(-1)
# Fix: split the snapshot loaded above. The original split the global `data`
# left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_04.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180531.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

#### Stacked Approach

In [None]:
# Train a random forest on two stacked snapshots, score the 2015_18_04
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180531.csv with prod_group_code == 5.
files = ['2015_17.csv','2015_18_02.csv']
# ignore_index gives the stacked frame unique row labels; each file otherwise
# contributes its own 0..n-1 index. The duplicated fillna call was removed.
new_data = pd.concat(
    [pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/" + x) for x in files],
    ignore_index=True,
).fillna(-1)
# Fix: split the stacked frame built above. The original split the global
# `data` left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_04.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180531.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

#### 2018_04 to predict 2018_06 and compare with July 2018

In [None]:
# Train a random forest on the 2015_18_04 snapshot, score the 2015_18_06
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180731.csv with prod_group_code == 5.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_04.csv")
new_data = new_data.fillna(-1)
# Fix: split the snapshot loaded above. The original split the global `data`
# left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_06.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180731.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

#### Stacked Approach

In [None]:
# Train a random forest on three stacked snapshots, score the 2015_18_06
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180731.csv with prod_group_code == 5.
files = ['2015_18_04.csv','2015_17.csv','2015_18_02.csv']
# ignore_index gives the stacked frame unique row labels; each file otherwise
# contributes its own 0..n-1 index. The duplicated fillna call was removed.
new_data = pd.concat(
    [pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/" + x) for x in files],
    ignore_index=True,
).fillna(-1)
# Fix: split the stacked frame built above. The original split the global
# `data` left over from an earlier cell, so `new_data` was never used.
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
# Fix: drop 'store' too, so the hold-out features match the fitted columns.
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, Y_train)
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
Real_X = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_06.csv")
Real_X = Real_X.fillna(-1)
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180731.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

#### 2018_08 to predict 2018_08 and compare with Sep 2018

In [8]:
# Train a random forest on the 2015_18_08 snapshot, score that same
# snapshot, and report the share of the top-100k scored ids that appear in
# 20180930.csv with prod_group_code == 5.
new_data = pd.read_csv("/data/p_dsi/teams2022/team_1/fe_data/2015_18_08.csv")
new_data = new_data.fillna(-1)
X = new_data.drop('response', axis=1)
Y = new_data['response']
X_train_old, X_test_old, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
train = pd.concat([X_train_old, Y_train], axis=1)
# Balance the classes: draw (with replacement) twice the number of positive
# rows from each class.
length = 2 * len(train[train['response'] == 1])
buy = train[train['response'] == 1].sample(n=length, replace=True, random_state=42)
nobuy = train[train['response'] == 0].sample(n=length, replace=True, random_state=42)
new_train = pd.concat([buy, nobuy])
X_train = new_train.drop(['indiv_id', 'response', 'store'], axis=1)
Y_train = new_train['response']
X_test = X_test_old.drop(['indiv_id', 'store'], axis=1)
print("data loaded")
model = RandomForestClassifier(n_estimators=100)
print("model set")
model.fit(X_train, Y_train)
print("model fit")
Y_prob = pd.DataFrame(model.predict_proba(X_test), columns=['pred_0', 'pred_1'])
Y_pred = model.predict(X_test)
print("model predict")
# The scoring snapshot is the same file, filled the same way, as `new_data`;
# reuse it instead of parsing the CSV a second time.
# NOTE(review): the model is therefore scored on rows it may have been
# fitted on, so the hit rate below is not an out-of-sample estimate.
Real_X = new_data
Real_Y_prob = pd.DataFrame(model.predict_proba(Real_X.drop(['indiv_id', 'response', 'store'], axis=1)), columns=['pred_0', 'pred_1'])
top100k_id_new = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)['indiv_id'].unique()[0:100000]
Real_Y = pd.read_csv("/data/p_dsi/teams2022/team_1/new_data/20180930.csv")
Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
# Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / 100000
index

data loaded
model set
model fit
model predict


0.5922

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the hold-out split:
# each feature is shuffled 10 times with a fixed seed.
r = permutation_importance(model, X_test, Y_test,
                           n_repeats=10,
                           random_state=0)
# One row per hold-out feature column. Fix: 'STD_Importance' was declared
# but never filled; it now holds the spread across the repeats.
perm = pd.DataFrame(
    {
        'AVG_Importance': r.importances_mean,
        'STD_Importance': r.importances_std,
    },
    index=X_test.columns,
)

KeyboardInterrupt: 

In [None]:
perm

In [None]:
def train_process(file, real_file, model, test_size=0.25, seed=114514,
                  real_y_file="20180930.csv", top_k=100000):
    """Fit ``model`` on one feature snapshot and score another.

    The training snapshot is split with ``train_test_split``; only the
    training part is used. It is class-balanced by drawing, with
    replacement, twice the number of positive rows from each class. The
    fitted model then scores every row of the scoring snapshot, and the
    ``top_k`` distinct ``indiv_id`` values with the highest class-1
    probability are compared with the ids that have ``prod_group_code == 5``
    in the outcome file.

    Args:
        file: CSV file name of the training snapshot (inside fe_data/).
        real_file: CSV file name of the snapshot to score (inside fe_data/).
        model: Unfitted estimator exposing ``fit`` and ``predict_proba``;
            it is fitted in place.
        test_size: Fraction of the training snapshot set aside by the split.
        seed: Random state for the split and for the resampling.
        real_y_file: CSV file name of the outcome data (inside new_data/).
            Defaults to the file that was previously hard-coded.
        top_k: Number of top-ranked ids to keep; also the denominator of the
            printed hit rate.

    Returns:
        Tuple ``(model, top_ids)``: the fitted model and the array of
        top-ranked ids.

    Raises:
        ValueError: If the training split contains no positive rows.
    """
    fe_dir = "/data/p_dsi/teams2022/team_1/fe_data/"
    outcome_dir = "/data/p_dsi/teams2022/team_1/new_data/"
    non_features = ['indiv_id', 'response', 'store']

    new_data = pd.read_csv(fe_dir + file).fillna(-1)
    X = new_data.drop('response', axis=1)
    Y = new_data['response']
    # The held-out part was only ever predicted and then discarded, so it is
    # no longer scored; the split still decides which rows are trained on.
    X_train_old, _, Y_train, _ = train_test_split(
        X, Y, test_size=test_size, random_state=seed)
    train = pd.concat([X_train_old, Y_train], axis=1)

    positives = train[train['response'] == 1]
    negatives = train[train['response'] == 0]
    length = 2 * len(positives)
    if length == 0:
        raise ValueError("training split contains no rows with response == 1")
    buy = positives.sample(n=length, replace=True, random_state=seed)
    nobuy = negatives.sample(n=length, replace=True, random_state=seed)
    new_train = pd.concat([buy, nobuy])
    X_train = new_train.drop(non_features, axis=1)
    Y_train = new_train['response']
    print("data loaded")
    print("model set")
    model.fit(X_train, Y_train)
    print("model fit")

    Real_X = pd.read_csv(fe_dir + real_file).fillna(-1)
    Real_Y_prob = pd.DataFrame(
        model.predict_proba(Real_X.drop(non_features, axis=1)),
        columns=['pred_0', 'pred_1'])
    print("model predict")
    ranked = pd.concat([Real_X, Real_Y_prob], axis=1).sort_values('pred_1', ascending=False)
    top100k_id_new = ranked['indiv_id'].unique()[0:top_k]

    Real_Y = pd.read_csv(outcome_dir + real_y_file)
    Real_Y_id = Real_Y[Real_Y['prod_group_code'] == 5]['indiv_id'].unique()
    # Vectorised membership test instead of an O(n*m) `x in ndarray` scan.
    index = int(np.isin(top100k_id_new, Real_Y_id).sum()) / top_k
    print("index:", index)
    return model, top100k_id_new

In [None]:
# Fix: MLPClassifier was never imported in the notebook.
from sklearn.neural_network import MLPClassifier

# Train and score an MLP on the 2015_18_08 snapshot (the same file is used
# for both fitting and scoring).
train_process('2015_18_08.csv','2015_18_08.csv', MLPClassifier())