-
Notifications
You must be signed in to change notification settings - Fork 1
/
train.py
104 lines (86 loc) · 3.67 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
print('Importing packages...', end="")
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
print('ok.')
# Project directory layout.
# NOTE: os.path.join does NOT expand '~'; expand it explicitly so the paths
# are valid for every consumer (os.makedirs, open, ...), not just pandas,
# which happens to expand '~' internally.
BASE_DIR = os.path.expanduser('~/ML-AZ/Projeto - Churn')
DATA_DIR = os.path.join(BASE_DIR, 'input')
MODEL_DIR = os.path.join(BASE_DIR, 'models')

#Loading dataset
# X_train_out.csv is the outlier-treated training feature matrix produced by
# an upstream preprocessing step; the remaining files are the raw splits.
print('Loading dataset...', end="")
X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_out.csv'))
y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train.csv'))
X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test.csv'))
y_test = pd.read_csv(os.path.join(DATA_DIR, 'y_test.csv'))
print('ok.\n')
#Feature Encoding and Scaling
# Column groups: ordinal-encoded, one-hot-encoded, and standardized features.
oe_features = ['Gender']
ohe_features = ['Geography']
scaling_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

print('Preprocessing...', end="")

# Ordinal-encode the binary categorical column; fit on train, reuse on test.
oe = OrdinalEncoder()
oe.fit(X_train[oe_features])
X_train[oe_features] = oe.transform(X_train[oe_features])
X_test[oe_features] = oe.transform(X_test[oe_features])

# One-hot encode the multi-class categorical column.
# FIX: the 'sparse' kwarg and 'get_feature_names' method were removed in
# scikit-learn 1.2 / 1.0 respectively; use the current
# 'sparse_output' / 'get_feature_names_out' API.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[ohe_features])
ohe_cols = ohe.get_feature_names_out(ohe_features)
ohe_df = pd.DataFrame(ohe.transform(X_train[ohe_features]), columns=ohe_cols)
ohe_df_test = pd.DataFrame(ohe.transform(X_test[ohe_features]), columns=ohe_cols)
df_train = pd.concat([ohe_df, X_train.drop(ohe_features, axis=1)], axis=1)
df_test = pd.concat([ohe_df_test, X_test.drop(ohe_features, axis=1)], axis=1)
features_list = df_train.columns.tolist()

# Standardize the numeric features. The scaler is fit on the TRAINING data
# only and then reused to transform the test data.
# FIX: the previous version fit a second StandardScaler on the test set,
# which leaks test-set statistics and skews the evaluation.
scaler_train = StandardScaler()
X_train_scaled = df_train.copy()
X_train_scaled[scaling_features] = scaler_train.fit_transform(X_train_scaled[scaling_features])
scaler_test = scaler_train  # alias kept for backward compatibility with the saved pickle
X_test_scaled = df_test.copy()
X_test_scaled[scaling_features] = scaler_train.transform(X_test_scaled[scaling_features])
print('ok.')
#Modelling
# Instantiate the candidate classifiers; every seeded estimator uses the
# same fixed random_state so runs are reproducible.
print('Initializing models...', end="")
clf_rf = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=1994)
clf_lr = LogisticRegression(random_state=1994, class_weight='balanced')
clf_xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', random_state=1994)
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_svm = SVC(random_state=1994)
print('ok.')

# Pair each estimator with a human-readable label for reporting.
model_labels = [
    'Random Forest',
    'Logistic Regressor',
    'XGB Classifier',
    'K-Nearest Neighbours',
    'Support Vector Classifier',
]
models = list(zip(model_labels, [clf_rf, clf_lr, clf_xgb, clf_knn, clf_svm]))
# Train every candidate model on the scaled training features and evaluate
# it on the held-out test set (trainModels is a project-local helper).
from utils import trainModels, makePredictions

train_matrix = X_train_scaled[features_list]
test_matrix = X_test_scaled[features_list]
model_results = trainModels(models, train_matrix, y_train, test_matrix, y_test, verbose=True)
#Saving the model
print('Saving models...', end="")

# Bundle everything needed to reproduce the pipeline at inference time:
# the fitted encoders/scalers, the trained models, their evaluation
# results, and the exact feature ordering the models expect.
model_data = pd.Series({
    'oe_features': oe_features,
    'ohe_features': ohe_features,
    'scaling_features': scaling_features,
    'ohe': ohe,
    'oe': oe,
    'scaler_train': scaler_train,
    'scaler_test': scaler_test,
    'models': models,
    'models_results': model_results,
    'features': features_list
})

# FIX: to_pickle raises FileNotFoundError when the target directory does not
# exist; create it up front (expanduser in case MODEL_DIR still contains '~',
# which os.makedirs would otherwise treat as a literal directory name).
os.makedirs(os.path.expanduser(MODEL_DIR), exist_ok=True)
model_data.to_pickle(os.path.join(MODEL_DIR, 'first_models.pkl'))
print('ok.')

# The results aren't good. Better try some oversampling technique. Check oversampling_train.py