-
Notifications
You must be signed in to change notification settings - Fork 8
/
neural_test.py
144 lines (112 loc) · 6.84 KB
/
neural_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import io
import pydotplus
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, Imputer
#from sklearn.metrics import accuracy_score
from plot_curves import *
class rb_neural_test:
    """Fit, score and plot a standardized MLP classifier on one data set.

    Every model built by this class is a two-step scikit-learn ``Pipeline``:
    a ``StandardScaler`` (step name ``'scl'``) followed by an
    ``MLPClassifier`` (step name ``'clf'``).

    Attributes set by the constructor:
        x_train, y_train: training features / labels.
        x_test, y_test: hold-out features / labels used for the test score.
        x_col_names: column names assigned to the training features when
            they are wrapped in a DataFrame for the decision-boundary plot.
        data_label: label forwarded to the ``rb_plot_curves`` helpers.
        cv: number of stratified folds; also forwarded to the plot helpers.
    """

    def __init__(self, x_train, x_test, y_train, y_test, x_col_names, data_label, cv):
        """Store the data splits and plotting/cross-validation settings."""
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        self.x_col_names = x_col_names
        self.data_label = data_label
        self.cv = cv

    @staticmethod
    def _build_pipeline(alpha, batch_size, learning_rate_init, power_t,
                        max_iter, momentum, beta_1, beta_2,
                        hidden_layer_sizes):
        """Return an unfitted StandardScaler -> MLPClassifier pipeline.

        The step names ('scl', 'clf') are relied upon by
        ``plot_validation_curve`` (``clf__<param>`` names), so they must not
        change.
        """
        # The MLP is sensitive to feature scale, hence the scaler in front.
        mlp = MLPClassifier(alpha=alpha,
                            batch_size=batch_size,
                            learning_rate_init=learning_rate_init,
                            power_t=power_t,
                            max_iter=max_iter,
                            momentum=momentum,
                            beta_1=beta_1,
                            beta_2=beta_2,
                            hidden_layer_sizes=hidden_layer_sizes)
        return Pipeline([('scl', StandardScaler()),
                         ('clf', mlp)])

    def run_cv_model(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001,
                     power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9,
                     beta_2=0.999, hidden_layer_sizes=(100,), do_plot=True):
        """Evaluate the pipeline with stratified k-fold cross-validation.

        For each of ``self.cv`` stratified folds of the training data the
        pipeline is fitted on the fold's training part and scored twice:
        on the fold's held-out part (printed and returned under the name
        "Training score") and on the fixed test set.

        Args:
            alpha, batch_size, learning_rate_init, power_t, max_iter,
            momentum, beta_1, beta_2, hidden_layer_sizes: forwarded
                unchanged to ``MLPClassifier``.
            do_plot: when true, draw the learning curve and decision
                boundaries afterwards, passing the pipeline as fitted on
                the last fold.

        Returns:
            ``(train_score, test_score)``: the mean held-out-fold score and
            the mean test-set score across folds.
        """
        pipe_clf = self._build_pipeline(alpha, batch_size, learning_rate_init,
                                        power_t, max_iter, momentum,
                                        beta_1, beta_2, hidden_layer_sizes)

        # Stratified k-fold: every sample lands in a held-out fold exactly
        # once and each fold keeps the overall class proportions.
        # No random_state here: shuffle is off, so a seed has no effect on
        # the splits, and recent scikit-learn releases raise ValueError when
        # random_state is combined with shuffle=False.
        skf = StratifiedKFold(n_splits=self.cv)

        # split() yields positional indices. Index through ndarrays so that
        # pandas inputs are not looked up by label/column instead.
        x_all = np.asarray(self.x_train)
        y_all = np.asarray(self.y_train)

        train_scores = []
        test_scores = []
        for fold_no, (fit_idx, held_out_idx) in enumerate(
                skf.split(X=self.x_train, y=self.y_train), start=1):
            pipe_clf.fit(x_all[fit_idx], y_all[fit_idx])

            # Despite the name, this is the score on the held-out fold.
            train_score = pipe_clf.score(x_all[held_out_idx], y_all[held_out_idx])
            train_scores.append(train_score)

            test_score = pipe_clf.score(self.x_test, self.y_test)
            test_scores.append(test_score)

            print('Fold:', fold_no, ', Training score:', train_score, ', Test score:', test_score)

        train_score = np.mean(train_scores)
        print('Training score is', train_score)

        test_score = np.mean(test_scores)
        print('Test score is', test_score)

        if do_plot:
            self.__plot_learning_curve(pipe_clf)
            self.__plot_decision_boundaries(pipe_clf)

        return train_score, test_score

    def run_model(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001,
                  power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9,
                  beta_2=0.999, hidden_layer_sizes=(100,), do_plot=True):
        """Fit the pipeline once on the full training set and score it.

        Args:
            alpha, batch_size, learning_rate_init, power_t, max_iter,
            momentum, beta_1, beta_2, hidden_layer_sizes: forwarded
                unchanged to ``MLPClassifier``.
            do_plot: when true, draw the learning curve and decision
                boundaries for the fitted pipeline.

        Returns:
            ``(train_score, test_score)``: the pipeline's score on the
            training set and on the test set.
        """
        pipe_clf = self._build_pipeline(alpha, batch_size, learning_rate_init,
                                        power_t, max_iter, momentum,
                                        beta_1, beta_2, hidden_layer_sizes)

        pipe_clf.fit(self.x_train, self.y_train)

        # check model accuracy
        train_score = pipe_clf.score(self.x_train, self.y_train)
        print('Training score is', train_score)

        test_score = pipe_clf.score(self.x_test, self.y_test)
        print('Test score is', test_score)

        if do_plot:
            self.__plot_learning_curve(pipe_clf)
            self.__plot_decision_boundaries(pipe_clf)

        return train_score, test_score

    def __plot_learning_curve(self, estimator):
        """Hand the estimator and training data to rb_plot_curves.plot_learning_curve."""
        plc = rb_plot_curves()
        plc.plot_learning_curve(estimator, self.x_train, self.y_train, self.cv, self.data_label)

    def plot_validation_curve(self, alpha=0.0001, batch_size=200, learning_rate_init=0.001,
                              power_t=0.5, max_iter=200, momentum=0.9, beta_1=0.9,
                              beta_2=0.999, hidden_layer_sizes=(100,)):
        """Plot one validation curve per swept MLP hyper-parameter.

        A pipeline is built from the given arguments, then
        ``rb_plot_curves.plot_validation_curve`` is called once for each of
        batch size, initial learning rate, ``power_t`` and ``max_iter``,
        each over a fixed range defined below.

        NOTE(review): per the scikit-learn docs ``power_t`` only applies to
        the 'sgd' solver with 'invscaling' learning rate; with the solver
        left at its default here that sweep may be flat -- confirm.
        """
        estimator = self._build_pipeline(alpha, batch_size, learning_rate_init,
                                         power_t, max_iter, momentum,
                                         beta_1, beta_2, hidden_layer_sizes)

        # 'clf__' prefix addresses the MLPClassifier step inside the pipeline.
        sweeps = [
            ('clf__batch_size', np.arange(50, 500, 10)),
            ('clf__learning_rate_init', np.arange(0.001, 0.1, 0.01)),
            ('clf__power_t', np.arange(0.01, 0.1, 0.01)),
            ('clf__max_iter', np.arange(50, 1000, 10)),
        ]

        plc = rb_plot_curves()
        for param_name, param_range in sweeps:
            plc.plot_validation_curve(estimator, self.x_train, self.y_train,
                                      self.cv, self.data_label,
                                      param_range, param_name, n_jobs=-1)

    def __plot_decision_boundaries(self, estimator):
        """Plot decision boundaries over the training features.

        The training features are wrapped in a DataFrame whose columns are
        renamed to ``self.x_col_names`` before being passed to
        ``rb_plot_curves.plot_decision_boundaries``.
        """
        plc = rb_plot_curves()
        features = pd.DataFrame(self.x_train)
        features.columns = self.x_col_names

        plc.plot_decision_boundaries(estimator, features, self.y_train, self.data_label)