-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
148 lines (138 loc) · 4.64 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""This project is my first attempt in solving a real world problem using
machine learning. Here I made use of Random Forests, Gradient Tree Boosting,
Neural Networks and Ensembles to participate on the "Bikesharing Demand"
(http://www.kaggle.com/c/bike-sharing-demand) competition on Kaggle.
"""
# Author: Rafael Aguiar <rfna@cin.ufpe.br>
# License: MIT
from data_handling import load, output
from ml import RegressionModel, Ensemble, NNFromR
from optimize import cv, feature_worth, tune_parameters
from sklearn import ensemble
if __name__ == '__main__':
    # Load the raw and the preprocessed/scaled data frames. `train_`/`test_`
    # are the dummy-encoded, scaled versions used by the neural networks.
    train, test, train_, test_, scaler = load(modify_dfs=False, load_=True)
    # Specify variables to be considered on each model.
    independent_vars = [
        'weekday', 'hour', 'year', 'season', 'holiday',
        'workingday', 'weather', 'temp', 'atemp', 'humidity',
        'windspeed'
    ]
    # Materialized with list() so this stays a reusable list on Python 3,
    # where filter() returns a one-shot iterator (it would be exhausted
    # after the first model consumed it). On Python 2 list() is a no-op copy.
    nn_independent_vars = list(filter(
        # dummy variables use underscore
        lambda x: x.split("_")[0] not in [
            'casual', 'registered', 'count', 'month'  # removes dependent vars
        ] and x.split(".")[0] not in [
            'casual', 'registered', 'count'  # removes grouped vars
        ],
        train_.columns.tolist()
    ))
    # Extra grouped features shared by both tree-based models; hoisted here
    # so the list is defined once instead of copy-pasted per target.
    grouped_vars = [
        'registered.by.month',
        'casual.by.hour',
        'registered.by.hour'
    ]
    # A model is composed by a regressor and its variables.
    models = {
        "RandomForestRegressor": RegressionModel(
            ensemble.RandomForestRegressor(
                n_estimators=1000,
                min_samples_split=11,
                n_jobs=-1,
                oob_score=False,
                random_state=0
            ),
            {
                'casual': independent_vars + grouped_vars,
                'registered': independent_vars + grouped_vars
            }
        ),
        "GradientBoostingRegressor": RegressionModel(
            ensemble.GradientBoostingRegressor(
                n_estimators=100,
                max_depth=6,
                random_state=0
            ),
            {
                'casual': independent_vars + grouped_vars,
                'registered': independent_vars + grouped_vars
            }
        ),
        "NeuralNetwork": RegressionModel(
            NNFromR(
                hidden="10",
                threshold=0.01,
                stepmax=1e+06,
                learningrate=0.001,
                algorithm="rprop+",
                lifesign="none"
            ),
            {
                'casual': nn_independent_vars,
                'registered': nn_independent_vars
            },
            scaler
        ),
        "NeuralNetwork_w_Momentum": RegressionModel(
            NNFromR(
                n_neurons="c(ncol(train),5,1)",
                learning_rate_global=0.001,
                momentum_global=0.001,
                error_criterium="LMS",
                hidden_layer="sigmoid",
                output_layer="sigmoid",
                method="ADAPTgdwm"
            ),
            {
                'casual': nn_independent_vars,
                'registered': nn_independent_vars
            },
            scaler
        )
    }
    # > Example: parameter optimization
    # model = models['NeuralNetwork_w_Momentum']
    # e_neurons, min_neurons = tune_parameters(
    #     model,
    #     'n_neurons',
    #     range(5, 11)
    # )
    # # > Example: test if a feature is worth including in a model
    # feature_worth(models['GradientBoostingRegressor'], train)
    # # > Example: Compare validation error between models
    # for model_key in models:
    #     print "> "+model_key
    #     model = models[model_key]
    #     if "NeuralNetwork" in model_key:
    #         error = cv(train_, 5, model)
    #     else:
    #         error = cv(train, 5, model)
    #     print model_key, error
    # The neural nets are excluded from the final ensemble; only the two
    # tree-based models are stacked.
    del models['NeuralNetwork']
    del models['NeuralNetwork_w_Momentum']
    # Named `stacked_model` (not `ensemble`) so the sklearn `ensemble`
    # module imported above is not shadowed — the original rebinding only
    # worked because the right-hand side is evaluated before assignment.
    stacked_model = Ensemble(
        models=models,
        combiner=ensemble.GradientBoostingRegressor(
            n_estimators=100,
            random_state=0
        )
    )
    stacked_model.fit(train)
    # Map each input variable to its fitted importance; kept for
    # interactive inspection (not otherwise used by this script).
    feature_importances = dict(
        zip(
            models['GradientBoostingRegressor'].variables['casual'],
            stacked_model.feature_importances_
        )
    )
    prediction = stacked_model.predict(test)
    # Write the Kaggle submission file.
    output('12.0th.csv', prediction, test)