In [2]:
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

In [3]:
training_data = pd.read_csv("Data/Scotland/scottish_data_training.csv")
training_labels = pd.read_csv("Data/Scotland/scottish_labels_training.csv")

In [4]:
Y = training_labels.drop(columns = ['attendance', 'number_over_4_hours',
       'number_over_8_hours', 'percentage_within_8_hours',
       'number_over_12_hours', 'percentage_within_12_hours',
       'number_under_4_hours', 'number_4_hours_8_hours',
       'number_8_hours_12_hours', 'percentage_within_4_hours'])

In [5]:
X = training_data
X["region_east"] = 1.0 *(X["region"] == "East")
X["region_west"] =  1.0 *(X["region"] == "West")
X["region_north"] =  1.0 *(X["region"] == "North")
X = X.drop(columns=["region", "moon_phase_name"])
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(',', '').astype(float)
    if X[col].isna().any():
        X[col] = X[col].fillna(0)

In [6]:
model_4_hours = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100,), activation="tanh")
model_8_hours = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100,),activation="tanh")
model_12_hours = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100,),activation="tanh")
model_long_hours = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100,),activation="tanh")


In [7]:
model_4_hours.fit(X,Y.proportion_under_4_hours)
model_8_hours.fit(X,Y.proportion_4_hours_8_hours)
model_12_hours.fit(X,Y.proportion_8_hours_12_hours)
model_long_hours.fit(X,Y.proportion_over_12_hours)


MLPRegressor(activation='tanh',
             hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100))

In [8]:
(mean_absolute_error(Y.proportion_under_4_hours, model_4_hours.predict(X)),
 mean_absolute_error(Y.proportion_4_hours_8_hours, model_8_hours.predict(X)),
 mean_absolute_error(Y.proportion_8_hours_12_hours, model_12_hours.predict(X)),
 mean_absolute_error(Y.proportion_over_12_hours, model_long_hours.predict(X)))

(0.07772857489923564,
 0.057926569696736034,
 0.010128649486211078,
 0.007379876555691441)

In [9]:
testing_data = pd.read_csv("Data/Scotland/scottish_data_testing.csv")
testing_labels = pd.read_csv("Data/Scotland/scottish_labels_testing.csv")

In [10]:
Y_test = testing_labels.drop(columns = ['attendance', 'number_over_4_hours',
       'number_over_8_hours', 'percentage_within_8_hours',
       'number_over_12_hours', 'percentage_within_12_hours',
       'number_under_4_hours', 'number_4_hours_8_hours',
       'number_8_hours_12_hours', 'percentage_within_4_hours'])
X_test = testing_data
X_test["region_east"] = 1.0 *(X_test["region"] == "East")
X_test["region_west"] =  1.0 *(X_test["region"] == "West")
X_test["region_north"] =  1.0 *(X_test["region"] == "North")
X_test = X_test.drop(columns=["region", "moon_phase_name"])
for col in X_test.columns:
    X_test[col] = X_test[col].astype(str).str.replace(',', '').astype(float)
    if X_test[col].isna().any():
        X_test[col] = X_test[col].fillna(0)

In [11]:
(model_4_hours.score(X_test,Y_test.proportion_under_4_hours),
 model_8_hours.score(X_test,Y_test.proportion_4_hours_8_hours),
 model_12_hours.score(X_test,Y_test.proportion_8_hours_12_hours),
 model_long_hours.score(X_test,Y_test.proportion_over_12_hours))

(-0.3695015038057152,
 -0.16210627600484884,
 -0.1376353705522131,
 -1.0278797187398223)

In [12]:
(mean_absolute_error(Y_test.proportion_under_4_hours, model_4_hours.predict(X_test)),
 mean_absolute_error(Y_test.proportion_4_hours_8_hours, model_8_hours.predict(X_test)),
 mean_absolute_error(Y_test.proportion_8_hours_12_hours, model_12_hours.predict(X_test)),
 mean_absolute_error(Y_test.proportion_over_12_hours, model_long_hours.predict(X_test)))

(0.07752377164137333,
 0.05735701190776209,
 0.010054645612671153,
 0.007316539265150512)

In [13]:
def divide_by_sum(tuple_):
    sum_ = sum(tuple_)
    return [e/sum_ for e in tuple_]
normed_predictions = [list(l) for l in
                      zip(*map(divide_by_sum,
                               zip(model_4_hours.predict(X_test),
                                   model_8_hours.predict(X_test),
                                   model_12_hours.predict(X_test),
                                   model_long_hours.predict(X_test))))
                      ]
(mean_absolute_error(Y_test.proportion_under_4_hours, normed_predictions[0]),
 mean_absolute_error(Y_test.proportion_4_hours_8_hours, normed_predictions[1]),
 mean_absolute_error(Y_test.proportion_8_hours_12_hours, normed_predictions[2]),
 mean_absolute_error(Y_test.proportion_over_12_hours, normed_predictions[3]))

(0.0714531556363753,
 0.05792528787961585,
 0.010133139711524585,
 0.007391164104041847)

In [14]:
model_bytes = pickle.dumps((model_4_hours,
                            model_8_hours,
                            model_12_hours,
                            model_long_hours))
print(len(model_bytes))
with open("Models/TrainingMLPScotlandModel.pkl", 'wb') as save_file:
    save_file.write(model_bytes)


11750753


In [15]:
model_4_hours_loaded,\
model_8_hours_loaded,\
model_12_hours_loaded,\
model_long_hours_loaded = None, None, None, None
with open("Models/TrainingMLPScotlandModel.pkl", 'rb') as save_file:
    model_bytes = save_file.read()
    print(len(model_bytes))
    model_4_hours_loaded,\
    model_8_hours_loaded,\
    model_12_hours_loaded,\
    model_long_hours_loaded = pickle.loads(model_bytes)

11750753
