In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

In [2]:
training_data = pd.read_csv("Data/Scotland/scottish_data_training.csv")
training_labels = pd.read_csv("Data/Scotland/scottish_labels_training.csv")

In [3]:
model_4_hours = RandomForestRegressor(n_estimators=100, n_jobs=8)
model_8_hours = RandomForestRegressor(n_estimators=100, n_jobs=8)
model_12_hours = RandomForestRegressor(n_estimators=100, n_jobs=8)
model_long_hours = RandomForestRegressor(n_estimators=100, n_jobs=8)


In [4]:
Y = training_labels.drop(columns = ['attendance', 'number_over_4_hours',
       'number_over_8_hours', 'percentage_within_8_hours',
       'number_over_12_hours', 'percentage_within_12_hours',
       'number_under_4_hours', 'number_4_hours_8_hours',
       'number_8_hours_12_hours', 'percentage_within_4_hours'])

In [5]:
X = training_data
X["region_east"] = 1.0 *(X["region"] == "East")
X["region_west"] =  1.0 *(X["region"] == "West")
X["region_north"] =  1.0 *(X["region"] == "North")
X = X.drop(columns=["region", "moon_phase_name"])
for col in X.columns:
    X[col] = X[col].astype(str).str.replace(',', '').astype(float)
    if X[col].isna().any():
        X[col] = X[col].fillna(0)

In [6]:
model_4_hours.fit(X,Y.proportion_under_4_hours)
model_8_hours.fit(X,Y.proportion_4_hours_8_hours)
model_12_hours.fit(X,Y.proportion_8_hours_12_hours)
model_long_hours.fit(X,Y.proportion_over_12_hours)


RandomForestRegressor(n_jobs=8)

In [7]:
(mean_absolute_error(Y.proportion_under_4_hours, model_4_hours.predict(X)),
 mean_absolute_error(Y.proportion_4_hours_8_hours, model_8_hours.predict(X)),
 mean_absolute_error(Y.proportion_8_hours_12_hours, model_12_hours.predict(X)),
 mean_absolute_error(Y.proportion_over_12_hours, model_long_hours.predict(X)))

(0.010971934190481838,
 0.009329879763042326,
 0.001808255346906956,
 0.000702360263021923)

In [8]:
testing_data = pd.read_csv("Data/Scotland/scottish_data_testing.csv")
testing_labels = pd.read_csv("Data/Scotland/scottish_labels_testing.csv")

In [9]:
Y_test = testing_labels.drop(columns = ['attendance', 'number_over_4_hours',
       'number_over_8_hours', 'percentage_within_8_hours',
       'number_over_12_hours', 'percentage_within_12_hours',
       'number_under_4_hours', 'number_4_hours_8_hours',
       'number_8_hours_12_hours', 'percentage_within_4_hours'])
X_test = testing_data
X_test["region_east"] = 1.0 *(X_test["region"] == "East")
X_test["region_west"] =  1.0 *(X_test["region"] == "West")
X_test["region_north"] =  1.0 *(X_test["region"] == "North")
X_test = X_test.drop(columns=["region", "moon_phase_name"])
for col in X_test.columns:
    X_test[col] = X_test[col].astype(str).str.replace(',', '').astype(float)
    if X_test[col].isna().any():
        X_test[col] = X_test[col].fillna(0)

In [10]:

(mean_absolute_error(Y_test.proportion_under_4_hours, model_4_hours.predict(X_test)),
 mean_absolute_error(Y_test.proportion_4_hours_8_hours, model_8_hours.predict(X_test)),
 mean_absolute_error(Y_test.proportion_8_hours_12_hours, model_12_hours.predict(X_test)),
 mean_absolute_error(Y_test.proportion_over_12_hours, model_long_hours.predict(X_test)))

(0.029893727964648946,
 0.025737557993314078,
 0.004967004634389578,
 0.0017893774405809875)

In [11]:
def divide_by_sum(tuple_):
    sum_ = sum(tuple_)
    return [e/sum_ for e in tuple_]
normed_predictions = [list(l) for l in
                      zip(*map(divide_by_sum,
                               zip(model_4_hours.predict(X_test),
                                   model_8_hours.predict(X_test),
                                   model_12_hours.predict(X_test),
                                   model_long_hours.predict(X_test))))
                      ]
(mean_absolute_error(Y_test.proportion_under_4_hours, normed_predictions[0]),
 mean_absolute_error(Y_test.proportion_4_hours_8_hours, normed_predictions[1]),
 mean_absolute_error(Y_test.proportion_8_hours_12_hours, normed_predictions[2]),
 mean_absolute_error(Y_test.proportion_over_12_hours, normed_predictions[3]))

(0.030061082875264107,
 0.025715282410438533,
 0.004971507158954944,
 0.0017879112217933904)

In [17]:
model_bytes = pickle.dumps((model_4_hours,
                            model_8_hours,
                            model_12_hours,
                            model_long_hours))
print(len(model_bytes))
with open("Models/TrainingRandomForestScotlandModel.pkl", 'wb') as save_file:
    save_file.write(model_bytes)


172234923


In [18]:
model_4_hours_loaded,\
model_8_hours_loaded,\
model_12_hours_loaded,\
model_long_hours_loaded = None, None, None, None
with open("Models/TrainingRandomForestScotlandModel.pkl", 'rb') as save_file:
    model_bytes = save_file.read()
    print(len(model_bytes))
    model_4_hours_loaded,\
    model_8_hours_loaded,\
    model_12_hours_loaded,\
    model_long_hours_loaded = pickle.loads(model_bytes)

172234923


In [16]:
(mean_absolute_error(Y_test.proportion_under_4_hours, model_4_hours_loaded.predict(X_test)),
 mean_absolute_error(Y_test.proportion_4_hours_8_hours, model_8_hours_loaded.predict(X_test)),
 mean_absolute_error(Y_test.proportion_8_hours_12_hours, model_12_hours_loaded.predict(X_test)),
 mean_absolute_error(Y_test.proportion_over_12_hours, model_long_hours_loaded.predict(X_test)))

(0.02989372796464895,
 0.025737557993314078,
 0.004967004634389578,
 0.0017893774405809875)