In [None]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import warnings

warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the final processed data (path is relative to the notebook's working
# directory) into `df`, then show its first rows as the cell output.
df = pd.read_csv('../../inputs/final_processed_data.csv')
df.head()

In [None]:
# Method to split final processed data in training / testing
def split_data_in_training_testing(df, test_size=0.10, random_state=None):
    """Split the processed data into training and testing arrays.

    The ``name`` column is discarded, ``goals_per_match_year3`` becomes the
    target, and every remaining column (in the frame's own column order) is
    used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``name`` and ``goals_per_match_year3`` columns; all
        other columns must be castable to ``float32``.
    test_size : float or int, default 0.10
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. The default keeps the split
        random on every call; pass an int for a reproducible split.

    Returns
    -------
    x_train, x_test, y_train, y_test : numpy.ndarray
        ``float32`` arrays. The target arrays keep the column shape
        ``(n_samples, 1)`` produced by the double-bracket selection.
    """
    data = df.sort_values(by=['goals_per_match_year3'])

    # Use the `columns=` keyword: passing the axis positionally
    # (`drop('name', 1)`) is no longer accepted by pandas 2.0+.
    data = data.drop(columns='name')

    y = data[['goals_per_match_year3']].values.astype('float32')
    x = data.drop(columns='goals_per_match_year3').values.astype('float32')

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    return x_train, x_test, y_train, y_test

In [None]:
# Method to compute best model using k-fold validation
def compute_optimal_hyperparameters(x_train, y_train):
    """Exhaustively search random-forest hyperparameters with 5-fold CV.

    Every combination of ``max_depth``, ``max_leaf_nodes``, ``max_features``
    and ``bootstrap`` from the lists below is scored with
    ``cross_val_score(..., cv=5)`` and the combination with the highest mean
    score wins (the first one encountered wins ties).

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training targets.

    Returns
    -------
    best_score : float
        Highest mean cross-validation score found.
    best_model : RandomForestClassifier
        The estimator configured with the winning combination. This function
        never calls ``fit`` on it directly; the caller is expected to fit it.

    Raises
    ------
    RuntimeError
        If no combination produced a comparable score (for example when
        every cross-validation score is NaN).
    """
    from itertools import product

    max_depth_list = [None, 50, 100, 200, 500]
    max_leaf_nodes_list = [None, 50, 100, 200, 500]
    max_features_list = [None, 3, 4, 5, 6, 7]
    bootstrap_list = [False, True]

    best_score = -1000
    # Initialised up front so a search with no usable score fails with a
    # clear message instead of an UnboundLocalError at the return statement.
    best_model = None

    # product() visits the combinations in the same order as four nested
    # loops (the last list varies fastest).
    search_space = product(max_depth_list, max_leaf_nodes_list,
                           max_features_list, bootstrap_list)

    for max_depth, max_leaf_nodes, max_features, bootstrap in search_space:

        current_model = RandomForestClassifier(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes,
                                               max_features=max_features, bootstrap=bootstrap)
        current_score = cross_val_score(current_model, x_train, y_train, cv=5).mean()

        # A NaN score compares False here and is therefore never selected.
        if current_score > best_score:
            best_score = current_score
            best_model = current_model

    if best_model is None:
        raise RuntimeError("No hyperparameter combination produced a valid cross-validation score")

    return best_score, best_model

In [None]:
# Method to plot model accuracy on test data
def plot_model_accuracy_on_test_data(model, x_test, y_test):
    """Draw two figures comparing the model's predictions with the true values.

    The first figure overlays the predicted and the true series; the second
    is an actual-vs-predicted scatter plot with a dashed diagonal running
    from the smallest to the largest true value.

    Parameters
    ----------
    model : object exposing ``predict``
        Model used to produce the predictions.
    x_test : array-like
        Features handed to ``model.predict``.
    y_test : array-like
        True values matching ``x_test``.
    """
    predictions = model.predict(x_test)

    # Figure 1: predictions and true values drawn as two line series
    plt.plot(predictions, label='prediction')
    plt.plot(y_test, label='true', alpha=0.3)
    plt.legend()
    plt.show()

    # End points of the reference diagonal
    lowest, highest = min(y_test), max(y_test)

    # Figure 2: actual vs predicted scatter with the reference diagonal
    fig, ax = plt.subplots()
    plt.title('Actual value vs predicted value (goals per match year 3)')
    ax.scatter(y_test, predictions, edgecolors=(0, 0, 0), alpha=0.3)
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()

In [None]:
# Method to compute best data split
def compute_best_data_split(df, iterations):
    """Repeat the split / tune / fit cycle and keep the best accepted model.

    On each iteration the data is split again, hyperparameters are tuned on
    the training part, the winning model is fitted and then scored on the
    test part. A model is accepted only when its validation and test scores
    differ by less than 0.05; among accepted models the one with the highest
    test score is kept. That model is finally plotted against the test split
    it was scored on.

    Note that the winner is chosen by its test score, so the returned test
    score is an optimistic estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed data accepted by ``split_data_in_training_testing``.
    iterations : int
        Number of random splits to try.

    Returns
    -------
    best_model, best_validation_score, best_test_score

    Raises
    ------
    RuntimeError
        If no iteration produced an accepted model (including
        ``iterations == 0``).
    """
    best_test_score = -1
    best_model = None
    best_validation_score = None
    # Remember the winner's own held-out data: plotting against the split of
    # the last iteration could evaluate the model on rows it was trained on.
    best_x_test = None
    best_y_test = None

    for _ in tqdm(range(0, iterations)):

        x_train, x_test, y_train, y_test = split_data_in_training_testing(df)

        validation_score, model = compute_optimal_hyperparameters(x_train, y_train)

        model.fit(x_train, y_train)

        test_score = model.score(x_test, y_test)

        if abs(validation_score - test_score) < 0.05 and test_score > best_test_score:

            best_model = model
            best_validation_score = validation_score
            best_test_score = test_score
            best_x_test, best_y_test = x_test, y_test

    if best_model is None:
        raise RuntimeError(
            "No data split produced validation and test scores within 0.05 of each other")

    plot_model_accuracy_on_test_data(best_model, best_x_test, best_y_test)

    return best_model, best_validation_score, best_test_score

In [None]:
# Round the target to the nearest integer so it can be used as a class label.
# The vectorised Series.round() replaces a row-by-row DataFrame.apply; both
# round halves to the nearest even integer.
df['goals_per_match_year3'] = df['goals_per_match_year3'].round().astype('int64')

# Try 10 random splits and keep the best accepted model.
model, validation_score, test_score = compute_best_data_split(df, 10)

print("Validation score:", validation_score)
print("Test score:", test_score)
print(model)

In [None]:
# save model to file
# The selected `model` is serialised with pickle under a relative file name,
# i.e. into the current working directory. Only unpickle this file from a
# trusted source: pickle.load can execute arbitrary code.
pkl_filename = "random_forests_classifier_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [None]:
# Split the dataset into subgroups of players sharing a particular attribute.

# By position: rows whose indicator column equals 1
centre_df = df.loc[df['center'].eq(1)]
back_df = df.loc[df['back'].eq(1)]
wing_df = df.loc[df['wing'].eq(1)]
line_df = df.loc[df['line'].eq(1)]

# By height: strictly below the 25% quantile / strictly above the 75% quantile
short_players = df.loc[df['height'].lt(df['height'].quantile(.25))]
tall_players = df.loc[df['height'].gt(df['height'].quantile(.75))]

# By number of matches played in year 2 (same quantile cut-offs)
rarely_playing = df.loc[df['matches_year2'].lt(df['matches_year2'].quantile(.25))]
often_playing = df.loc[df['matches_year2'].gt(df['matches_year2'].quantile(.75))]

# By number of goals scored in year 2 (same quantile cut-offs)
low_scoring = df.loc[df['goals_year2'].lt(df['goals_year2'].quantile(.25))]
high_scoring = df.loc[df['goals_year2'].gt(df['goals_year2'].quantile(.75))]

In [None]:
# Feature columns handed to the model, in this order, and the target column.
# NOTE(review): the model is trained on every column of `df` except `name`
# and the target, in the frame's own column order -- confirm that this list
# matches that order exactly.
FEATURE_COLUMNS = ['center', 'back', 'wing', 'line', 'height',
                   'matches_year1', 'goals_year1', 'goals_per_match_year1',
                   'matches_year2', 'goals_year2', 'goals_per_match_year2']
TARGET_COLUMN = 'goals_per_match_year3'


def _features_and_target(frame):
    """Return ``(x, y)`` float32 arrays for one subgroup frame.

    ``x`` holds the ``FEATURE_COLUMNS`` values; ``y`` holds the target as a
    single-column 2-D array, matching the shape used when the model was
    trained.
    """
    x = frame[FEATURE_COLUMNS].values.astype('float32')
    y = frame[[TARGET_COLUMN]].values.astype('float32')
    return x, y


# One (features, target) pair per subgroup; the global names are unchanged so
# the scoring cells below keep working.
centre_x, centre_y = _features_and_target(centre_df)
back_x, back_y = _features_and_target(back_df)
wing_x, wing_y = _features_and_target(wing_df)
line_x, line_y = _features_and_target(line_df)

short_players_x, short_players_y = _features_and_target(short_players)
tall_players_x, tall_players_y = _features_and_target(tall_players)

rarely_playing_x, rarely_playing_y = _features_and_target(rarely_playing)
often_playing_x, often_playing_y = _features_and_target(often_playing)

low_scoring_x, low_scoring_y = _features_and_target(low_scoring)
high_scoring_x, high_scoring_y = _features_and_target(high_scoring)

In [None]:
# Score (mean accuracy for a classifier) of the selected model on players with center == 1
model.score(centre_x, centre_y)

In [None]:
# Score (mean accuracy for a classifier) of the selected model on players with back == 1
model.score(back_x, back_y)

In [None]:
# Score (mean accuracy for a classifier) of the selected model on players with wing == 1
model.score(wing_x, wing_y)

In [None]:
# Score (mean accuracy for a classifier) of the selected model on players with line == 1
model.score(line_x, line_y)

In [None]:
# Score of the selected model on players whose height is below the 25% quantile
model.score(short_players_x, short_players_y)

In [None]:
# Score of the selected model on players whose height is above the 75% quantile
model.score(tall_players_x, tall_players_y)

In [None]:
# Score of the selected model on players whose matches_year2 is below the 25% quantile
model.score(rarely_playing_x, rarely_playing_y)

In [None]:
# Score of the selected model on players whose matches_year2 is above the 75% quantile
model.score(often_playing_x, often_playing_y)

In [None]:
# Score of the selected model on players whose goals_year2 is below the 25% quantile
model.score(low_scoring_x, low_scoring_y)

In [None]:
# Score of the selected model on players whose goals_year2 is above the 75% quantile
model.score(high_scoring_x, high_scoring_y)