In [None]:
import datetime as dt
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

from keras import backend as K
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Activation
from keras.models import model_from_json
from keras.models import Sequential
from numpy import mean
from numpy import std
from numpy.random import randn
from numpy.random import seed
from scipy.stats.stats import pearsonr
from sklearn import preprocessing as pre
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from statsmodels.graphics.gofplots import qqplot
from tqdm import tqdm_notebook as tqdm

In [None]:
# read final processed data
def read_data(path='../../inputs/final_processed_data.csv'):
    """Load the final processed player dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the location this notebook
        has always read from, so a bare ``read_data()`` call is unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)

def normalise_data(df):
    """Return a copy of ``df`` with selected columns divided by their maximum.

    The columns scaled are ``height``, ``matches_year1/2``, ``goals_year1/2``
    and ``goals_per_match_year1/2/3``. All other columns are passed through
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the eight columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    columns_to_scale = [
        'height',
        'matches_year1', 'matches_year2',
        'goals_year1', 'goals_year2',
        'goals_per_match_year1', 'goals_per_match_year2',
        'goals_per_match_year3',
    ]

    # Work on a copy so the caller's frame is not changed as a side effect.
    scaled = df.copy()

    for column in columns_to_scale:
        peak = scaled[column].max()
        # A zero maximum would turn the column into NaN/inf; leave it as is.
        if peak == 0:
            continue
        scaled[column] = scaled[column] / peak

    return scaled

# Load the processed dataset, rescale its numeric columns, and preview it.
df = normalise_data(read_data())
df.head()

In [None]:
def split_data(df, random_state=42):
    """Split the dataset into training, validation and test arrays.

    The target is ``goals_per_match_year3``; the features are every other
    column except ``name``, in the column order of ``df``. Rows are first
    sorted by the target, then split 70 % / 15 % / 15 % with two successive
    ``train_test_split`` calls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``name``, ``goals_per_match_year3`` and the feature
        columns.
    random_state : int, optional
        Seed passed to both ``train_test_split`` calls. The default (42)
        reproduces the split this function has always produced.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``, all float32.
        The ``y_*`` arrays keep a trailing dimension of size 1.
    """
    target_column = 'goals_per_match_year3'

    ordered = df.sort_values(by=[target_column])

    y = ordered[[target_column]].values.astype('float32')

    # Use the keyword form: pandas >= 2.0 rejects a positional ``axis``
    # argument such as ``drop('name', 1)`` with a TypeError.
    features = ordered.drop(columns=['name', target_column])
    x = features.values.astype('float32')

    # First carve off 30 % of the rows, then halve that into validation/test.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=0.3, random_state=random_state)
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=random_state)

    return x_train, y_train, x_val, y_val, x_test, y_test

In [None]:
# train the best neural network
def coeff_determination(y_true, y_pred):
    """Coefficient of determination (R^2) expressed with Keras backend ops.

    Returns ``1 - SS_res / (SS_tot + epsilon)``, where ``SS_res`` is the sum
    of squared differences between ``y_true`` and ``y_pred`` and ``SS_tot``
    is the sum of squared deviations of ``y_true`` from its mean. The
    backend epsilon keeps the division defined when ``SS_tot`` is zero.
    """
    residuals = y_true - y_pred
    deviations = y_true - K.mean(y_true)

    residual_sum_sq = K.sum(K.square(residuals))
    total_sum_sq = K.sum(K.square(deviations))

    unexplained_fraction = residual_sum_sq / (total_sum_sq + K.epsilon())
    return (1 - unexplained_fraction)

def fit_NN(x_train, y_train, x_val, y_val):
    """Build, train and return the four-layer regression network.

    Architecture: Dense(32, softmax) -> Dense(4, tanh) -> Dense(8, tanh)
    -> Dense(1, linear), trained with RMSprop on mean absolute error and
    reporting ``coeff_determination`` as a metric.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and targets. ``x_train`` must expose ``.shape``;
        its second dimension sets the width of the input layer.
    x_val, y_val : array-like
        Validation data used by the checkpoint callback.

    Returns
    -------
    keras.models.Sequential
        The model with the best checkpointed weights loaded.

    Notes
    -----
    Every call writes to the same ``weights.hdf5`` file in the working
    directory, overwriting what an earlier call saved.
    """
    # Derive the input width from the data instead of hard-coding 11, so the
    # network still builds if the feature set changes.
    n_features = x_train.shape[1]

    model = Sequential()
    model.add(Dense(32, activation='softmax', input_dim=n_features))
    model.add(Dense(4, activation='tanh'))
    model.add(Dense(8, activation='tanh'))
    model.add(Dense(1, activation='linear'))

    # NOTE(review): newer Keras releases spell this argument
    # ``learning_rate``; ``lr`` is kept for the version this notebook targets.
    RMS = optimizers.RMSprop(lr=0.001)

    model.compile(optimizer=RMS,
                  loss='mean_absolute_error',
                  metrics=[coeff_determination])

    # save_best_only keeps just the best epoch on disk (ModelCheckpoint
    # monitors ``val_loss`` unless told otherwise).
    checkpointer = ModelCheckpoint(filepath="weights.hdf5", verbose=0, save_best_only=True)

    model.fit(x_train, y_train,
              batch_size=16, epochs=50, shuffle=False,
              validation_data=(x_val, y_val), callbacks=[checkpointer], verbose=0)

    # Roll back from the final-epoch weights to the best checkpointed ones.
    model.load_weights('weights.hdf5')

    return model

In [None]:
# Method to plot model accuracy on test data
def plot_model_accuracy_on_test_data(model, x_test, y_test):
    """Draw two diagnostic figures comparing predictions with true targets.

    The first figure overlays the predicted and true series by sample index.
    The second is a scatter of actual versus predicted values with a dashed
    identity line spanning the range of ``y_test``.
    """
    predicted = model.predict(x_test)

    # Figure 1: predicted and true values plotted against sample index.
    plt.plot(predicted, label='prediction')
    plt.plot(y_test, label='true', alpha=0.3)
    plt.legend()
    plt.show()

    # Figure 2: actual vs predicted, with the y = x reference line.
    lowest, highest = min(y_test), max(y_test)

    fig, ax = plt.subplots()
    ax.set_title('Actual value vs predicted value (goals per match year 3)')
    ax.scatter(y_test, predicted, edgecolors=(0, 0, 0), alpha=0.3)
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()

In [None]:
# Method to compute best data split
def compute_best_data_split(df, iterations):
    """Train ``iterations`` models and keep the one scoring best on test data.

    Each round calls ``split_data(df)``, trains a network with ``fit_NN`` and
    reads the second value returned by ``model.evaluate`` on the test split
    (the ``coeff_determination`` metric the model is compiled with). The
    best model is plotted against the test split it was scored on.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset accepted by ``split_data``.
    iterations : int
        Number of train/evaluate rounds; must be at least 1.

    Returns
    -------
    tuple
        ``(best_model, best_test_score)``.

    Raises
    ------
    ValueError
        If no model could be selected (``iterations`` < 1, or every test
        score was NaN).

    Notes
    -----
    NOTE(review): ``split_data(df)`` is called with its default seed, so each
    round sees the same split and rounds differ only through the network's
    random initialisation. Selecting on the test score also makes the
    returned score an optimistic estimate.
    """
    # R^2 has no lower bound, so start from -inf rather than -1; otherwise a
    # run in which every model scores <= -1 would never select a model.
    best_test_score = float('-inf')
    best_model = None
    best_x_test = None
    best_y_test = None

    for _ in tqdm(range(iterations)):

        x_train, y_train, x_val, y_val, x_test, y_test = split_data(df)

        model = fit_NN(x_train, y_train, x_val, y_val)

        test_score = model.evaluate(x_test, y_test, verbose=0)[1]

        if test_score > best_test_score:
            best_model = model
            best_test_score = test_score
            # Remember the test split this model was scored on so the plot
            # below matches the reported score.
            best_x_test, best_y_test = x_test, y_test

    if best_model is None:
        raise ValueError(
            "no model was selected: iterations must be >= 1 and "
            "test scores must not all be NaN")

    plot_model_accuracy_on_test_data(best_model, best_x_test, best_y_test)

    return best_model, best_test_score

In [None]:
# Number of train/evaluate rounds to run while searching for the best model.
n_rounds = 100

model, test_score = compute_best_data_split(df, iterations=n_rounds)

print("Test score:", test_score)

In [None]:
# Persist the network architecture as JSON.
with open("4_layers_NN.json", "w") as architecture_file:
    architecture_file.write(model.to_json())

# Persist the trained weights next to it.
# NOTE(review): the file name ends in ".txt"; the on-disk format chosen by
# save_weights depends on the Keras version - confirm it is the intended one.
model.save_weights("4_layers_NN.txt")

In [None]:
# Slice the full dataset into subsets by player attribute.

# One subset per position flag (rows where the flag column equals 1).
centre_df, back_df, wing_df, line_df = (
    df[df[position_flag] == 1]
    for position_flag in ('center', 'back', 'wing', 'line')
)

# Bottom and top quartile by height (strict inequalities on both sides).
height_col = df['height']
short_players = df[height_col < height_col.quantile(.25)]
tall_players = df[height_col > height_col.quantile(.75)]

# Bottom and top quartile by matches played in year 2.
matches_col = df['matches_year2']
rarely_playing = df[matches_col < matches_col.quantile(.25)]
often_playing = df[matches_col > matches_col.quantile(.75)]

# Bottom and top quartile by goals scored in year 2.
goals_col = df['goals_year2']
low_scoring = df[goals_col < goals_col.quantile(.25)]
high_scoring = df[goals_col > goals_col.quantile(.75)]

In [None]:
# Feature columns, in the order fed to the model for every subset below.
# NOTE(review): presumably this matches the column order split_data derives
# from the CSV - verify, since the model was trained on that order.
_feature_columns = ['center', 'back', 'wing', 'line', 'height',
                    'matches_year1', 'goals_year1', 'goals_per_match_year1',
                    'matches_year2', 'goals_year2', 'goals_per_match_year2']
_target_columns = ['goals_per_match_year3']


def _subset_to_arrays(subset):
    """Return ``(x, y)`` float32 arrays for one player subset."""
    features = subset[_feature_columns].values.astype('float32')
    target = subset[_target_columns].values.astype('float32')
    return features, target


# Subsets by position.
centre_x, centre_y = _subset_to_arrays(centre_df)
back_x, back_y = _subset_to_arrays(back_df)
wing_x, wing_y = _subset_to_arrays(wing_df)
line_x, line_y = _subset_to_arrays(line_df)

# Subsets by height quartile.
short_players_x, short_players_y = _subset_to_arrays(short_players)
tall_players_x, tall_players_y = _subset_to_arrays(tall_players)

# Subsets by year-2 matches quartile.
rarely_playing_x, rarely_playing_y = _subset_to_arrays(rarely_playing)
often_playing_x, often_playing_y = _subset_to_arrays(often_playing)

# Subsets by year-2 goals quartile.
low_scoring_x, low_scoring_y = _subset_to_arrays(low_scoring)
high_scoring_x, high_scoring_y = _subset_to_arrays(high_scoring)

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric;
# the first is the loss) on rows where `center` == 1.
model.evaluate(centre_x, centre_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on rows where `back` == 1.
model.evaluate(back_x, back_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on rows where `wing` == 1.
model.evaluate(wing_x, wing_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on rows where `line` == 1.
model.evaluate(line_x, line_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on players whose height is below the 25th percentile.
model.evaluate(short_players_x, short_players_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on players whose height is above the 75th percentile.
model.evaluate(tall_players_x, tall_players_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on players whose matches_year2 is below the 25th percentile.
model.evaluate(rarely_playing_x, rarely_playing_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on players whose matches_year2 is above the 75th percentile.
model.evaluate(often_playing_x, often_playing_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on players whose goals_year2 is below the 25th percentile.
model.evaluate(low_scoring_x, low_scoring_y, verbose=0)[1]

In [None]:
# Second value returned by evaluate (the compiled coeff_determination metric)
# on players whose goals_year2 is above the 75th percentile.
model.evaluate(high_scoring_x, high_scoring_y, verbose=0)[1]