In [None]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook as tqdm

In [None]:
# Load the final processed dataset (path is relative to the notebook's working
# directory) into `df`, then display its first rows as the cell output.
df = pd.read_csv('../inputs/final_processed_data.csv')
df.head()

In [None]:
# Method to split final processed data in training / testing
def split_data_in_training_testing(df, test_size=0.10, random_state=None):
    """Split the processed data frame into train/test feature and target arrays.

    The ``name`` column is discarded, ``goals_per_match_year3`` becomes the
    target and every remaining column is used as a feature.

    Args:
        df: DataFrame with a ``name`` column, a ``goals_per_match_year3``
            column and feature columns that can be cast to float32.
        test_size: Fraction of the rows held out for testing (default 0.10).
        random_state: Optional seed forwarded to ``train_test_split`` so a
            split can be reproduced; ``None`` keeps the split unseeded.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of float32 arrays. The
        target arrays are 2-D with a single column.
    """
    target_column = 'goals_per_match_year3'

    # Rows are ordered by the target before splitting, as in the original code.
    data = df.sort_values(by=[target_column])

    # Use the `columns=` keyword: the positional `axis` argument of
    # DataFrame.drop is no longer accepted by pandas 2.x.
    data = data.drop(columns='name')

    # Double brackets keep the target 2-D (one column).
    y = data[[target_column]].values.astype('float32')
    x = data.drop(columns=target_column).values.astype('float32')

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    return x_train, x_test, y_train, y_test

In [None]:
# Method to compute best model using k-fold validation
def compute_optimal_hyperparameters(x_train, y_train, cv=5):
    """Pick the LinearRegression configuration with the best mean CV score.

    Only ``fit_intercept`` is searched. The ``normalize`` argument is no
    longer passed because scikit-learn 1.2 removed it from LinearRegression,
    and ``copy_X`` is left at its default because it only controls whether the
    input array may be overwritten, not the fitted result.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (default 5, as before).

    Returns:
        Tuple ``(best_score, best_model)``: the highest mean cross-validation
        score and the estimator instance that was scored. The function itself
        never calls ``fit`` on that instance.

    Raises:
        ValueError: If no configuration yields a comparable score (for
            example when every cross-validation score is NaN).
    """
    best_score = float('-inf')
    best_model = None

    for fit_intercept in (False, True):
        candidate = LinearRegression(fit_intercept=fit_intercept)
        score = cross_val_score(candidate, x_train, y_train, cv=cv).mean()

        # NaN never compares greater, so failed evaluations are skipped here.
        if score > best_score:
            best_score = score
            best_model = candidate

    if best_model is None:
        raise ValueError(
            'No LinearRegression configuration produced a valid '
            'cross-validation score.'
        )

    return best_score, best_model

In [None]:
# Method to plot model accuracy on test data
def plot_model_accuracy_on_test_data(model, x_test, y_test):
    """Show two figures comparing the model's predictions with the test targets.

    The first figure overlays the predicted and the true values as line plots;
    the second is an actual-vs-predicted scatter plot with a dashed reference
    line running from the smallest to the largest true value.

    Args:
        model: Object exposing ``predict(x_test)``.
        x_test: Test features passed to ``model.predict``.
        y_test: True target values for the test samples.
    """
    predictions = model.predict(x_test)

    # Figure 1: predictions and ground truth drawn on the same axes.
    plt.plot(predictions, label='prediction')
    plt.plot(y_test, label='true', alpha=0.3)
    plt.legend()
    plt.show()

    # Figure 2: scatter of actual vs predicted plus the identity diagonal.
    lowest, highest = min(y_test), max(y_test)
    diagonal = [lowest, highest]

    _, ax = plt.subplots()
    ax.set_title('Actual value vs predicted value (goals per match year 3)')
    ax.scatter(y_test, predictions, edgecolors=(0, 0, 0), alpha=0.3)
    ax.plot(diagonal, diagonal, 'k--', lw=4)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()

In [None]:
# Method to compute best data split and filtering values
def compute_best_data_split(df, iterations, max_score_gap=0.05):
    """Search random train/test splits and keep the best-scoring fitted model.

    A copy of ``df`` has its numeric columns standardized, then for each
    iteration the data is split, a model is chosen by cross-validation on the
    training part, fitted, and scored on the test part. A split is accepted
    when validation and test scores differ by less than ``max_score_gap``;
    among accepted splits the one with the highest test score wins. The
    winning model is finally plotted against the test data of its own split.

    Args:
        df: Processed DataFrame; it is not modified.
        iterations: Number of random splits to try.
        max_score_gap: Maximum allowed absolute difference between the
            validation score and the test score (default 0.05, as before).

    Returns:
        Tuple ``(best_model, best_validation_score, best_test_score)``.

    Raises:
        RuntimeError: If no split satisfies the acceptance criterion (this
            includes ``iterations == 0``).
    """
    scaled_columns = [
        'height',
        'matches_year1',
        'matches_year2',
        'goals_year1',
        'goals_year2',
        'goals_per_match_year1',
        'goals_per_match_year2',
        'goals_per_match_year3',
    ]

    df_copy = df.copy()

    # NOTE(review): scaling uses statistics of the full data set (train and
    # test rows together) and also rescales the target column, so scores are
    # computed in standardized target units - confirm this is intended.
    for column in scaled_columns:
        df_copy[column] = preprocessing.scale(df_copy[column])

    best_model = None
    best_validation_score = None
    best_test_score = float('-inf')
    # Keep the test data belonging to the best model so the final plot
    # evaluates it on its own held-out rows, not on the last split drawn.
    best_x_test = None
    best_y_test = None

    for _ in tqdm(range(iterations)):

        x_train, x_test, y_train, y_test = split_data_in_training_testing(df_copy)

        validation_score, model = compute_optimal_hyperparameters(x_train, y_train)

        model.fit(x_train, y_train)

        test_score = model.score(x_test, y_test)

        scores_agree = abs(validation_score - test_score) < max_score_gap
        if scores_agree and test_score > best_test_score:
            best_model = model
            best_validation_score = validation_score
            best_test_score = test_score
            best_x_test, best_y_test = x_test, y_test

    if best_model is None:
        raise RuntimeError(
            'No train/test split met the selection criterion '
            '(|validation score - test score| < {}) in {} iteration(s).'.format(
                max_score_gap, iterations
            )
        )

    plot_model_accuracy_on_test_data(best_model, best_x_test, best_y_test)

    return best_model, best_validation_score, best_test_score

In [None]:
# Run the split search over 1000 random train/test splits, keep the returned
# model, and print the validation and test scores that came back with it.
model, best_validation_score, best_test_score = compute_best_data_split(df, 1000)
print(best_validation_score, best_test_score)