In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from statsmodels.tsa.arima_model import ARIMA
from tqdm import tqdm_notebook as tqdm

In [None]:
# Load the raw export without its 'position' and 'height' columns.
# NOTE(review): presumably these are dropped so they do not clash with
# same-named columns of the processed file during the merge — confirm.
raw_df = (
    pd.read_csv('../../inputs/raw_data.csv')
      .drop(columns=['position', 'height'])
)

# Inner-join the processed rows with the raw rows that share the same player
# name and the same two seasons of match / goal counts.
df = pd.read_csv('../../inputs/final_processed_data.csv').merge(
    raw_df,
    how='inner',
    on=['name', 'matches_year1', 'goals_year1', 'matches_year2', 'goals_year2'],
)

# 'prediction_season' is a '/'-separated string; keep the part after the first
# slash as an integer. The vectorised .str accessor replaces a row-wise apply.
df['prediction_season'] = (
    df['prediction_season'].str.split('/').str[1].astype('int64')
)

df.head()

In [None]:
# Reshape `df` into one row per (player, target season) that carries four
# consecutive seasons of matches/goals plus the goals-per-match target.
#
# For a target season S the row is assembled from three rows of `df`:
#   * the row for S-2 supplies *_year1 and *_year2,
#   * the row for S-1 supplies *_year3 (its *_year2 values),
#   * the row for S   supplies *_year4 (its *_year2 values) and the target
#     (its goals_per_match_year3).
# A player/season combination is skipped unless all three rows exist.
time_series_columns = ['name', 'matches_year1', 'goals_year1', 'matches_year2', 'goals_year2',
                       'matches_year3', 'goals_year3', 'matches_year4', 'goals_year4',
                       'goals_per_match_year5', 'prediction_season']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append is gone from current pandas, and appending row by row
# copies the whole frame on every iteration.
time_series_records = []

for player in tqdm(list(df['name'].unique())):

    # Filter the frame once per player instead of once per looked-up value.
    player_rows = df[df['name'] == player]

    # If a season has several rows for this player, the first one is used.
    first_row_by_season = {
        season_key: season_rows.iloc[0]
        for season_key, season_rows in player_rows.groupby('prediction_season')
    }

    for season in [2015, 2016, 2017, 2018]:

        required_seasons = (season - 2, season - 1, season)
        if not all(s in first_row_by_season for s in required_seasons):
            continue

        two_back = first_row_by_season[season - 2]
        one_back = first_row_by_season[season - 1]
        current = first_row_by_season[season]

        time_series_records.append({
            'name': player,
            'matches_year1': two_back['matches_year1'],
            'goals_year1': two_back['goals_year1'],
            'matches_year2': two_back['matches_year2'],
            'goals_year2': two_back['goals_year2'],
            'matches_year3': one_back['matches_year2'],
            'goals_year3': one_back['goals_year2'],
            'matches_year4': current['matches_year2'],
            'goals_year4': current['goals_year2'],
            'goals_per_match_year5': current['goals_per_match_year3'],
            'prediction_season': season,
        })

# Passing `columns` fixes the column order and keeps the schema even when no
# record qualified.
time_series_df = pd.DataFrame(time_series_records, columns=time_series_columns)

print("Initial # of rows:", len(df))
print("# of rows after coverting to 4 seasons of past data:", len(time_series_df))

time_series_df.head()

In [None]:
def compute_ARIMA(row, year1, year2, year3, year4, p_value, d_value, q_value):
    """Fit an ARIMA model on four values of `row` and return its forecast.

    Parameters
    ----------
    row : mapping-like
        Object indexed by the four column names below (a DataFrame row when
        used through `DataFrame.apply(..., axis=1)`).
    year1, year2, year3, year4 : str
        Keys of the four observations, in chronological order.
    p_value, d_value, q_value : int
        The ARIMA `order` tuple passed to the model.

    Returns
    -------
    The element `[0][0]` of what `forecast()` returns.
    NOTE(review): with the legacy `statsmodels.tsa.arima_model.ARIMA` API this
    should be the one-step-ahead point forecast — confirm against the pinned
    statsmodels version, since `fit(disp=0)` and this return shape belong to
    that older interface.
    """
    history = [float(row[column]) for column in (year1, year2, year3, year4)]
    model_order = (p_value, d_value, q_value)
    fitted_model = ARIMA(history, order=model_order).fit(disp=0)
    forecast_output = fitted_model.forecast()
    return forecast_output[0][0]

def compute_optimal_hyperparameters(time_series_df):
    """Pick the ARIMA order that best predicts goals per match.

    For every candidate `(p, d, q)` order, the function forecasts
    `matches_year5` and `goals_year5` for each row with `compute_ARIMA`,
    divides goals by matches, and scores that ratio against the
    `goals_per_match_year5` column with `r2_score`. The input frame is not
    modified; each candidate works on a copy.

    Parameters
    ----------
    time_series_df : pandas.DataFrame
        Must contain `matches_year1..4`, `goals_year1..4` and
        `goals_per_match_year5`.

    Returns
    -------
    tuple
        `(p, d, q)` of the candidate with the highest R² score.

    Raises
    ------
    RuntimeError
        If no candidate could be fitted and scored.
    """

    warnings.filterwarnings("ignore")

    # Candidate (p, d, q) orders, tried in this sequence.
    candidate_orders = [
        (0, 0, 0), (0, 0, 1), (0, 0, 2),
        (0, 1, 0), (0, 1, 1), (0, 2, 0),
        (1, 0, 0), (1, 1, 0),
    ]

    # R² has no lower bound, so start from -inf rather than a fixed floor;
    # otherwise a run where every score is very poor would select nothing.
    best_score = float('-inf')
    best_order = None

    for p_value, d_value, q_value in tqdm(candidate_orders):

        df = time_series_df.copy()

        try:

            df['matches_year5'] = df.apply(lambda row: compute_ARIMA(row, 'matches_year1', 'matches_year2',
                                           'matches_year3', 'matches_year4', p_value, d_value, q_value), axis=1)

            df['goals_year5'] = df.apply(lambda row: compute_ARIMA(row, 'goals_year1', 'goals_year2',
                                         'goals_year3', 'goals_year4', p_value, d_value, q_value), axis=1)

            df['predicted_goals_per_match_year5'] = df['goals_year5'] / df['matches_year5']

            score = r2_score(df['goals_per_match_year5'].values, df['predicted_goals_per_match_year5'].values)

        except Exception as error:
            # Best-effort search: a candidate that cannot be fitted or scored
            # is reported and skipped. `Exception` (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            print('Parameters (%d, %d, %d) triggered an exception: %r' % (p_value, d_value, q_value, error))
            continue

        if score > best_score:
            best_score = score
            best_order = (p_value, d_value, q_value)

    if best_order is None:
        raise RuntimeError('No ARIMA order could be fitted and scored on the given data')

    return best_order

In [None]:
# Method to plot model accuracy on test data
def plot_model_accuracy_on_test_data(y_pred, y_test):
    """Show two figures comparing predictions with true values.

    The first figure draws both sequences as lines against their position;
    the second is a scatter of actual (x) versus predicted (y) values with a
    dashed diagonal spanning the range of `y_test`.

    Parameters
    ----------
    y_pred : sequence of numbers
        Predicted values.
    y_test : sequence of numbers
        True values, in the same order as `y_pred`.
    """

    # Figure 1: overlaid line plots.
    plt.plot(y_pred, label='prediction')
    plt.plot(y_test, alpha=0.3, label='true')
    plt.legend()
    plt.show()

    # Figure 2: actual vs predicted scatter with the y = x reference line.
    fig, ax = plt.subplots()
    ax.set_title('Actual value vs predicted value (goals per match year 5)')
    ax.scatter(y_test, y_pred, alpha=0.3, edgecolors=(0, 0, 0))
    diagonal_ends = [min(y_test), max(y_test)]
    ax.plot(diagonal_ends, diagonal_ends, 'k--', lw=4)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()

In [None]:
# Search the candidate ARIMA orders, then forecast year 5 with the winner.
p_value, d_value, q_value = compute_optimal_hyperparameters(time_series_df)

print('Optimal hyperparameters (%d, %d, %d)' % (p_value, d_value, q_value))

df = time_series_df.copy()

# Forecast column -> the four history columns it is predicted from.
forecast_inputs = {
    'matches_year5': ('matches_year1', 'matches_year2', 'matches_year3', 'matches_year4'),
    'goals_year5': ('goals_year1', 'goals_year2', 'goals_year3', 'goals_year4'),
}

for target_column, history_columns in forecast_inputs.items():
    df[target_column] = df.apply(
        lambda row: compute_ARIMA(row, *history_columns, p_value, d_value, q_value),
        axis=1,
    )

# Predicted ratio = forecast goals / forecast matches.
df['predicted_goals_per_match_year5'] = df['goals_year5'] / df['matches_year5']

actual_ratio = df['goals_per_match_year5'].values
predicted_ratio = df['predicted_goals_per_match_year5'].values

score = r2_score(actual_ratio, predicted_ratio)

print("r2_score:", score)

plot_model_accuracy_on_test_data(predicted_ratio, actual_ratio)

In [None]:
# Attach one set of player attributes (position flags and height) per name.
attribute_columns = ['center', 'back', 'wing', 'line', 'height']

processed_df = pd.read_csv('../../inputs/final_processed_data.csv')

# One row per player: the first value of each attribute within the name group.
processed_df = (
    processed_df.groupby('name')
                .agg({column: 'first' for column in attribute_columns})
                .reset_index()
)

# Drop attribute columns left over from a previous execution of this cell, so
# that re-running it does not produce '_x' / '_y' suffixed duplicates. On a
# first run the columns are absent and the drop is a no-op.
df = (
    df.drop(columns=attribute_columns, errors='ignore')
      .merge(processed_df, on='name', how='left')
)
df.head()

In [None]:
# Split the prediction frame into subsets: one per position flag, plus the
# bottom and top quartiles of height, of year-4 matches and of year-4 goals.
centre_df = df.loc[df['center'] == 1]
back_df = df.loc[df['back'] == 1]
wing_df = df.loc[df['wing'] == 1]
line_df = df.loc[df['line'] == 1]

# Strict inequalities: rows exactly on a quartile boundary fall in neither subset.
height_q25 = df['height'].quantile(.25)
height_q75 = df['height'].quantile(.75)
short_players = df.loc[df['height'] < height_q25]
tall_players = df.loc[df['height'] > height_q75]

matches_q25 = df['matches_year4'].quantile(.25)
matches_q75 = df['matches_year4'].quantile(.75)
rarely_playing = df.loc[df['matches_year4'] < matches_q25]
often_playing = df.loc[df['matches_year4'] > matches_q75]

goals_q25 = df['goals_year4'].quantile(.25)
goals_q75 = df['goals_year4'].quantile(.75)
low_scoring = df.loc[df['goals_year4'] < goals_q25]
high_scoring = df.loc[df['goals_year4'] > goals_q75]

In [None]:
# R² of predicted vs actual goals per match for rows with center == 1
r2_score(y_true=centre_df['goals_per_match_year5'].values,
         y_pred=centre_df['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for rows with back == 1
r2_score(y_true=back_df['goals_per_match_year5'].values,
         y_pred=back_df['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for rows with wing == 1
r2_score(y_true=wing_df['goals_per_match_year5'].values,
         y_pred=wing_df['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for rows with line == 1
r2_score(y_true=line_df['goals_per_match_year5'].values,
         y_pred=line_df['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for the bottom height quartile
r2_score(y_true=short_players['goals_per_match_year5'].values,
         y_pred=short_players['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for the top height quartile
r2_score(y_true=tall_players['goals_per_match_year5'].values,
         y_pred=tall_players['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for the bottom quartile of year-4 matches
r2_score(y_true=rarely_playing['goals_per_match_year5'].values,
         y_pred=rarely_playing['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for the top quartile of year-4 matches
r2_score(y_true=often_playing['goals_per_match_year5'].values,
         y_pred=often_playing['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for the bottom quartile of year-4 goals
r2_score(y_true=low_scoring['goals_per_match_year5'].values,
         y_pred=low_scoring['predicted_goals_per_match_year5'].values)

In [None]:
# R² of predicted vs actual goals per match for the top quartile of year-4 goals
r2_score(y_true=high_scoring['goals_per_match_year5'].values,
         y_pred=high_scoring['predicted_goals_per_match_year5'].values)