In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import timeit

In [132]:
#Position Booleans
goalkeeper = False
defender = False
midfielder = False
forward = True


if goalkeeper == True:
    position = 1
    position_str = 'goalkeeper'
elif defender == True:
    position = 2
    position_str = 'defender'
elif midfielder == True:
    position = 3
    position_str = 'midfielder'
elif forward == True:
    position = 4
    position_str = 'forward'
else:
    print("Error: Invalid position value ")
    sys.exit()
    
if goalkeeper == False:
    column_names = ['element','GW','minutes','goals_scored','assists','clean_sheets','goals_conceded','own_goals',
                    'penalties_missed','yellow_cards','red_cards','bonus','bps','influence','creativity','threat',
                    'ict_index','open_play_crosses','big_chances_created','clearances_blocks_interception','recoveries',
                    'key_passes','tackles','attempted_passes','completed_passes','big_chances_missed',
                    'errors_leading_to_goal','errors_leading_to_goal_attempt','tackled','offside','target_missed',
                    'fouls','dribbles','total_points','threshold']  #added threshold
    
    feature_names = ['minutes','goals_scored','assists','clean_sheets','goals_conceded','own_goals',
                    'penalties_missed','yellow_cards','red_cards','bonus','bps','influence','creativity','threat',
                    'ict_index','open_play_crosses','big_chances_created','clearances_blocks_interception','recoveries',
                    'key_passes','tackles','attempted_passes','completed_passes','big_chances_missed',
                    'errors_leading_to_goal','errors_leading_to_goal_attempt','tackled','offside','target_missed',
                    'fouls','dribbles']
    
elif goalkeeper == True:
    column_names = ['element','GW','minutes','clean_sheets','goals_conceded','own_goals','penalties_missed',
                    'yellow_cards','red_cards','bonus','bps','influence','creativity','threat','ict_index',
                    'big_chances_created','clearances_blocks_interception','recoveries','key_passes','tackles',
                    'attempted_passes','completed_passes','big_chances_missed','errors_leading_to_goal','errors_leading_to_goal_attempt',
                    'tackled','penalties_saved','saves','penalties_conceded','total_points','threshold']  #added threshold
    
    feature_names = ['minutes','clean_sheets','goals_conceded','own_goals','penalties_missed',
                    'yellow_cards','red_cards','bonus','bps','influence','creativity','threat','ict_index',
                    'big_chances_created','clearances_blocks_interception','recoveries','key_passes','tackles',
                    'attempted_passes','completed_passes','big_chances_missed','errors_leading_to_goal','errors_leading_to_goal_attempt',
                    'tackled','penalties_saved','saves','penalties_conceded']
    
    
    
#Import dataframe from csv 
#position_str is player position and date is the day the dataset was prepared in form YYYY-MM-DD (e.g 2018-12-27)
date = '2018-12-26'

try:
    df = pd.read_csv('data/'+str(position_str)+'/'+str(date)+'.csv',index_col='Unnamed: 0') #index_col = first col
except FileNotFoundError:
    print("ERROR FileNotFoundError: Check player position and date is an available dataset")
    print("E.g. data/forward/2018-12-26.csv")



In [133]:
#Further dataset cleaning
#Only to include players having played at least 1 minute of an individual gameweek 
print("Size of dataset (All): ", str(len(df.index)))
df = df[df['minutes']>0]
print("Size of dataset (minutes >0): ",str(len(df.index)))

df
pd.options.display.max_columns = None  #Line required to show all columns in df output below
display(df.head())

Size of dataset (All):  1466
Size of dataset (minutes >0):  797


Unnamed: 0,element,GW,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_missed,yellow_cards,red_cards,bonus,bps,influence,creativity,threat,ict_index,open_play_crosses,big_chances_created,clearances_blocks_interception,recoveries,key_passes,tackles,attempted_passes,completed_passes,big_chances_missed,errors_leading_to_goal,errors_leading_to_goal_attempt,tackled,offside,target_missed,fouls,dribbles,total_points,threshold
2,21,3,15,1,0,0,0,0,0,0,0,3,30,40.2,0.3,34.0,7.5,0,0,0,3,0,1,4,2,0,0,0,0,0,0,0,0,8,1
3,21,4,6,0,0,0,0,0,0,0,0,0,3,0.4,0.8,0.0,0.1,0,0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,1,0
4,21,5,10,0,0,0,1,0,0,0,0,0,1,0.0,10.8,6.0,1.6,0,0,0,2,1,0,3,3,0,0,0,1,0,1,1,0,1,0
5,21,6,10,0,0,0,0,0,0,0,0,0,1,1.0,0.9,18.0,2.0,0,0,1,2,0,0,6,6,0,0,0,1,0,0,1,0,1,0
6,21,7,13,0,0,0,0,0,0,0,0,0,3,2.8,10.5,0.0,1.3,0,0,0,1,1,0,6,4,0,0,0,1,0,0,0,0,1,0


In [134]:
#Essentially, we want to use the values of the features (e.g. goals_scored, assists etc) to determine whether or
#not a player in a given week will achieve > threshold number of points (i.e. this is a classification task).
#This is what we will do with the data of past GW's however firstly we have to consider how to predict player 
#performance for their next fixture. 

#At the moment we will use a simple average to predict the feature value for their next GW given a 'form' defined
#as 4, 5 or 6 games. For example, we will take the average of feature 'assists' from the previous 6 GW's (i.e. GW's 4-9)
#before a given GW (GW 10) and assess how close our prediction is by looking at the difference between the
#predicted value and the actual value. 

form_weeks_num = 6   #Number of form weeks
weeks = np.arange(7,39)  #List of weeks to predict  


feature_names_plus_GW_ID = feature_names.copy()   #Had to insert these two lines as 'insert' was getting annoying whilst debugging
feature_names_plus_GW_ID.insert(0,'GW')           #Used in series below
feature_names_plus_GW_ID.insert(0,'element')

df_pred = pd.DataFrame(columns=column_names)   #Initalise datafame with predicted values of features


start_time = timeit.default_timer()  #For timing how long output takes to complete

#Start of loop that will fill df_pred
for element in df['element'].unique():    #Loops through player ID's
    
    for week in weeks:    #Loops through weeks to predict
        GW_pred_arr = [element,week]
        av_weeks = np.arange(week-form_weeks_num, week)  #weeks list of week-form_weeks_num

        for feature in feature_names:    #Loops through features
            av_array = []

            for gw in av_weeks:       #Loops through GW before predicted week
                if gw in df[df['element']==element]['GW'].values:  #Needed incase player doesn't play certain weeks
                    av_array.append(df[df['GW']==gw][feature].values[0])   #Notation necessary for getting value

            if len(av_array)>0:      #No need to add if player didn't play
                GW_pred_arr.append(np.mean(av_array))
            else:
                break

        if len(GW_pred_arr)>2:  #Has to be 2 as GW_pred_arr already [element,GW]
            series = pd.Series(GW_pred_arr,index=feature_names_plus_GW_ID)  #Append to df_pred
            df_pred = df_pred.append(series,ignore_index=True)

            
elapsed = timeit.default_timer() - start_time  #Finish timer
print("Take taken for completion: ", elapsed," seconds")

display(df_pred.head())

Take taken for completion:  131.39260254700275


Unnamed: 0,element,GW,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_missed,yellow_cards,red_cards,bonus,bps,influence,creativity,threat,ict_index,open_play_crosses,big_chances_created,clearances_blocks_interception,recoveries,key_passes,tackles,attempted_passes,completed_passes,big_chances_missed,errors_leading_to_goal,errors_leading_to_goal_attempt,tackled,offside,target_missed,fouls,dribbles,total_points,threshold
0,21.0,7.0,10.25,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.75,8.75,10.4,3.2,14.5,2.8,0.0,0.0,0.25,1.75,0.25,0.25,4.25,3.75,0.0,0.0,0.0,0.5,0.0,0.25,0.5,0.0,,
1,21.0,8.0,10.8,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.6,7.6,8.88,4.66,11.6,2.5,0.0,0.0,0.2,1.6,0.4,0.2,4.6,3.8,0.0,0.0,0.0,0.6,0.0,0.2,0.4,0.0,,
2,21.0,9.0,19.166667,0.166667,0.166667,0.0,0.333333,0.0,0.0,0.0,0.0,0.5,10.0,11.733333,9.4,10.333333,3.133333,0.0,0.0,0.333333,1.666667,0.833333,0.5,7.833333,6.0,0.0,0.0,0.0,0.5,0.0,0.166667,0.333333,0.0,,
3,21.0,10.0,20.0,0.0,0.2,0.0,0.4,0.0,0.0,0.0,0.0,0.0,6.0,6.04,11.22,5.6,2.26,0.0,0.0,0.4,1.4,1.0,0.4,8.6,6.8,0.0,0.0,0.0,0.6,0.0,0.2,0.4,0.0,,
4,21.0,11.0,23.2,0.0,0.2,0.0,0.6,0.0,0.0,0.0,0.0,0.0,5.8,6.6,11.12,5.6,2.32,0.0,0.0,0.4,1.6,1.0,0.4,8.4,6.4,0.0,0.0,0.0,1.2,0.0,0.2,0.4,0.4,,


In [None]:
#......... continue by dropping certain columns in df_pred and graphing + finding difference between actual weekly score (if played) and predicted score. Try cleaning df_pred to integers/floats etc..................