In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Compiling Individual Player Data

In [2]:
# Compile the individual player files into two large datasets to work with: one
# for midfielders/attackers (this cell) and one for defenders (compiled from the
# Player_Data/Defenders sub-folder).
att_path = 'Player_Data'

# Fail early with a clear message. Previously a missing folder silently skipped
# the loop and only surfaced afterwards as a NameError on `final_df`.
if not os.path.isdir(att_path):
    raise FileNotFoundError(f"Player data folder not found: {att_path!r}")

# Only .csv files sitting directly inside the folder are read (no recursion).
# NOTE: os.listdir returns names in arbitrary order, so the row labels of the
# concatenated frame depend on the filesystem the notebook runs on.
att_dataframe = [
    pd.read_csv(os.path.join(att_path, file))
    for file in os.listdir(att_path)
    if file.endswith('.csv')
]
if not att_dataframe:
    raise ValueError(f"No .csv files found in {att_path!r}")

# ignore_index=True discards each file's own index and renumbers rows 0..n-1.
final_df = pd.concat(att_dataframe, ignore_index=True)

# The frame's index is written as well (to_csv default), so reloading this file
# produces an extra unnamed index column.
final_df.to_csv(os.path.join("", "att_finaldat.csv"))

In [3]:
# Compile every per-player CSV in Player_Data/Defenders into one defenders frame.
def_path = os.path.join("Player_Data", "Defenders")

# Fail early with a clear message. Previously a missing folder silently skipped
# the loop and only surfaced afterwards as a NameError on `final_df_def`.
if not os.path.isdir(def_path):
    raise FileNotFoundError(f"Defender data folder not found: {def_path!r}")

# Only .csv files sitting directly inside the folder are read (no recursion).
# NOTE: os.listdir returns names in arbitrary order, so the row labels of the
# concatenated frame depend on the filesystem the notebook runs on.
def_dataframe = [
    pd.read_csv(os.path.join(def_path, file))
    for file in os.listdir(def_path)
    if file.endswith('.csv')
]
if not def_dataframe:
    raise ValueError(f"No .csv files found in {def_path!r}")

# ignore_index=True discards each file's own index and renumbers rows 0..n-1.
final_df_def = pd.concat(def_dataframe, ignore_index=True)

# The frame's index is written as well (to_csv default), so reloading this file
# produces an extra unnamed index column.
final_df_def.to_csv(os.path.join("", "def_finaldat.csv"))

# Data Exploration (Attackers)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the combined attacker/midfielder frame.
final_df.describe()

Unnamed: 0.1,Unnamed: 0,Matchweek,Minutes Played,Goals,Assists,Penalties Scored,Penalties Attempted,Shots,Shots on Target,Yellow Cards,...,loaned_in,loaned_out,offside,open_play_crosses,penalties_conceded,recoveries,tackled,tackles,target_missed,winning_goals
count,39673.0,39673.0,39673.0,39673.0,39673.0,39673.0,39673.0,39673.0,39673.0,39673.0,...,1971.0,1971.0,1971.0,1971.0,1971.0,1971.0,1971.0,1971.0,1971.0,1971.0
mean,57.789227,19.531394,64.916417,0.152345,0.09475,0.013813,0.017115,1.278552,0.446626,0.120107,...,0.0,0.0,0.210553,0.174531,0.003551,3.723491,1.562659,0.582445,0.414003,0.0345
std,48.740601,10.993223,30.468864,0.41344,0.3182,0.120958,0.134847,1.44896,0.754051,0.329328,...,0.0,0.0,0.552622,0.464999,0.059504,3.127762,1.711563,0.969419,0.718915,0.182557
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,10.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,44.0,19.0,80.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0
75%,87.0,29.0,90.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,0.0
max,249.0,38.0,90.0,4.0,4.0,2.0,2.0,12.0,6.0,2.0,...,0.0,0.0,5.0,4.0,1.0,17.0,10.0,6.0,5.0,1.0


In [5]:
# Preview the first five rows of the combined attacker/midfielder frame.
final_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Day,Matchweek,Venue,Result,Team,Opponent,Start,Position,...,loaned_in,loaned_out,offside,open_play_crosses,penalties_conceded,recoveries,tackled,tackles,target_missed,winning_goals
0,0,2023-08-12,Sat,1,Home,W 4–1,Brighton,Luton Town,N,FW,...,,,,,,,,,,
1,1,2023-08-19,Sat,2,Away,W 4–1,Brighton,Wolves,N,FW,...,,,,,,,,,,
2,2,2023-08-26,Sat,3,Home,L 1–3,Brighton,West Ham,Y,FW,...,,,,,,,,,,
3,3,2023-09-02,Sat,4,Home,W 3–1,Brighton,Newcastle Utd,Y,FW,...,,,,,,,,,,
4,4,2023-09-16,Sat,5,Away,W 3–1,Brighton,Manchester Utd,N,RW,...,,,,,,,,,,


We can see that some columns contain NaN values, so we want to check which columns contain nulls and how many each one has

In [6]:
# Count the NaNs in every column and keep only the columns that have at least
# one, laid out as a two-column table: 'Column Name' / 'NaN Count'.
nan_counts = final_df.isna().sum()
nan_columns = (
    nan_counts[nan_counts > 0]  # counts are integers, so compare with 0 rather than False
    .rename_axis('Column Name')
    .reset_index(name='NaN Count')
)
print(nan_columns)

                        Column Name  NaN Count
0                          Position          2
1                  attempted_passes      37702
2               big_chances_created      37702
3                big_chances_missed      37702
4   clearances_blocks_interceptions      37702
5                  completed_passes      37702
6                          dribbles      37702
7                          ea_index      37702
8            errors_leading_to_goal      37702
9    errors_leading_to_goal_attempt      37702
10                            fouls      37702
11                               id      37702
12                       key_passes      37702
13           kickoff_time_formatted      37702
14                        loaned_in      37702
15                       loaned_out      37702
16                          offside      37702
17                open_play_crosses      37702
18               penalties_conceded      37702
19                       recoveries      37702
20           

The 'Position' column only has 2 NaNs, but the other columns listed are NaN for the vast majority of observations (37,702 of 39,673 rows). I suspect this is due to differences in the data that is available across seasons: statistics such as big chances created are available for more recent seasons, but weren't available previously. We can probably remove the features for which the majority of observations are NaN.

In [7]:
# Turn the NaN summary table into a plain list of column names that will be
# removed from final_df. 'Position' is left out of the list because we would
# like to keep that feature for now (it only has a couple of gaps).
# It is excluded by name rather than by dropping row 0 of the table, so the
# result does not depend on where 'Position' happens to sit in that table.
nan_columns = [
    column for column in nan_columns['Column Name'] if column != 'Position'
]

In [8]:
# Remove the mostly-empty features collected in nan_columns.
final_df = final_df.drop(nan_columns, axis='columns')

We now want to investigate the two NaN's in the position feature

In [9]:
# Lift the column display limit so the incomplete rows can be inspected in full.
pd.set_option('display.max_columns', None)

# Rows whose 'Position' value is still missing.
position_missing = final_df['Position'].isna()
final_df.loc[position_missing]

Unnamed: 0.1,Unnamed: 0,Date,Day,Matchweek,Venue,Result,Team,Opponent,Start,Position,Minutes Played,Goals,Assists,Penalties Scored,Penalties Attempted,Shots,Shots on Target,Yellow Cards,Red Cards,Touches,Tackles,Interceptions,Blocks,xG,npxG,xAG,Shot Creating Actions,Goal Creating Actions,Passes Completed,Passes Attempted,Progressive Passes,Carries,Progressive Carries,Take-ons Attempted,Successful Take-ons,Passing Distance,Progressive Passing Distance,Short Passes Completed,Short Passes Attempted,Medium Passes Completed,Medium Passes Attempted,Long Passes Completed,Long Passes Attempted,Expected Assists,Key Passes,Passes into Final Third,Passes into Penalty Area,Crosses into Penalty Area,Live Pass,Dead Pass,Free Kick Pass,Through Balls,Switches,Crosses,Throw Ins Taken,Corners Taken,Passes Offside,Live SCA,Deadball SCA,Take-on SCA,Shot SCA,Foul SCA,Defense SCA,Live GCA,Deadball GCA,Take-on GCA,Shot GCA,Foul GCA,Defense GCA,Tackles Won,Defensive Third Tackles,Middle Third Tackles,Attacking Third Tackles,Dribblers Tackled,Dribblers Tackled Attempts,Challenges Lost,Shots Blocked,Passes Blocked,Clearances,Defensive Errors,Defensive Penalty Area Touches,Defensive Third Touches,Middle Third Touches,Attacking Third Touches,Penalty Area Touches,Carry Distance,Progressive Carry Distance,Final Third Carries,Carries into Penalty Area,Miscontrols,Dispossessed,Passes Received,Progressive Passes Received,bonus,bps,clean_sheets,creativity,ict_index,influence,kickoff_time,minutes,own_goals,round,saves,selected,threat,total_points,transfers_balance,transfers_in,transfers_out,value,kickoff_date
2469,76,2021-01-20,Wed,1,Home,W 2–0,Manchester City,Aston Villa,N,,32,0,0,0,0,2,0,0,0,12,0,0,0,0.1,0.1,0.0,1,1,5,6,0,5,0,0,0,39,3,3,3,0,0,0,0,0.0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,8,3,14.0,2.0,0,0,2,0,9,0,0,10,0,10.9,2.9,0.0,2021-01-20 18:00:00+00:00,31,0,19,0,109790,19.0,4,13192,24289,11097,91,2021-01-20
34170,122,2021-01-12,Tue,1,Home,L 0–1,Burnley,Manchester Utd,N,,26,0,0,0,0,0,0,0,0,21,0,1,0,0.0,0.0,0.0,1,0,15,18,2,10,0,1,0,235,72,9,10,4,4,1,2,0.0,0,2,0,0,17,1,0,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,4,12,5,0,58.0,30.0,0,0,0,0,15,1,0,2,0,0.9,0.4,2.6,2021-01-12 20:15:00+00:00,25,0,18,0,23787,0.0,1,-1159,115,1274,57,2021-01-12


By looking at the data manually, we can input positions for these two observations. 
* 2469 - Gabriel Jesus - LW
* 34170 - Dwight McNeil - LW 

In [10]:
# Fill in the missing positions that were looked up by hand:
#   Gabriel Jesus - Manchester City v Aston Villa, 2021-01-20 - LW
#   Dwight McNeil - Burnley v Manchester Utd,      2021-01-12 - LW
# The rows are located by their content rather than by row label (2469 / 34170).
# Those labels come from concatenating the player files in os.listdir order,
# which is arbitrary, so they are not stable across machines.
manual_lw_matches = {
    ('Manchester City', 'Aston Villa', '2021-01-20'),
    ('Burnley', 'Manchester Utd', '2021-01-12'),
}

position_missing = final_df['Position'].isna()
missing_matches = set(zip(
    final_df.loc[position_missing, 'Team'],
    final_df.loc[position_missing, 'Opponent'],
    final_df.loc[position_missing, 'Date'].astype(str),
))

# Refuse to guess: only the rows that were checked manually may be filled in.
unexpected = missing_matches - manual_lw_matches
if unexpected or position_missing.sum() > len(manual_lw_matches):
    raise ValueError(
        "Found missing positions that were not looked up by hand: "
        f"{sorted(missing_matches)} ({int(position_missing.sum())} rows)"
    )
final_df.loc[position_missing, 'Position'] = 'LW'

# Check that the imputation worked: this should display no rows.
final_df[final_df['Position'].isna()]

Unnamed: 0.1,Unnamed: 0,Date,Day,Matchweek,Venue,Result,Team,Opponent,Start,Position,Minutes Played,Goals,Assists,Penalties Scored,Penalties Attempted,Shots,Shots on Target,Yellow Cards,Red Cards,Touches,Tackles,Interceptions,Blocks,xG,npxG,xAG,Shot Creating Actions,Goal Creating Actions,Passes Completed,Passes Attempted,Progressive Passes,Carries,Progressive Carries,Take-ons Attempted,Successful Take-ons,Passing Distance,Progressive Passing Distance,Short Passes Completed,Short Passes Attempted,Medium Passes Completed,Medium Passes Attempted,Long Passes Completed,Long Passes Attempted,Expected Assists,Key Passes,Passes into Final Third,Passes into Penalty Area,Crosses into Penalty Area,Live Pass,Dead Pass,Free Kick Pass,Through Balls,Switches,Crosses,Throw Ins Taken,Corners Taken,Passes Offside,Live SCA,Deadball SCA,Take-on SCA,Shot SCA,Foul SCA,Defense SCA,Live GCA,Deadball GCA,Take-on GCA,Shot GCA,Foul GCA,Defense GCA,Tackles Won,Defensive Third Tackles,Middle Third Tackles,Attacking Third Tackles,Dribblers Tackled,Dribblers Tackled Attempts,Challenges Lost,Shots Blocked,Passes Blocked,Clearances,Defensive Errors,Defensive Penalty Area Touches,Defensive Third Touches,Middle Third Touches,Attacking Third Touches,Penalty Area Touches,Carry Distance,Progressive Carry Distance,Final Third Carries,Carries into Penalty Area,Miscontrols,Dispossessed,Passes Received,Progressive Passes Received,bonus,bps,clean_sheets,creativity,ict_index,influence,kickoff_time,minutes,own_goals,round,saves,selected,threat,total_points,transfers_balance,transfers_in,transfers_out,value,kickoff_date


In [11]:
# Preview the first eight rows now that the mostly-empty columns are gone.
final_df.head(8)

Unnamed: 0.1,Unnamed: 0,Date,Day,Matchweek,Venue,Result,Team,Opponent,Start,Position,Minutes Played,Goals,Assists,Penalties Scored,Penalties Attempted,Shots,Shots on Target,Yellow Cards,Red Cards,Touches,Tackles,Interceptions,Blocks,xG,npxG,xAG,Shot Creating Actions,Goal Creating Actions,Passes Completed,Passes Attempted,Progressive Passes,Carries,Progressive Carries,Take-ons Attempted,Successful Take-ons,Passing Distance,Progressive Passing Distance,Short Passes Completed,Short Passes Attempted,Medium Passes Completed,Medium Passes Attempted,Long Passes Completed,Long Passes Attempted,Expected Assists,Key Passes,Passes into Final Third,Passes into Penalty Area,Crosses into Penalty Area,Live Pass,Dead Pass,Free Kick Pass,Through Balls,Switches,Crosses,Throw Ins Taken,Corners Taken,Passes Offside,Live SCA,Deadball SCA,Take-on SCA,Shot SCA,Foul SCA,Defense SCA,Live GCA,Deadball GCA,Take-on GCA,Shot GCA,Foul GCA,Defense GCA,Tackles Won,Defensive Third Tackles,Middle Third Tackles,Attacking Third Tackles,Dribblers Tackled,Dribblers Tackled Attempts,Challenges Lost,Shots Blocked,Passes Blocked,Clearances,Defensive Errors,Defensive Penalty Area Touches,Defensive Third Touches,Middle Third Touches,Attacking Third Touches,Penalty Area Touches,Carry Distance,Progressive Carry Distance,Final Third Carries,Carries into Penalty Area,Miscontrols,Dispossessed,Passes Received,Progressive Passes Received,bonus,bps,clean_sheets,creativity,ict_index,influence,kickoff_time,minutes,own_goals,round,saves,selected,threat,total_points,transfers_balance,transfers_in,transfers_out,value,kickoff_date
0,0,2023-08-12,Sat,1,Home,W 4–1,Brighton,Luton Town,N,FW,13,1,0,0,0,2,1,0,0,10,0,0,0,0.9,0.9,0.1,1,0,6,6,1,8,0,1,0,62,9,5,5,1,1,0,0,0.0,1,0,1,0,5,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,6,2,24.0,1.0,0,0,0,0,7,1,0,25,0,10.6,8.0,32.2,2023-08-12 14:00:00+00:00,12,0,1,0,559185,37.0,5,0,0,0,60,2023-08-12
1,1,2023-08-19,Sat,2,Away,W 4–1,Brighton,Wolves,N,FW,24,0,0,0,0,1,0,0,0,11,0,0,2,0.1,0.1,0.0,1,0,2,4,0,4,0,2,1,11,0,2,3,0,0,0,0,0.0,0,0,0,0,3,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,1,0,0,2,2,5,4,1,30.0,9.0,0,0,1,0,7,0,0,-1,0,1.5,0.8,2.0,2023-08-19 14:00:00+00:00,23,0,2,0,593303,4.0,1,-16405,37439,53844,60,2023-08-19
2,2,2023-08-26,Sat,3,Home,L 1–3,Brighton,West Ham,Y,FW,90,0,0,0,0,6,4,0,0,32,0,0,0,0.6,0.6,0.0,2,0,19,21,1,21,0,1,0,211,26,15,15,4,5,0,0,0.0,0,0,1,0,17,4,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,22,6,70.0,17.0,0,0,2,2,25,5,0,-2,0,3.5,8.4,10.2,2023-08-26 16:30:00+00:00,90,0,3,0,506600,70.0,2,-95362,22125,117487,60,2023-08-26
3,3,2023-09-02,Sat,4,Home,W 3–1,Brighton,Newcastle Utd,Y,FW,80,3,0,0,0,4,4,0,0,27,0,0,1,0.8,0.8,0.0,1,0,17,18,3,23,2,2,2,252,64,11,11,4,5,1,1,0.0,0,2,1,0,17,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,11,15,3,163.0,75.0,2,0,3,1,26,3,3,81,1,3.1,15.9,103.4,2023-09-02 16:30:00+00:00,80,0,4,0,453114,52.0,17,-60023,11535,71558,59,2023-09-02
4,4,2023-09-16,Sat,5,Away,W 3–1,Brighton,Manchester Utd,N,RW,6,0,0,0,0,1,1,0,0,10,0,0,1,0.1,0.1,0.0,0,0,6,6,0,10,1,1,0,104,4,3,3,0,0,2,2,0.0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,5,5,1,73.0,33.0,1,0,1,2,10,1,0,1,0,1.9,2.2,3.2,2023-09-16 14:00:00+00:00,5,0,5,0,765758,17.0,1,272535,355471,82936,60,2023-09-16
5,5,2023-09-24,Sun,6,Home,W 3–1,Brighton,Bournemouth,Y,FW,45,0,0,0,0,1,1,0,0,15,0,0,0,0.0,0.0,0.1,2,0,6,8,0,9,0,0,0,87,4,4,6,0,0,1,1,0.3,1,0,0,0,8,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,8,6,1,13.0,6.0,1,0,3,1,11,2,0,3,0,11.1,2.6,6.2,2023-09-24 13:00:00+00:00,45,0,6,0,802885,9.0,1,34133,111860,77727,60,2023-09-24
6,6,2023-09-30,Sat,7,Away,L 1–6,Brighton,Aston Villa,Y,FW,45,0,0,0,0,0,0,0,0,16,0,0,1,0.0,0.0,0.0,0,0,11,11,1,8,1,1,0,121,16,7,7,3,3,0,0,0.0,0,1,0,0,7,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,8,5,0,35.0,13.0,0,0,1,1,9,0,0,-3,0,1.7,0.0,0.0,2023-09-30 11:30:00+00:00,45,0,7,0,779047,0.0,1,-28049,75147,103196,60,2023-09-30
7,7,2023-10-08,Sun,8,Home,D 2–2,Brighton,Liverpool,Y,FW,59,0,0,0,0,1,0,0,0,19,2,0,1,0.0,0.0,0.3,2,0,11,13,1,10,0,0,0,178,50,7,8,1,1,2,2,0.0,1,1,0,0,12,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,2,0,2,4,10,5,0,48.0,7.0,0,0,1,3,14,2,0,1,0,11.2,1.7,4.4,2023-10-08 13:00:00+00:00,59,0,8,0,709129,1.0,1,-71612,33271,104883,60,2023-10-08


A quick look at the first few rows yields the following observations
* The 'Unnamed: 0' column can be removed, because it is just the row index carried over from the individual player dataframes 
* The 'Date' column can be removed, since this information is duplicated in the 'kickoff_time' column; for the same reason we can remove the 'kickoff_date' column
* The 'minutes' column can be removed, as this gives essentially the same information as the 'Minutes Played' column (the two can differ by a minute, e.g. 13 vs 12 in the first row) 
* The 'round' column can be removed, as this gives largely the same information as the 'Matchweek' column (they can differ for some fixtures, e.g. row 2469 above has Matchweek 1 but round 19) 


In [15]:
# Columns identified above as index leftovers or duplicates of other features.
redundant_columns = ['Unnamed: 0', 'Date', 'kickoff_date', 'minutes', 'round']
final_df = final_df.drop(redundant_columns, axis='columns')
final_df.head()

Unnamed: 0,Day,Matchweek,Venue,Result,Team,Opponent,Start,Position,Minutes Played,Goals,Assists,Penalties Scored,Penalties Attempted,Shots,Shots on Target,Yellow Cards,Red Cards,Touches,Tackles,Interceptions,Blocks,xG,npxG,xAG,Shot Creating Actions,Goal Creating Actions,Passes Completed,Passes Attempted,Progressive Passes,Carries,Progressive Carries,Take-ons Attempted,Successful Take-ons,Passing Distance,Progressive Passing Distance,Short Passes Completed,Short Passes Attempted,Medium Passes Completed,Medium Passes Attempted,Long Passes Completed,Long Passes Attempted,Expected Assists,Key Passes,Passes into Final Third,Passes into Penalty Area,Crosses into Penalty Area,Live Pass,Dead Pass,Free Kick Pass,Through Balls,Switches,Crosses,Throw Ins Taken,Corners Taken,Passes Offside,Live SCA,Deadball SCA,Take-on SCA,Shot SCA,Foul SCA,Defense SCA,Live GCA,Deadball GCA,Take-on GCA,Shot GCA,Foul GCA,Defense GCA,Tackles Won,Defensive Third Tackles,Middle Third Tackles,Attacking Third Tackles,Dribblers Tackled,Dribblers Tackled Attempts,Challenges Lost,Shots Blocked,Passes Blocked,Clearances,Defensive Errors,Defensive Penalty Area Touches,Defensive Third Touches,Middle Third Touches,Attacking Third Touches,Penalty Area Touches,Carry Distance,Progressive Carry Distance,Final Third Carries,Carries into Penalty Area,Miscontrols,Dispossessed,Passes Received,Progressive Passes Received,bonus,bps,clean_sheets,creativity,ict_index,influence,kickoff_time,own_goals,saves,selected,threat,total_points,transfers_balance,transfers_in,transfers_out,value
0,Sat,1,Home,W 4–1,Brighton,Luton Town,N,FW,13,1,0,0,0,2,1,0,0,10,0,0,0,0.9,0.9,0.1,1,0,6,6,1,8,0,1,0,62,9,5,5,1,1,0,0,0.0,1,0,1,0,5,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,6,2,24.0,1.0,0,0,0,0,7,1,0,25,0,10.6,8.0,32.2,2023-08-12 14:00:00+00:00,0,0,559185,37.0,5,0,0,0,60
1,Sat,2,Away,W 4–1,Brighton,Wolves,N,FW,24,0,0,0,0,1,0,0,0,11,0,0,2,0.1,0.1,0.0,1,0,2,4,0,4,0,2,1,11,0,2,3,0,0,0,0,0.0,0,0,0,0,3,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,1,0,0,2,2,5,4,1,30.0,9.0,0,0,1,0,7,0,0,-1,0,1.5,0.8,2.0,2023-08-19 14:00:00+00:00,0,0,593303,4.0,1,-16405,37439,53844,60
2,Sat,3,Home,L 1–3,Brighton,West Ham,Y,FW,90,0,0,0,0,6,4,0,0,32,0,0,0,0.6,0.6,0.0,2,0,19,21,1,21,0,1,0,211,26,15,15,4,5,0,0,0.0,0,0,1,0,17,4,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,22,6,70.0,17.0,0,0,2,2,25,5,0,-2,0,3.5,8.4,10.2,2023-08-26 16:30:00+00:00,0,0,506600,70.0,2,-95362,22125,117487,60
3,Sat,4,Home,W 3–1,Brighton,Newcastle Utd,Y,FW,80,3,0,0,0,4,4,0,0,27,0,0,1,0.8,0.8,0.0,1,0,17,18,3,23,2,2,2,252,64,11,11,4,5,1,1,0.0,0,2,1,0,17,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,11,15,3,163.0,75.0,2,0,3,1,26,3,3,81,1,3.1,15.9,103.4,2023-09-02 16:30:00+00:00,0,0,453114,52.0,17,-60023,11535,71558,59
4,Sat,5,Away,W 3–1,Brighton,Manchester Utd,N,RW,6,0,0,0,0,1,1,0,0,10,0,0,1,0.1,0.1,0.0,0,0,6,6,0,10,1,1,0,104,4,3,3,0,0,2,2,0.0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,5,5,1,73.0,33.0,1,0,1,2,10,1,0,1,0,1.9,2.2,3.2,2023-09-16 14:00:00+00:00,0,0,765758,17.0,1,272535,355471,82936,60


We now want to take a look at the data types of our remaining features, and then make any adjustments if necessary

In [25]:
# Lift the row display limit so every feature is listed.
pd.set_option('display.max_rows', None)

# One row per remaining feature: its name and its pandas dtype.
final_df_datatypes = (
    final_df.dtypes
    .rename_axis('Column Name')
    .reset_index(name='Data Type')
)
final_df_datatypes

Unnamed: 0,Column Name,Data Type
0,Day,object
1,Matchweek,int64
2,Venue,object
3,Result,object
4,Team,object
5,Opponent,object
6,Start,object
7,Position,object
8,Minutes Played,int64
9,Goals,int64


Looking at this, the main column we want to change right away is 'kickoff_time', which should be converted to a datetime64 dtype. The rest we can leave for now

In [26]:
# Parse the kickoff timestamps (strings such as '2023-08-12 14:00:00+00:00')
# into proper datetime values.
final_df['kickoff_time'] = final_df['kickoff_time'].pipe(pd.to_datetime)

Prior to actually exploring the data, we need to separate our dataset into training and test sets, so we don't draw any conclusions from data that is supposed to be unseen. 

In [28]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible. train_test_split is imported in the notebook's import
# cell at the top.
# NOTE(review): rows are shuffled individually, so matches from the same
# player and season can land on both sides of the split - confirm this is
# intended for the goal-prediction task.
att_train, att_test = train_test_split(final_df, test_size=0.2, random_state=66)

In [29]:
# Preview the first five rows of the training split (row labels are the
# shuffled labels from final_df).
att_train.head()

Unnamed: 0,Day,Matchweek,Venue,Result,Team,Opponent,Start,Position,Minutes Played,Goals,Assists,Penalties Scored,Penalties Attempted,Shots,Shots on Target,Yellow Cards,Red Cards,Touches,Tackles,Interceptions,Blocks,xG,npxG,xAG,Shot Creating Actions,Goal Creating Actions,Passes Completed,Passes Attempted,Progressive Passes,Carries,Progressive Carries,Take-ons Attempted,Successful Take-ons,Passing Distance,Progressive Passing Distance,Short Passes Completed,Short Passes Attempted,Medium Passes Completed,Medium Passes Attempted,Long Passes Completed,Long Passes Attempted,Expected Assists,Key Passes,Passes into Final Third,Passes into Penalty Area,Crosses into Penalty Area,Live Pass,Dead Pass,Free Kick Pass,Through Balls,Switches,Crosses,Throw Ins Taken,Corners Taken,Passes Offside,Live SCA,Deadball SCA,Take-on SCA,Shot SCA,Foul SCA,Defense SCA,Live GCA,Deadball GCA,Take-on GCA,Shot GCA,Foul GCA,Defense GCA,Tackles Won,Defensive Third Tackles,Middle Third Tackles,Attacking Third Tackles,Dribblers Tackled,Dribblers Tackled Attempts,Challenges Lost,Shots Blocked,Passes Blocked,Clearances,Defensive Errors,Defensive Penalty Area Touches,Defensive Third Touches,Middle Third Touches,Attacking Third Touches,Penalty Area Touches,Carry Distance,Progressive Carry Distance,Final Third Carries,Carries into Penalty Area,Miscontrols,Dispossessed,Passes Received,Progressive Passes Received,bonus,bps,clean_sheets,creativity,ict_index,influence,kickoff_time,own_goals,saves,selected,threat,total_points,transfers_balance,transfers_in,transfers_out,value
10000,Sun,32,Away,L 1–2,Crystal Palace,Leicester City,Y,DM,90,0,0,0,0,1,0,0,0,56,2,1,4,0.1,0.1,0.2,2,0,31,37,6,26,1,8,5,523,144,13,15,15,16,2,3,0.1,2,2,0,0,36,1,1,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,2,1,1,3,2,0,3,16,31,10,2,160.0,66.0,3,0,1,1,28,3,0,14,0,24.2,8.1,21.2,2022-04-10 13:00:00+00:00,0,0,6955,36.0,2,27,343,316,54
24977,Sat,29,Away,L 1–2,Bournemouth,Liverpool,N,LW,23,0,0,0,0,0,0,0,0,10,0,1,0,0.0,0.0,0.0,0,0,5,7,1,5,0,1,0,47,21,2,4,1,1,0,0,0.0,0,1,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,4,3,0,50.0,29.0,0,0,0,0,4,0,0,2,0,0.8,0.4,2.8,2020-03-07 12:30:00+00:00,0,0,24392,0.0,1,-359,133,492,52
37756,Sun,37,Away,D 0–0,Huddersfield,Manchester City,Y,CM,90,0,0,0,0,0,0,0,0,36,4,9,2,0.0,0.0,0.0,0,0,10,17,0,13,0,0,0,106,39,6,8,2,5,0,2,0.0,0,0,0,0,17,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,1,0,1,1,0,2,4,0,8,20,11,6,0,58.0,40.0,0,0,1,2,7,0,0,13,1,3.6,2.6,22.8,2018-05-06 12:30:00+00:00,0,0,10349,0.0,3,-47,129,176,44
18759,Sun,34,Away,D 2–2,Southampton,Brighton,N,LM,25,0,0,0,0,1,0,0,0,17,0,0,0,0.0,0.0,0.1,1,0,9,11,1,10,0,0,0,87,25,6,6,1,2,0,0,0.2,1,0,1,1,11,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,3,6,8,2,65.0,32.0,1,0,2,2,14,3,0,3,0,15.1,2.6,5.6,2022-04-24 13:00:00+00:00,0,0,19229,5.0,1,-1446,288,1734,58
168,Sun,38,Home,W 5–0,Manchester City,Norwich City,Y*,LM,84,0,0,0,0,5,1,0,0,73,3,0,1,0.4,0.4,0.1,8,2,49,56,10,47,2,1,1,675,104,29,32,18,20,1,2,0.1,1,6,3,0,55,1,0,0,0,0,0,0,0,5,0,1,2,0,0,2,0,0,0,0,0,3,0,3,0,0,0,0,0,1,0,0,0,5,27,41,17,211.0,103.0,3,0,5,0,60,12,0,18,1,18.3,13.2,12.2,2020-07-26 15:00:00+00:00,0,0,801163,101.0,3,198047,228466,30419,75


Our initial goal is to use the data available to predict the number of goals scored. We can start by exploring the relationships between each of the features and goals scored. We start off by looking at the relationship between the 'Day' feature and the 'Goals' feature. 

In [31]:
# Distinct values of the 'Day' feature present in the training split.
att_train['Day'].unique()

array(['Sun', 'Sat', 'Mon', 'Wed', 'Fri', 'Tue', 'Thu'], dtype=object)