In [1]:
# Packages
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
# Import game level stats for GK and FP
## 2017 and 2020 dropped from project. Inconsistent statistics available for 2017, No all star game in 2020
S22_GK_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S22_GK_stats.csv')
S21_GK_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S21_GK_stats.csv')
S19_GK_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S19_GK_stats.csv')
S18_GK_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S18_GK_stats.csv')
S22_FP_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S22_FP_stats.csv')
S21_FP_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S21_FP_stats.csv')
S19_FP_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S19_FP_stats.csv')
S18_FP_stats = pd.read_csv('C:/Users/scott/OneDrive/Documents/GitHub/Capstone/Data/S18_FP_stats.csv')

In [3]:
GK_stats = pd.concat([S22_GK_stats,S21_GK_stats,S19_GK_stats,S18_GK_stats], ignore_index= True)
# GK_stats.to_csv('C:/Users/scott/OneDrive/BANA/Capstone/Data/GK_stats.csv') # save file
FP_stats = pd.concat([S22_FP_stats,S21_FP_stats,S19_FP_stats,S18_FP_stats], ignore_index= True)
# FP_stats.to_csv('C:/Users/scott/OneDrive/BANA/Capstone/Data/FP_stats.csv') # save file

In [4]:
# Add Captain Indicator
FP_stats['Captain'] = 0
FP_stats.loc[FP_stats['Start'] == "Y*", 'Captain'] = 1
GK_stats['Captain'] = 0
GK_stats.loc[GK_stats['Start'] == "Y*", 'Captain'] = 1
# Add Count of Games Played (will be aggregated)
FP_stats['Games_Played'] = 1
GK_stats['Games_Played'] = 1

In [5]:
# Isolate Y/N from start status
FP_stats['Start'] = FP_stats['Start'].str[0]
GK_stats['Start'] = GK_stats['Start'].str[0]
# Convert Start to bool
FP_stats['Start'] = FP_stats['Start'].replace({'Y':1,'N':0})
GK_stats['Start'] = GK_stats['Start'].replace({'Y':1,'N':0})
# Summarize team performance using match result scoring
FP_stats['Win_Loss_Draw'] = FP_stats['Win_Loss_Draw'].replace({'L':0, 'D':1, 'W':3})
GK_stats['Win_Loss_Draw'] = GK_stats['Win_Loss_Draw'].replace({'L':0, 'D':1, 'W':3})
# Drop stats that occur after All star game 
FP_stats = FP_stats[FP_stats['Before_All_Star'] == True]
GK_stats = GK_stats[GK_stats['Before_All_Star'] == True]

In [6]:
# Drop fields that are not needed
FP_stats = FP_stats.drop(['Unnamed: 0','Date','Day','Comp','Round','Venue','Result','Squad','Opponent','Pos','Match Report'], axis = 1)
GK_stats = GK_stats.drop(['Unnamed: 0','Date','Day','Comp','Round','Venue','Result','Squad','Opponent','Pos','Match Report'], axis = 1)

In [7]:
print(FP_stats.columns)
print(GK_stats.columns)

Index(['Start', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Press', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xA',
       'SCA', 'GCA', 'Cmp', 'Att', 'Cmp%', 'Prog', 'Carries', 'Prog.1', 'Succ',
       'Att.1', 'fbref_ID', 'Win_Loss_Draw', 'Season', 'Before_All_Star',
       'Captain', 'Games_Played'],
      dtype='object')
Index(['Start', 'Min', 'SoTA', 'GA', 'Saves', 'Save%', 'CS', 'PSxG', 'PKatt',
       'PKA', 'PKsv', 'PKm', 'Cmp', 'Att', 'Cmp%', 'Att.1', 'Thr', 'Launch%',
       'AvgLen', 'Att.2', 'Launch%.1', 'AvgLen.1', 'Opp', 'Stp', 'Stp%',
       '#OPA', 'AvgDist', 'fbref_ID', 'Win_Loss_Draw', 'Season',
       'Before_All_Star', 'Captain', 'Games_Played'],
      dtype='object')


In [8]:
# Summarize player stats by ID, season, before all star game
## Aggregation dicts (could this be simplified by using tuples?)
    # Should more of the aggregations be mean instead of sum to account for...
FP_aggs = {'Start':'sum', 'Min':'sum', 'Gls':'sum', 'Ast':'sum', 'PK':'sum', 'PKatt':'sum', 'Sh':'sum', 'SoT':'sum', 'CrdY':'sum', 'CrdR':'sum', 'Touches':'sum', 'Press':'sum', 'Tkl':'sum', 'Int':'sum', 'Blocks':'sum', 'xG':'mean', 'npxG':'mean', 'xA':'mean', 'SCA':'sum', 'GCA':'sum', 'Cmp':'sum', 'Att':'sum', 'Cmp%':'mean', 'Prog':'sum', 'Carries':'sum', 'Prog.1':'sum', 'Succ':'sum', 'Att.1':'sum', 'Win_Loss_Draw':'sum', 'Captain':'sum', 'Games_Played':'sum'}
GK_aggs = {'Start':'sum', 'Min':'sum', 'SoTA':'sum', 'GA':'sum', 'Saves':'sum', 'Save%':'mean', 'CS':'sum', 'PSxG':'mean', 'PKatt':'sum', 'PKA':'sum', 'PKsv':'sum', 'PKm':'sum', 'Cmp':'sum', 'Att':'sum', 'Cmp%':'mean', 'Att.1':'sum', 'Thr':'sum', 'Launch%':'mean', 'AvgLen':'mean', 'Att.2':'sum', 'Launch%.1':'mean', 'AvgLen.1':'mean', 'Opp':'sum', 'Stp':'sum', 'Stp%':'mean', '#OPA':'sum', 'AvgDist':'mean', 'Win_Loss_Draw':'sum', 'Captain':'sum', 'Games_Played':'sum'}

FP_stats = FP_stats.groupby(['fbref_ID', 'Season']).agg(FP_aggs).reset_index()
GK_stats = GK_stats.groupby(['fbref_ID', 'Season']).agg(GK_aggs).reset_index()

In [9]:
# Add All-Star Indicator
AS_roster = pd.read_csv('C:/Users/scott/OneDrive/BANA/Capstone/Data/all_star_rosters.csv', encoding = 'iso-8859-1')
FP_stats = FP_stats.merge(AS_roster[['Year','fbref_ID']], how = 'left', left_on = ['Season','fbref_ID'], right_on = ['Year','fbref_ID']).rename(columns={'Year':'all_star'})
FP_stats['all_star'] = FP_stats['all_star'].fillna(False).replace([2018,2019,2021,2022],True) 
GK_stats = GK_stats.merge(AS_roster[['Year','fbref_ID']], how = 'left', left_on = ['Season','fbref_ID'], right_on = ['Year','fbref_ID']).rename(columns={'Year':'all_star'})
GK_stats['all_star'] = GK_stats['all_star'].fillna(False).replace([2018,2019,2021,2022],True) 

In [10]:
# Rename Features to useful names
print(FP_stats.columns)
print(GK_stats.columns)
FP_name_dict = {'fbref_ID':'fbref_ID', 'Season':'Season','Start':'Count_Starts', 'Min':'Minutes_Played', 'Gls':'Goals', 'Ast':'Assists', 'PK':'Penalty_Kick_Success', 'PKatt':'Penalty_Kick_Attempts', 'Sh':'Shots',
       'SoT':'Shot_on_Target', 'CrdY':'Yellow_Card', 'CrdR':'Red_Card', 'Touches':'Touches', 'Press':'Press', 'Tkl':'Tackles', 'Int':'Interceptions', 'Blocks':'Blocks', 'xG':'Exp_Goals',
       'npxG':'Expect_None_PK_Goal', 'xA':'Expect_Assist', 'SCA':'Shot_Create_Action', 'GCA':'Goal_Create_Action', 'Cmp':'Complete_Passes', 'Att':'Attempt_Passes', 'Cmp%':'Pass_Complete_Percent', 'Prog':'Progressive_Passes', 'Carries':'Carries',
       'Prog.1':'Progressive_Dribble', 'Succ':'Success_Dribble', 'Att.1':'Attempt_Dribble', 'Win_Loss_Draw':'Win_Loss_Draw', 'all_star':'all_star'}
GK_name_dict = {}
#FP_stats.rename(columns = FP_name_dict, inplace = True)
#GK_stats.rename(columns = GK_name_dict, inplace = True)

Index(['fbref_ID', 'Season', 'Start', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh',
       'SoT', 'CrdY', 'CrdR', 'Touches', 'Press', 'Tkl', 'Int', 'Blocks', 'xG',
       'npxG', 'xA', 'SCA', 'GCA', 'Cmp', 'Att', 'Cmp%', 'Prog', 'Carries',
       'Prog.1', 'Succ', 'Att.1', 'Win_Loss_Draw', 'Captain', 'Games_Played',
       'all_star'],
      dtype='object')
Index(['fbref_ID', 'Season', 'Start', 'Min', 'SoTA', 'GA', 'Saves', 'Save%',
       'CS', 'PSxG', 'PKatt', 'PKA', 'PKsv', 'PKm', 'Cmp', 'Att', 'Cmp%',
       'Att.1', 'Thr', 'Launch%', 'AvgLen', 'Att.2', 'Launch%.1', 'AvgLen.1',
       'Opp', 'Stp', 'Stp%', '#OPA', 'AvgDist', 'Win_Loss_Draw', 'Captain',
       'Games_Played', 'all_star'],
      dtype='object')


# Exploratory Data Analysis

In [11]:
# Address missing data
print(FP_stats.isnull().sum())
print(GK_stats.isnull().sum())
FP_stats = FP_stats.fillna(0)
GK_stats = GK_stats.fillna(0)

FP_stats.info()
GK_stats.info()
FP_stats.describe()

fbref_ID         0
Season           0
Start            0
Min              0
Gls              0
Ast              0
PK               0
PKatt            0
Sh               0
SoT              0
CrdY             0
CrdR             0
Touches          0
Press            0
Tkl              0
Int              0
Blocks           0
xG               1
npxG             1
xA               1
SCA              0
GCA              0
Cmp              0
Att              0
Cmp%             5
Prog             0
Carries          0
Prog.1           0
Succ             0
Att.1            0
Win_Loss_Draw    0
Captain          0
Games_Played     0
all_star         0
dtype: int64
fbref_ID         0
Season           0
Start            0
Min              0
SoTA             0
GA               0
Saves            0
Save%            0
CS               0
PSxG             0
PKatt            0
PKA              0
PKsv             0
PKm              0
Cmp              0
Att              0
Cmp%             0
Att.1            0

Unnamed: 0,Season,Start,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,Att,Cmp%,Prog,Carries,Prog.1,Succ,Att.1,Win_Loss_Draw,Captain,Games_Played
count,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,...,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0,2347.0
mean,2020.161483,9.510013,853.499361,1.356199,0.961227,0.118875,0.152109,12.153387,4.052407,1.845334,...,428.241585,78.035771,32.956966,356.918193,38.73285,9.077972,15.623349,17.654026,0.866638,12.812527
std,1.576657,6.956034,595.911507,2.387469,1.523819,0.506494,0.607813,14.226591,5.603348,1.943013,...,356.303334,10.345797,32.228237,287.236756,37.930375,10.80768,18.475521,11.148442,3.490168,6.693621
min,2018.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2019.0,3.0,297.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,124.0,73.35,8.0,107.5,9.0,2.0,3.0,8.0,0.0,7.0
50%,2021.0,9.0,808.0,0.0,0.0,0.0,0.0,7.0,2.0,1.0,...,344.0,79.225,24.0,301.0,28.0,5.0,9.0,17.0,0.0,14.0
75%,2022.0,16.0,1360.5,2.0,1.0,0.0,0.0,17.0,5.0,3.0,...,666.0,84.433239,49.0,541.0,56.5,13.0,22.0,26.0,0.0,19.0
max,2022.0,25.0,2181.0,24.0,10.0,5.0,6.0,105.0,44.0,11.0,...,1865.0,100.0,218.0,1454.0,292.0,89.0,154.0,51.0,24.0,25.0


In [12]:
# Enhance PK features, Captain, % start
FP_stats['PK_Succ'] = (FP_stats['PK']/FP_stats['PKatt'])
FP_stats['Captain_%'] = (FP_stats['Captain']/FP_stats['Games_Played'])
GK_stats['Captain_%'] = (GK_stats['Captain']/GK_stats['Games_Played'])
FP_stats['Start_%'] = (FP_stats['Start']/FP_stats['Games_Played'])
GK_stats['Start_%'] = (GK_stats['Start']/GK_stats['Games_Played'])

In [13]:
FP_stats.groupby('all_star').mean()

Unnamed: 0_level_0,Season,Start,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,Carries,Prog.1,Succ,Att.1,Win_Loss_Draw,Captain,Games_Played,PK_Succ,Captain_%,Start_%
all_star,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,2020.170591,9.171924,823.411373,1.192359,0.880942,0.091515,0.116393,11.269658,3.700133,1.795646,...,341.838294,36.611728,8.501555,14.633496,17.087517,0.721901,12.557086,0.753416,0.041408,0.641819
True,2019.947917,17.4375,1559.0,5.197917,2.84375,0.760417,0.989583,32.875,12.3125,3.010417,...,710.510417,88.46875,22.59375,38.833333,30.9375,4.260417,18.802083,0.796078,0.21798,0.922794


# Normalize data by season

In [14]:
S22_GK_stats = GK_stats[GK_stats['Season'] == 2022]
S21_GK_stats = GK_stats[GK_stats['Season'] == 2021]
S19_GK_stats = GK_stats[GK_stats['Season'] == 2019]
S18_GK_stats = GK_stats[GK_stats['Season'] == 2018]
S22_FP_stats = FP_stats[FP_stats['Season'] == 2022]
S21_FP_stats = FP_stats[FP_stats['Season'] == 2021]
S19_FP_stats = FP_stats[FP_stats['Season'] == 2019]
S18_FP_stats = FP_stats[FP_stats['Season'] == 2018]

In [15]:
def normalize_FP(df):
    result = df.copy()
    for feature_name in df.columns[np.r_[2:33,34:37]]:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

def normalize_GK(df):
    result = df.copy()
    for feature_name in df.columns[np.r_[2:32,33:35]]:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result
# Is there a way to normalize by season without needing to split DF by season and then reassemble?

In [18]:
# Normalize all features by each season and FP/GK
S22_GK_stats = normalize_GK(S22_GK_stats)
S21_GK_stats = normalize_GK(S21_GK_stats)
S19_GK_stats = normalize_GK(S19_GK_stats)
S18_GK_stats = normalize_GK(S18_GK_stats)
S22_FP_stats = normalize_FP(S22_FP_stats)
S21_FP_stats = normalize_FP(S21_FP_stats)
S19_FP_stats = normalize_FP(S19_FP_stats)
S18_FP_stats = normalize_FP(S18_FP_stats)

# Join DFs back together
GK_stats = pd.concat([S22_GK_stats,S21_GK_stats,S19_GK_stats,S18_GK_stats], ignore_index= True)
FP_stats = pd.concat([S22_FP_stats,S21_FP_stats,S19_FP_stats,S18_FP_stats], ignore_index= True)

In [19]:
# Add Player details
demo = pd.read_csv('C:/Users/scott/OneDrive/BANA/Capstone/Data/player_demographics.csv', encoding = 'iso-8859-1')
FP_stats = FP_stats.merge(demo, how = 'left', on = ['Season','fbref_ID'])
GK_stats = GK_stats.merge(demo, how = 'left', on = ['Season','fbref_ID'])

In [20]:
FP_stats.groupby(['Season','Pos'])['Pos'].count()

Season  Pos 
2018    DF      173
        DFFW     12
        DFMF     20
        FW       79
        FWDF      6
        FWMF     71
        MF       87
        MFDF     15
        MFFW     55
2019    DF      188
        DFFW      6
        DFMF     10
        FW       74
        FWDF      8
        FWMF     84
        MF      103
        MFDF     14
        MFFW     52
2021    DF      217
        DFFW     10
        DFMF     17
        FW       87
        FWDF     11
        FWMF     95
        MF      110
        MFDF     14
        MFFW     65
2022    DF      222
        DFFW     15
        DFMF     20
        FW       94
        FWDF      6
        FWMF    101
        MF      149
        MFDF     12
        MFFW     45
Name: Pos, dtype: int64

In [21]:
# Add Model Indicators Based on role
## FP_stats.groupby(['Pos'])['Pos'].count()
FP_stats['DF_model'] = FP_stats['Pos'].str.contains('DF') == True
FP_stats['MF_model'] = FP_stats['Pos'].str.contains('MF') == True
FP_stats['FW_model'] = FP_stats['Pos'].str.contains('FW') == True

In [22]:
FP_stats.groupby(['FW_model','MF_model','DF_model'])['fbref_ID'].count()
# No players fit in all models, but a large number cross into two
# Especially forwards and midfielders. This is expected based on role. Expect to see similar features of importance for both models

FW_model  MF_model  DF_model
False     False     True        800
          True      False       449
                    True        122
True      False     False       334
                    True         74
          True      False       568
Name: fbref_ID, dtype: int64

In [32]:
# % of players who are all stars
FP_FW = FP_stats[FP_stats['FW_model'] == True]
FP_FW.loc[:, FP_FW.columns != ('fbref_ID','all_star')]
#FP_FW.columns != ('fbref_ID','all_star')

Unnamed: 0,fbref_ID,Season,Start,Min,Gls,Ast,PK,PKatt,Sh,SoT,...,PK_Succ,Captain_%,Start_%,Player,Nation,Pos,Age,DF_model,MF_model,FW_model
1,00b629e1,2022,0.000000,0.050917,0.000000,0.000000,0.0,0.000000,0.022989,0.022727,...,,0.000000,0.000000,Nicolás Mezquida,URU,MFFW,30,False,True,True
8,04cc9edd,2022,0.680000,0.627982,0.062500,0.444444,0.0,0.000000,0.229885,0.159091,...,,0.000000,0.708333,Maikel Chang,CUB,MFFW,31,False,True,True
9,04e195ee,2022,0.360000,0.332110,0.125000,0.000000,0.0,0.000000,0.183908,0.204545,...,,0.000000,0.642857,Indiana Vassilev,USA,FWMF,21,False,True,True
10,058c938c,2022,0.640000,0.709174,0.125000,0.444444,0.0,0.166667,0.505747,0.295455,...,0.0,0.000000,0.695652,Marcelino Moreno,ARG,MFFW,28,False,True,True
14,0672f944,2022,0.000000,0.053670,0.000000,0.000000,0.0,0.000000,0.057471,0.022727,...,,0.000000,0.000000,Ousseni Bouda,BFA,FW,22,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339,fd1f71f5,2018,0.000000,0.054795,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,,0.000000,0.000000,Louis Béland-Goyette,CAN,MFFW,22,False,True,True
2342,fdb15495,2018,0.478261,0.408023,0.000000,0.200000,0.0,0.000000,0.133333,0.088235,...,,0.000000,0.785714,Cristian Martínez,PAN,FW,20,False,False,True
2343,fe81682e,2018,0.869565,0.861057,0.291667,0.500000,0.2,0.333333,0.371429,0.500000,...,0.5,0.050000,1.000000,Diego Fagúndez,URU,MFFW,22,False,True,True
2344,fe81d0d1,2018,0.652174,0.659980,0.166667,0.300000,0.4,0.333333,0.304762,0.205882,...,1.0,0.533333,1.000000,Nicolás Lodeiro,URU,MFFW,28,False,True,True


In [25]:
FP_FW = FP_stats[FP_stats['FW_model'] == True]
FP_FW.columns
X = FP_FW.iloc[:,1:31]
Y = FP_FW['all_star']

best_features = SelectKBest(score_func=chi2, k=3)
fit = best_features.fit(X,Y)
feature_scores = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(fit.scores_)], axis=1)
feature_scores.columns = ['Feature','Score']
feature_scores.sort_values(by = 'Score', ascending= False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
FP_MF = FP_stats[FP_stats['MF_model'] == True]
X = FP_MF.iloc[:,1:31]
Y = FP_MF['all_star']

best_features = SelectKBest(score_func=chi2, k=3)
fit = best_features.fit(X,Y)
feature_scores = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(fit.scores_)], axis=1)
feature_scores.columns = ['Feature','Score']
feature_scores.sort_values(by = 'Score', ascending= False)

In [None]:
FP_DF = FP_stats[FP_stats['DF_model'] == True]
X = FP_DF.iloc[:,1:31]
Y = FP_DF['all_star']

best_features = SelectKBest(score_func=chi2, k=3)
fit = best_features.fit(X,Y)
feature_scores = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(fit.scores_)], axis=1)
feature_scores.columns = ['Feature','Score']
feature_scores.sort_values(by = 'Score', ascending= False)

In [None]:
X = GK_stats.iloc[:,1:30]
Y = GK_stats['all_star']

best_features = SelectKBest(score_func=chi2, k=3)
fit = best_features.fit(X,Y)
feature_scores = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(fit.scores_)], axis=1)
feature_scores.columns = ['Feature','Score']
feature_scores.sort_values(by = 'Score', ascending= False)

# Modeling

### Algorithms to consider
* Random Forest
* Other Ensemble (decision tree, AdaBoost)
* Neural Network
* Logistic (Need variable selection by model)
* SVM
* KNN?

### Notes
* Test/Train Split should needs to be adjusted 
    * What should be my test criteria? Should season be excluded for testing prediction on new season?
    * How can I assure that test partition has sufficient number of allstar records

In [None]:
# Split into Test/Train
fw_train, fw_test, fw_as_train, fw_as_test = train_test_split(FP_FW.iloc[:,2:31], FP_FW['all_star'], test_size=0.2, random_state=1017, stratify=FP_FW['all_star'])
mf_train, mf_test, mf_as_train, mf_as_test = train_test_split(FP_MF.iloc[:,2:31], FP_MF['all_star'], test_size=0.2, random_state=623, stratify=FP_MF['all_star'])
df_train, df_test, df_as_train, df_as_test = train_test_split(FP_DF.iloc[:,2:31], FP_DF['all_star'], test_size=0.2, random_state=225, stratify=FP_DF['all_star'])
gk_train, gk_test, gk_as_train, gk_as_test = train_test_split(GK_stats.iloc[:,2:30], GK_stats['all_star'], test_size=0.2, random_state=511, stratify=GK_stats['all_star'])

In [None]:
print(fw_train.shape, mf_train.shape, df_train.shape, gk_train.shape)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', bootstrap=False)
pprint(clf.get_params())

In [None]:
clf.fit(fw_train, fw_as_train)
y_pred = clf.predict(fw_test)

print('Random Forest FW Accuracy:',metrics.f1_score(fw_as_test, y_pred))

clf.fit(mf_train, mf_as_train)
y_pred = clf.predict(mf_test)

print('Random Forest MF Accuracy:',metrics.f1_score(mf_as_test, y_pred))

clf.fit(df_train, df_as_train)
y_pred = clf.predict(df_test)

print('Random Forest DF Accuracy:',metrics.f1_score(df_as_test, y_pred))

clf.fit(gk_train, gk_as_train)
y_pred = clf.predict(gk_test)

print('Random Forest GK Accuracy:',metrics.f1_score(gk_as_test, y_pred))

#### Random Forest Notes
* Consider running decision tree and evaluate if interpretability is worth accuracy trade off
* CV using cross_val_score and RepeatedStratifiedKFold

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(n_estimators=1000, learning_rate=1)
pprint(ab.get_params())

In [None]:
ab_model = ab.fit(fw_train, fw_as_train)
y_pred = ab.predict(fw_test)
print('AdaBoost FW Accuracy:',metrics.f1_score(fw_as_test, y_pred))

ab_model = ab.fit(mf_train, mf_as_train)
y_pred = ab.predict(mf_test)
print('AdaBoost MF Accuracy:',metrics.f1_score(mf_as_test, y_pred))

ab_model = ab.fit(df_train, df_as_train)
y_pred = ab.predict(df_test)
print('AdaBoost DF Accuracy:',metrics.f1_score(df_as_test, y_pred))

ab_model = ab.fit(gk_train, gk_as_train)
y_pred = ab.predict(gk_test)
print('AdaBoost GK Accuracy:',metrics.f1_score(gk_as_test, y_pred))

### XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error



### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs')

pprint(clf.get_params())

In [None]:
clf.fit(fw_train, fw_as_train)
y_pred = clf.predict(fw_test)
print('NN FW Accuracy:',metrics.f1_score(fw_as_test, y_pred))

clf.fit(mf_train, mf_as_train)
y_pred = clf.predict(mf_test)
print('NN MF Accuracy:',metrics.f1_score(mf_as_test, y_pred))

clf.fit(df_train, df_as_train)
y_pred = clf.predict(df_test)
print('NN DF Accuracy:',metrics.f1_score(df_as_test, y_pred))
# Defender model does not consistently converge

clf.fit(gk_train, gk_as_train)
y_pred = clf.predict(gk_test)
print('NN GK Accuracy:',metrics.f1_score(gk_as_test, y_pred))

# Misc

In [None]:
import sklearn.metrics
from math import sqrt
train_preds = knn_model.predict(fw_test)
mse = mean_squared_error(fw_as_train, train_preds)
rmse = sqrt(mse)
rmse

In [None]:
# Find Optimal K
from sklearn.model_selection import GridSearchCV
parameters = {"n_neighbors": range(1, 50)}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(fw_train, fw_as_train)
print('Ideal K', gridsearch.best_params_)
print('KNN FW Accuracy:',metrics.accuracy_score(fw_as_test, train_preds))

In [None]:
# DFs for individual seasons
S22_FP = FP_stats[FP_stats['Season'] == 2022]
S22_GK = GK_stats[GK_stats['Season'] == 2022]
S21_FP = FP_stats[FP_stats['Season'] == 2021]
S21_GK = GK_stats[GK_stats['Season'] == 2021]
S19_FP = FP_stats[FP_stats['Season'] == 2019]
S19_GK = GK_stats[GK_stats['Season'] == 2019]
S18_FP = FP_stats[FP_stats['Season'] == 2018]
S18_GK = GK_stats[GK_stats['Season'] == 2018]

In [None]:
# Save Stats to file
FP_stats.to_csv('C:/Users/scott/OneDrive/BANA/Capstone/Data/FP_stats.csv')
GK_stats.to_csv('C:/Users/scott/OneDrive/BANA/Capstone/Data/GK_stats.csv')

In [None]:
# Linear Regression
regressor = LinearRegression()
regressor.fit(GK_stats.iloc[:,2:30],GK_stats['all_star'])

In [None]:
print(regressor.coef_)
print(regressor.intercept_)

### Additional Features Needed?

* Update Field Names to be interpretable 
* consider enhancing start feature to be % start or new feature for games played

### Grouping

* most stats should be summed, review to confirm

### Parameters for all star game
* breakdown by position (check year by year)
* primarily used for final conclusions

### Models
* Start with full model that considers all field players. Will always need a separate model for GK due to different set of metrics
* Build sub-models for position groups (fw, mid, def, gk)?
    * Would need separate groupings to handle players that split across positions and roles like 'wing'