In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import fpl_functions as fpl
pd.options.mode.chained_assignment = None
sns.set_theme()

In [2]:
'''
TODO

- add FPL built in ratings for teams done
- prepare data for ML
    - encode player names? maybe not needed
    - add newly promoted boolean
- explore points correlation between players? Is it a better idea to get uncorrelated players, or focus in on correlated sources of points
'''

'\nTODO\n\n- add FPL built in ratings for teams done\n- prepare data for ML\n    - encode player names? maybe not needed\n    - add newly promoted boolean\n- explore points correlation between players? Is it a better idea to get uncorrelated players, or focus in on correlated sources of points\n'

In [3]:
players_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2023-24/gws/merged_gw.csv"
fixtures_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2023-24/fixtures.csv"
teams_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2023-24/teams.csv"

players = pd.read_csv(players_url)
fixtures = pd.read_csv(fixtures_url)
teams = pd.read_csv(teams_url)

In [4]:
players.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW
0,Femi Seriki,DEF,Sheffield Utd,0.5,0,0,0,0,0.0,653,...,0,0.0,0,0,0,0,40,True,0,1
1,Jack Hinshelwood,MID,Brighton,1.5,0,0,0,0,0.0,621,...,4,0.0,0,0,0,0,45,True,0,1
2,Jadon Sancho,MID,Man Utd,3.0,0,0,4,0,11.3,397,...,1,8.0,1,0,0,0,70,True,0,1
3,Rhys Norrington-Davies,DEF,Sheffield Utd,0.1,0,0,0,0,0.0,487,...,0,0.0,0,0,0,0,40,True,0,1
4,Vitaly Janelt,MID,Brentford,2.1,0,0,6,0,11.5,105,...,2,17.0,2,0,0,0,55,True,0,1


In [5]:
players.columns

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'was_home', 'yellow_cards', 'GW'],
      dtype='object')

In [6]:
teams.head().T

Unnamed: 0,0,1,2,3,4
code,3,7,91,94,36
draw,0,0,0,0,0
form,,,,,
id,1,2,3,4,5
loss,0,0,0,0,0
name,Arsenal,Aston Villa,Bournemouth,Brentford,Brighton
played,0,0,0,0,0
points,0,0,0,0,0
position,0,0,0,0,0
short_name,ARS,AVL,BOU,BRE,BHA


In [7]:
teams['name'].values

array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton',
       'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham',
       'Liverpool', 'Luton', 'Man City', 'Man Utd', 'Newcastle',
       "Nott'm Forest", 'Sheffield Utd', 'Spurs', 'West Ham', 'Wolves'],
      dtype=object)

In [8]:
teams['overall_strength'] = teams['strength_overall_home'] + teams['strength_overall_away']
list(teams.sort_values(by='overall_strength', ascending=False)['name'])

['Man City',
 'Arsenal',
 'Liverpool',
 'Aston Villa',
 'Spurs',
 'Newcastle',
 'Man Utd',
 'Chelsea',
 'Brighton',
 'West Ham',
 'Fulham',
 'Brentford',
 'Wolves',
 'Bournemouth',
 'Everton',
 'Crystal Palace',
 "Nott'm Forest",
 'Luton',
 'Burnley',
 'Sheffield Utd']

In [9]:
#make strength tier for teams
teams.loc[teams['name'].isin(['Man City','Arsenal','Liverpool','Aston Villa']), 'tier'] = 4
teams.loc[teams['name'].isin(['Spurs','Newcastle','Man Utd','Chelsea']), 'tier'] = 3
teams.loc[teams['name'].isin(['Brighton','West Ham','Fulham','Brentford']), 'tier'] = 2
teams.loc[teams['name'].isin(['Wolves','Bournemouth','Everton','Crystal Palace']), 'tier'] = 1
teams.loc[teams['name'].isin(["Nott'm Forest",'Luton','Burnley','Sheffield Utd']), 'tier'] = 0

In [10]:
fixtures.drop(['id', 'code', 'finished', 'finished_provisional', 'minutes', 'provisional_start_time', 'started', 'stats', 'pulse_id'], axis=1, inplace=True)
fixtures.rename(columns={'event': 'Gameweek'}, inplace=True)
fixtures['kickoff_date'] = fixtures['kickoff_time'].apply(lambda x: x.split('T')[0])
fixtures['kickoff_time'] = fixtures['kickoff_time'].apply(lambda x: x.split('T')[1].replace('Z', ''))
fixtures = fixtures[['Gameweek', 'kickoff_date', 'kickoff_time', 'team_a', 'team_a_score', 'team_h', 'team_h_score', 'team_h_difficulty', 'team_a_difficulty']]
fixtures['team_a'] = fixtures['team_a'].map(teams.set_index('id').name)
fixtures['team_h'] = fixtures['team_h'].map(teams.set_index('id').name)
fixtures.head()

Unnamed: 0,Gameweek,kickoff_date,kickoff_time,team_a,team_a_score,team_h,team_h_score,team_h_difficulty,team_a_difficulty
0,1,2023-08-11,19:00:00,Man City,3,Burnley,0,5,2
1,1,2023-08-12,12:00:00,Nott'm Forest,1,Arsenal,2,2,5
2,1,2023-08-12,14:00:00,West Ham,1,Bournemouth,1,2,2
3,1,2023-08-12,14:00:00,Luton,1,Brighton,4,2,3
4,1,2023-08-12,14:00:00,Fulham,1,Everton,0,2,2


In [11]:
#before removing players from dataset who don't meet minutes requirements, create DF for expected goals conceded per team, by GW
xgc = pd.DataFrame((players.loc[players['position'] == 'GK'].groupby(['team', 'GW'])['expected_goals_conceded'].sum()).cumsum()).reset_index()
#xgc['expected_goals_conceded'] = np.round(xgc['expected_goals_conceded']/xgc['GW'], 2)

In [12]:
player_minutes = players.groupby('name')['minutes'].sum()
player_minutes.describe()

count     869.000000
mean      861.484465
std      1036.526341
min         0.000000
25%         0.000000
50%       338.000000
75%      1585.000000
max      3420.000000
Name: minutes, dtype: float64

In [13]:
no_min = list(player_minutes[player_minutes == 0].index)
players.drop(players[players['name'].isin(no_min)].index, inplace=True)
player_minutes = players.groupby('name')['minutes'].sum()
below_med = list(player_minutes[player_minutes <= player_minutes.mean()].index)
players.drop(players[players['name'].isin(below_med)].index, inplace=True)

In [14]:
player_minutes = players.groupby('name')['minutes'].sum()
player_minutes.describe()

count     267.000000
mean     2250.865169
std       610.802532
min      1310.000000
25%      1703.000000
50%      2208.000000
75%      2778.000000
max      3420.000000
Name: minutes, dtype: float64

In [15]:
players.drop('round', axis=1, inplace=True)
players = players.merge(teams[['name', 'tier']].rename(columns={'name': 'team', 'tier': 'team_tier'}), how='left', on='team')
players['opponent_team'] = players['opponent_team'].map(teams.set_index('id').name)
players = players.merge(xgc.rename(columns={'team': 'opponent_team', 'expected_goals_conceded': 'opp_xgc'}), how='left', left_on=['opponent_team', 'GW'], right_on=['opponent_team', 'GW'])
#players['FDR'] = players.apply(lambda x: fpl.single_fixture_by_team(fixtures, x['team'], x['GW']).loc[fpl.single_fixture_by_team(fixtures, x['team'], x['GW'])['opponent'] == x['opponent_team']]['FDR'].iloc[-1], axis=1)
p1 = players.loc[players['was_home'] == True].merge(teams[['name', 'strength_attack_away', 
                                                           'strength_defence_away', 'tier']].rename(columns={'name': 'opponent_team', 
                                                           'strength_attack_away': 'opponent_attack_strength',
                                                           'strength_defence_away': 'opponent_defence_strength',
                                                           'tier': 'opponent_tier'}), how='left', on='opponent_team')
p2 = players.loc[players['was_home'] == False].merge(teams[['name', 'strength_attack_home',
                                                            'strength_defence_home']].rename(columns={'name': 'opponent_team', 
                                                            'strength_attack_home': 'opponent_attack_strength',
                                                            'strength_defence_home': 'opponent_defence_strength',
                                                            'tier': 'opponent_tier'}), how='left', on='opponent_team')
players2 = pd.concat([p1, p2])

In [16]:
players2.columns

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'saves', 'selected', 'starts', 'team_a_score', 'team_h_score', 'threat',
       'total_points', 'transfers_balance', 'transfers_in', 'transfers_out',
       'value', 'was_home', 'yellow_cards', 'GW', 'team_tier', 'opp_xgc',
       'opponent_attack_strength', 'opponent_defence_strength',
       'opponent_tier'],
      dtype='object')

In [17]:
players_slim = players2.drop(['xP', 'creativity', 'element', 'fixture', 'ict_index', 'influence', 'selected', 'threat', 'transfers_balance', 
                             'transfers_in', 'transfers_out', 'team_a_score', 'team_h_score'], axis=1)

In [18]:
players_slim['goal_involvements'] = players_slim['goals_scored'] + players_slim['assists']
players_slim['value'] = players_slim['value']/10

#boolean for whether the opponents team is a newly promoted side
players_slim['promoted'] = players_slim['opponent_team'].apply(lambda x: 1 if x in ['Luton, Sheffield Utd', 'Burnley'] else 0)

#create expected goals and goal involvements vs actual goals and goal involvements columns
players_slim['g_vs_xg'] = players_slim['goals_scored'] - players_slim['expected_goals']
players_slim['gi_vs_xgi'] = players_slim['goal_involvements'] - players_slim['expected_goal_involvements']
players_slim['gc_vs_xgc'] = players_slim['goals_conceded'] - players_slim['expected_goals_conceded']

#weighted columns
players_slim['weighted_points'] = players_slim['total_points']*(players_slim['opponent_defence_strength'] + players_slim['opponent_defence_strength'])/2
players_slim['weighted_xgi'] = players_slim['expected_goal_involvements']*players_slim['opponent_defence_strength']

In [19]:
players_slim.columns

Index(['name', 'position', 'team', 'assists', 'bonus', 'bps', 'clean_sheets',
       'expected_assists', 'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored',
       'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'starts',
       'total_points', 'value', 'was_home', 'yellow_cards', 'GW', 'team_tier',
       'opp_xgc', 'opponent_attack_strength', 'opponent_defence_strength',
       'opponent_tier', 'goal_involvements', 'promoted', 'g_vs_xg',
       'gi_vs_xgi', 'gc_vs_xgc', 'weighted_points', 'weighted_xgi'],
      dtype='object')

In [20]:
# predict next few weeks, rather than single week

data_col = ['assists', 'bonus', 'bps', 'clean_sheets', 'expected_assists', 'expected_goal_involvements', 'expected_goals', 
               'expected_goals_conceded', 'goals_conceded', 'goals_scored', 'minutes', 'own_goals','penalties_missed', 'penalties_saved', 
               'red_cards', 'saves', 'starts', 'total_points', 'was_home', 'yellow_cards', 'opp_xgc', 'opponent_attack_strength', 'opponent_defence_strength',
               'goal_involvements', 'promoted', 'g_vs_xg', 'gi_vs_xgi', 'gc_vs_xgc', 'weighted_points', 'weighted_xgi']

d1 = dict.fromkeys(data_col, 'mean')
gw = 6
full_data = pd.DataFrame()
while gw <= 33:
    temp = players_slim.loc[players_slim['GW'].isin(range(gw-5, gw))]
    prev_weeks = temp.groupby('name').agg({'position': 'first', 'team': 'first', 'team_tier': 'first', **d1, 'value': 'first', 'total_points':['mean', 'sum', 'std']}).round(2)
    prev_weeks.rename(columns={'total_points': 'last_weeks_points'}, inplace=True)
    prev_weeks['GW'] = gw
    p_cols = ['position', 'team', 'team_tier', 'assists', 'bonus', 'bps', 'clean_sheets',
       'expected_assists', 'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored', 'minutes',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'saves', 'starts', 'last_week_points_mean', 'last_week_points_sum',
       'last_week_points_std', 'was_home', 'yellow_cards', 'opp_xgc', 'opponent_attack_strength', 
       'opponent_defense_strength', 'goal_involvements', 'promoted',
       'g_vs_xg', 'gi_vs_xgi', 'gc_vs_xgc', 'weighted_points', 'weighted_xgi',
       'value', 'GW']
    prev_weeks.set_axis(p_cols, axis=1, inplace=True)
    prev_weeks.reset_index(inplace=True)
    temp2 = players_slim.loc[players_slim['GW'].isin(range(gw, gw+5))][['name', 'total_points', 'was_home', 'opponent_attack_strength', 'opponent_defence_strength', 'opp_xgc']]
    next_weeks = temp2.groupby('name').agg({'opponent_attack_strength': 'mean', 'opponent_defence_strength': 'mean', 'was_home': 'sum', 'total_points': 'sum', 'opp_xgc': 'mean'}).round(2)
    next_weeks.rename(columns={'total_points': 'next_weeks_points', 'was_home': 'is_home', 'opponent_attack_strength': 'nw_opp_att', 'opponent_defence_strength': 'nw_opp_def', 'opp_xgc': 'nw_opp_xgc'}, inplace=True)
    next_weeks['is_home'] = next_weeks['is_home'].astype(int)
    temp2 = prev_weeks.merge(next_weeks, on='name', how='left')
    full_data = pd.concat([full_data, temp2])
    gw+=1

In [21]:
full_data['ppm'] = round(full_data['last_week_points_sum']/full_data['value'], 2)

full_data['points_per_minute'] = full_data['last_week_points_sum']/full_data['minutes']
full_data['xg_per90'] = full_data['expected_goals']/full_data['minutes']*90
full_data['xa_per90'] = full_data['expected_assists']/full_data['minutes']*90
full_data['xgi_per90'] = full_data['expected_goal_involvements']/full_data['minutes']*90
full_data['xgc_per90'] = full_data['expected_goals_conceded']/full_data['minutes']*90

#expected goal involvements per million; could this be a good way to determine which players are "undervalued"?
full_data['xgi_pm'] = round(full_data['expected_goal_involvements']/full_data['value'], 2)
full_data['xg_pm'] = round(full_data['expected_goals']/full_data['value'], 2)
full_data['xa_pm'] = round(full_data['expected_assists']/full_data['value'], 2)

In [22]:
full_data.loc[full_data['position'] == 'GK'].corr()['next_weeks_points'].sort_values(ascending=False).head(15)

next_weeks_points             1.000000
starts                        0.414599
minutes                       0.403624
bps                           0.354542
last_week_points_mean         0.302138
weighted_points               0.298569
last_week_points_sum          0.288478
ppm                           0.274843
last_week_points_std          0.252132
weighted_xgi                  0.250346
clean_sheets                  0.245395
expected_goals_conceded       0.227205
expected_assists              0.227016
expected_goal_involvements    0.227016
saves                         0.222328
Name: next_weeks_points, dtype: float64

In [23]:
full_data.loc[full_data['position'] == 'GK'].corr()['next_weeks_points'].sort_values(ascending=True).head(15)

xgc_per90                   -0.129841
opponent_defense_strength   -0.114495
opponent_attack_strength    -0.110655
GW                          -0.086688
penalties_saved             -0.072381
gc_vs_xgc                   -0.059964
nw_opp_att                  -0.054862
was_home                    -0.046351
gi_vs_xgi                   -0.012567
nw_opp_def                  -0.005416
assists                      0.037118
goal_involvements            0.037118
own_goals                    0.050242
nw_opp_xgc                   0.050256
points_per_minute            0.055605
Name: next_weeks_points, dtype: float64

In [24]:
full_data.loc[full_data['position'] == 'DEF'].corr()['next_weeks_points'].sort_values(ascending=False).head(15)

next_weeks_points             1.000000
bps                           0.255438
last_week_points_mean         0.242292
weighted_points               0.242202
last_week_points_sum          0.230067
minutes                       0.204568
expected_goal_involvements    0.198220
value                         0.195961
starts                        0.194427
ppm                           0.193834
expected_assists              0.192266
weighted_xgi                  0.190579
team_tier                     0.186078
clean_sheets                  0.180057
xgi_pm                        0.179846
Name: next_weeks_points, dtype: float64

In [25]:
full_data.loc[full_data['position'] == 'DEF'].corr()['next_weeks_points'].sort_values(ascending=True).head(15)

nw_opp_att                  -0.102656
xgc_per90                   -0.093335
nw_opp_def                  -0.077129
opponent_defense_strength   -0.069215
opponent_attack_strength    -0.050617
gc_vs_xgc                   -0.019097
GW                          -0.008825
g_vs_xg                     -0.006431
points_per_minute           -0.002094
own_goals                   -0.001992
was_home                     0.004888
red_cards                    0.016771
opp_xgc                      0.018353
yellow_cards                 0.028043
goals_conceded               0.039174
Name: next_weeks_points, dtype: float64

In [26]:
full_data.loc[full_data['position'] == 'MID'].corr()['next_weeks_points'].sort_values(ascending=False).head(15)

next_weeks_points             1.000000
expected_goal_involvements    0.413487
weighted_xgi                  0.409611
last_week_points_mean         0.378452
weighted_points               0.378345
expected_assists              0.360003
xgi_pm                        0.359951
last_week_points_sum          0.356574
goal_involvements             0.349589
expected_goals                0.344934
value                         0.335984
bps                           0.329461
goals_scored                  0.308443
xg_pm                         0.293666
xa_pm                         0.290316
Name: next_weeks_points, dtype: float64

In [27]:
full_data.loc[full_data['position'] == 'MID'].corr()['next_weeks_points'].sort_values(ascending=True).head(15)

xgc_per90                   -0.117212
nw_opp_att                  -0.080124
nw_opp_def                  -0.059787
red_cards                   -0.047084
gc_vs_xgc                   -0.039279
opponent_defense_strength   -0.018982
GW                          -0.007410
opponent_attack_strength    -0.007236
yellow_cards                -0.006236
opp_xgc                     -0.005830
was_home                     0.003722
penalties_missed             0.015589
own_goals                    0.016010
promoted                     0.017263
nw_opp_xgc                   0.038181
Name: next_weeks_points, dtype: float64

In [28]:
full_data.loc[full_data['position'] == 'FWD'].corr()['next_weeks_points'].sort_values(ascending=False).head(15)

next_weeks_points             1.000000
minutes                       0.262139
last_week_points_mean         0.255927
expected_goal_involvements    0.254019
weighted_points               0.253795
weighted_xgi                  0.251655
value                         0.243500
expected_goals                0.235226
goal_involvements             0.226897
bps                           0.215866
last_week_points_sum          0.211173
starts                        0.201418
goals_scored                  0.201080
xgi_pm                        0.200492
is_home                       0.182421
Name: next_weeks_points, dtype: float64

In [29]:
full_data.loc[full_data['position'] == 'FWD'].corr()['next_weeks_points'].sort_values(ascending=True).head(15)

xgc_per90                   -0.120698
nw_opp_att                  -0.088908
red_cards                   -0.068580
points_per_minute           -0.043819
nw_opp_def                  -0.022911
opponent_attack_strength    -0.014472
opponent_defense_strength    0.000601
g_vs_xg                      0.005193
opp_xgc                      0.012150
xa_per90                     0.012568
GW                           0.044156
yellow_cards                 0.064135
gc_vs_xgc                    0.065641
gi_vs_xgi                    0.070104
was_home                     0.074668
Name: next_weeks_points, dtype: float64

In [30]:
full_data.columns

Index(['name', 'position', 'team', 'team_tier', 'assists', 'bonus', 'bps',
       'clean_sheets', 'expected_assists', 'expected_goal_involvements',
       'expected_goals', 'expected_goals_conceded', 'goals_conceded',
       'goals_scored', 'minutes', 'own_goals', 'penalties_missed',
       'penalties_saved', 'red_cards', 'saves', 'starts',
       'last_week_points_mean', 'last_week_points_sum', 'last_week_points_std',
       'was_home', 'yellow_cards', 'opp_xgc', 'opponent_attack_strength',
       'opponent_defense_strength', 'goal_involvements', 'promoted', 'g_vs_xg',
       'gi_vs_xgi', 'gc_vs_xgc', 'weighted_points', 'weighted_xgi', 'value',
       'GW', 'nw_opp_att', 'nw_opp_def', 'is_home', 'next_weeks_points',
       'nw_opp_xgc', 'ppm', 'points_per_minute', 'xg_per90', 'xa_per90',
       'xgi_per90', 'xgc_per90', 'xgi_pm', 'xg_pm', 'xa_pm'],
      dtype='object')

In [31]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7407 entries, 0 to 266
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        7407 non-null   object 
 1   position                    7407 non-null   object 
 2   team                        7407 non-null   object 
 3   team_tier                   7407 non-null   float64
 4   assists                     7407 non-null   float64
 5   bonus                       7407 non-null   float64
 6   bps                         7407 non-null   float64
 7   clean_sheets                7407 non-null   float64
 8   expected_assists            7407 non-null   float64
 9   expected_goal_involvements  7407 non-null   float64
 10  expected_goals              7407 non-null   float64
 11  expected_goals_conceded     7407 non-null   float64
 12  goals_conceded              7407 non-null   float64
 13  goals_scored                7407 n

In [32]:
full_data.replace([np.inf, -np.inf], np.nan, inplace=True)
full_data.fillna(0, inplace=True)
#full_data.drop('name', axis=1, inplace=True)

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from category_encoders.cat_boost import CatBoostEncoder

X = full_data.drop('next_weeks_points', axis=1).reset_index(drop=True)
y = full_data['next_weeks_points'].reset_index(drop=True)

cat_col = ['position', 'team']
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
one_hot_encoded = enc.fit_transform(X[cat_col])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=enc.get_feature_names_out(cat_col)).reset_index(drop=True)
cat_enc = CatBoostEncoder()
cat_encoded = cat_enc.fit_transform(X['name'], y)
cat_enc_df = pd.DataFrame(cat_encoded, columns=cat_enc.get_feature_names_out('name')).reset_index(drop=True)

df_encoded = pd.concat([X, one_hot_df, cat_enc_df], axis=1)
df_encoded.drop(cat_col+['name'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df_encoded, y, test_size=0.2, random_state=442)

In [34]:
from sklearn.preprocessing import StandardScaler

std_scaler_train = StandardScaler()
std_scaler_test = StandardScaler()

X_train_s = std_scaler_train.fit_transform(X_train)
X_test_s = std_scaler_train.fit_transform(X_test)

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

model = LinearRegression()
model.fit(X_train_s, y_train)
pred = model.predict(X_test_s)
rmse = np.sqrt(MSE(y_test, pred))
mae = MAE(y_test, pred)
print("RMSE : % f" %(rmse)) 
print("MAE : % f" %(mae))

RMSE :  8.199869
MAE :  6.352275


In [36]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.1, 0.05, 0.01],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1],
    'n_estimators': [500, 1000, 1500],
    'lambda': [1, 3, 5]
}

reg = xgb.XGBRegressor()
grid_search = GridSearchCV(reg, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=3)
grid_search.fit(X_train, y_train)
print("Best score: ", grid_search.best_score_)
print("Best set of hyperparameters: ", grid_search.best_params_)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV 1/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, n_estimators=500, subsample=0.5;, score=-5.739 total time=   0.3s
[CV 2/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, n_estimators=500, subsample=0.5;, score=-5.508 total time=   0.3s
[CV 3/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, n_estimators=500, subsample=0.5;, score=-5.814 total time=   0.3s
[CV 4/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, n_estimators=500, subsample=0.5;, score=-5.891 total time=   0.3s
[CV 5/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, n_estimators=500, subsample=0.5;, score=-5.712 total time=   0.3s
[CV 1/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, n_estimators=500, subsample=0.75;, score=-5.635 total time=   0.3s
[CV 2/5] END colsample_bytree=0.5, lambda=1, learning_rate=0.1, max_depth=3, 

In [37]:
#reg.fit(X_train_s, y_train)
#pred = reg.predict(X_test_s)
pred = grid_search.predict(X_test_s) 
rmse = np.sqrt(MSE(y_test, pred))
mae = MAE(y_test, pred)
print("RMSE : % f" %(rmse))
print("MAE : % f" %(mae))

RMSE :  8.981918
MAE :  7.058685


In [38]:
'''prediction_test = X_test.copy()
prediction_test['y'] = y_test
prediction_test['pred'] = pred'''

"prediction_test = X_test.copy()\nprediction_test['y'] = y_test\nprediction_test['pred'] = pred"

In [39]:
import pickle
file_name = "xgb_reg.pkl"

# save
pickle.dump(grid_search, open(file_name, "wb"))