In [14]:
import pandas as pd
import re
import pmdarima as pm
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

### load and merge data

In [15]:
# Load the 2018/19 season data from the CSV export.
season_18_df = pd.read_csv(filepath_or_buffer='../data/2018_19_current_season_data.csv')

In [16]:
# Load the 2017/18 season player stats from the Excel workbook.
season_17_df = pd.read_excel(io='../data/FPL-201718-player-stats.xlsx')

In [17]:
def camelcase_columns(df):
    """Normalise the column labels of ``df`` in place and return the same frame.

    Despite the name, the result is snake_case: each label has its
    whitespace runs (including newlines) collapsed to one space, is stripped
    and lower-cased, then spaces become underscores and '£' becomes 'p'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string column labels are rewritten (mutated in place).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its ``columns`` replaced.
    """
    def _normalise(label):
        # Collapse any whitespace run (optionally led by a newline) to one space.
        collapsed = re.sub(r'(\n)?(\s)+', ' ', label)
        tidy = collapsed.strip().lower()
        return tidy.replace(' ', '_').replace('£', 'p')

    df.columns = list(map(_normalise, df.columns))
    return df

In [18]:
# Give both season frames the same normalised column-label scheme.
season_18_df, season_17_df = (
    camelcase_columns(season_18_df),
    camelcase_columns(season_17_df),
)

In [19]:
# Inspect the normalised 2017 column labels.
season_17_df.columns

Index(['gw_gameweek', 'opp_opposition', 'pts', 'mpminutes_played',
       'gsgoals_scored', 'aassists', 'csclean_sheets', 'gcgoals_conceded',
       'ogown_goals', 'pspenalties_saved', 'pmpenalties_missed',
       'ycyellow_cards', 'rcred_cards', 'ssaves', 'bbonus',
       'bpsbonus_points_system', 'iinfluence', 'ccreativity', 'tthreat',
       'iiict_index', 'nt_net_transfers', 'sb_selected_by', 'p_value', 'name',
       'team', 'position', 'total_score_rank'],
      dtype='object')

In [20]:
# Tag every row with its season so the two frames can be told apart once combined.
season_17_df['season'], season_18_df['season'] = 2017, 2018

In [21]:
# Stack the two seasons into one frame (2017 rows first). DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the equivalent call.
# Original row indices are kept, as they were with append.
combined_df = pd.concat([season_17_df, season_18_df], sort=False)

In [22]:
# Build a "<name>-<position>" key per row. Bracket access is used so the
# columns cannot be confused with DataFrame attributes of the same name.
combined_df['player_id'] = combined_df['name'] + '-' + combined_df['position']

Some `player_id` values appear to be duplicated (different players can share the same name and position) :( - I'll ignore this for now.

In [62]:
# Strip the '£' prefix and convert prices to numbers.
# The dtype guard makes the cell safe to re-run: once p_value is numeric the
# .str accessor would raise AttributeError. regex=False treats '£' literally.
if not pd.api.types.is_numeric_dtype(combined_df['p_value']):
    combined_df['p_value'] = pd.to_numeric(
        combined_df['p_value'].str.replace('£', '', regex=False)
    )

In [64]:
# Number of non-null 'pts' rows per (player_id, season).
# rename() needs columns=...; a bare mapping targets index labels, which left
# the 'pts' column un-renamed.
game_counts_df = (
    combined_df
    .groupby(['player_id', 'season'])[['pts']]
    .count()
    .rename(columns={'pts': 'count'})
    .reset_index()
)

In [65]:
season_17_players = set(combined_df.loc[combined_df.season==2018,'player_id'])

In [66]:
season_18_players = set(combined_df.loc[combined_df.season==2017,'player_id'])

In [67]:
# How many player ids appear in both seasons.
len(season_17_players & season_18_players)

336

In [68]:
def fit_autoarima(data, player_id, plot_model = False):
    """Fit a non-seasonal auto-ARIMA to one player's 2017 points and forecast 2018.

    The model is trained on the player's ``pts`` values from season 2017 and
    forecasts as many steps as the player has season-2018 rows with
    ``gw_gameweek < 11``; those rows are used as the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``player_id``, ``season``, ``gw_gameweek``
        and ``pts``. Rows are used in the order they appear in ``data``.
    player_id : str
        Value to match against ``data.player_id``.
    plot_model : bool, default False
        When True, print the test RMSE and plot the training series, the
        forecasts with their confidence band, and the observed test points.

    Returns
    -------
    float
        The first forecast value (one step ahead of the training series).

    Raises
    ------
    ValueError
        If the player has no 2017 training rows or no early-2018 test rows.
    """
    is_player = data.player_id == player_id
    train = data.loc[is_player & (data.season == 2017), 'pts']
    test = data.loc[is_player &
                    (data.season == 2018) &
                    (data.gw_gameweek < 11), 'pts']

    # Fail early with a clear message instead of an opaque error from the
    # model fit or from indexing an empty forecast.
    if train.empty:
        raise ValueError(f"no 2017 training data for player {player_id!r}")
    if test.empty:
        raise ValueError(f"no 2018 test data (gameweek < 11) for player {player_id!r}")

    # Fit a simple auto_arima model.
    # NOTE(review): D / max_D are seasonal-differencing settings; with
    # seasonal=False they presumably have no effect - confirm and drop if so.
    modl = pm.auto_arima(train, seasonal=False,
                         stepwise=True, suppress_warnings=True, D=10, max_D=10,
                         error_action='ignore')

    # Create predictions for the future, evaluate on test.
    n_test = len(test)
    preds, conf_int = modl.predict(n_periods=n_test, return_conf_int=True)
    # predict() may return pandas objects depending on the pmdarima version;
    # coerce to arrays so preds[0] and conf_int[:, 0] are positional lookups.
    preds = np.asarray(preds)
    conf_int = np.asarray(conf_int)

    if plot_model:
        rmse = np.sqrt(mean_squared_error(test, preds))
        print(f"Test RMSE: {player_id} {rmse:.3f}")

        n_train = len(train)
        train_idx = np.arange(1, n_train + 1)
        test_idx = np.arange(n_train + 1, n_train + n_test + 1)

        plt.plot(train_idx, train, alpha=0.75)
        plt.plot(test_idx, preds, alpha=0.75)  # Forecasts
        plt.scatter(test_idx, test,
                    alpha=0.4, marker='x')  # Test data
        plt.fill_between(test_idx,
                         conf_int[:, 0], conf_int[:, 1],
                         alpha=0.1, color='b')
        plt.title(f"Points forecast - {player_id}")
        plt.xlabel("Gameweek")
        plt.show()
    return preds[0]

In [69]:
# Fit one model per player present in both seasons and collect the
# one-step-ahead forecasts column-wise, ready for pd.DataFrame().
models = {'player_id':[],
         'prediction':[]}
for p in season_17_players.intersection(season_18_players):
    print(p)
    try:
        prediction = fit_autoarima(combined_df, p)
    except Exception:
        # Best-effort: skip players whose model cannot be fitted. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # still stop this long-running loop.
        print(f'player {p} failed')
    else:
        # Append both values together so the two lists always stay aligned.
        models['prediction'].append(prediction)
        models['player_id'].append(p)

van Aanholt-DEF
Hendrick-MID
Hogg-MID
Holebas-DEF
Simpson-DEF
Mavropanos-DEF
Cech-GKP
Kenny-DEF
McArthur-MID
Pickford-GKP
James-MID
Otamendi-DEF
Wood-FWD
Manquillo-DEF
Lössl-GKP
Stones-DEF
Milivojevic-MID
Coleman-DEF
Walcott-MID
Ki Sung-yueng-MID
Darmian-DEF
Clark-DEF
Firmino-FWD
Mané-MID
Atsu-MID
Crouch-FWD
player Crouch-FWD failed
McTominay-MID
Dummett-DEF
Elneny-MID
Mendy-DEF
Calvert-Lewin-FWD
Souaré-DEF
Cresswell-DEF
Koscielny-DEF
Pogba-MID
Bruno-DEF
Sissoko-MID
Gueye-MID
Stankovic-DEF
Lloris-GKP




Walker-DEF
Darlow-GKP
Billing-MID
Iheanacho-FWD
Mustafi-DEF
Redmond-MID
Stephens-MID
Kompany-DEF
Cahill-DEF
Smalling-DEF
Herrera-MID
Lanzini-MID
Rice-DEF
Barnes-MID
player Barnes-MID failed
Begovic-GKP
Jesus-FWD
Okazaki-FWD
Ward-GKP
Son-MID




Bellerín-DEF
Sánchez-DEF
Surman-MID
Antonio-MID
Lukaku-FWD
Mooy-MID
Smith-DEF
Azpilicueta-DEF
Sinclair-FWD
Hart-GKP
Jakupovic-GKP
Hamer-GKP
Trippier-DEF
Bravo-GKP
Maitland-Niles-MID
Riedewald-DEF
Choudhury-MID
Pedro-MID
Holding-DEF
Aké-DEF
Tomkins-DEF
Salah-MID
Emerson-DEF
Llorente-FWD
Adrián-GKP
Shelvey-MID
Zouma-DEF
Yoshida-DEF
Davies-DEF
Löwe-DEF
Vorm-GKP
Mawson-DEF
Mee-DEF
Dubravka-GKP
Özil-MID
Ampadu-MID
Hennessey-GKP
Baines-DEF
Ramsey-MID
Clyne-DEF
Murray-FWD
Mousset-FWD
Kane-FWD
Zabaleta-DEF
Ings-FWD
Ogbonna-DEF
Coleman-GKP
player Coleman-GKP failed
Pereyra-MID




Jones-MID
player Jones-MID failed
Tarkowski-DEF
Keane-DEF
Nketiah-FWD
player Nketiah-FWD failed
Stanislas-MID
Wilshere-MID
Gray-MID
Valencia-DEF
Fabianski-GKP
Deeney-FWD
Lallana-MID
Lennon-MID
Austin-FWD
Ward-Prowse-MID
Cork-MID
Joselu-FWD
Yedlin-DEF
Ederson-GKP
Daniels-DEF
Tosun-FWD
Cathcart-DEF
Deulofeu-MID
Prödl-DEF
Zinchenko-MID
Capoue-MID
March-MID
Mahrez-MID
Gomes-GKP
Schneiderlin-MID
Moreno-DEF
Suttner-DEF
Eriksen-MID
Johnson-DEF
player Johnson-DEF failed
Wilson-FWD
Matip-DEF
Bardsley-DEF
Schindler-DEF
Diangana-MID
Kolasinac-DEF
Kenedy-MID




Davies-MID
Wickham-FWD
Lamela-MID




Britos-DEF
Molumby-MID
player Molumby-MID failed
Rüdiger-DEF
Alexander-Arnold-DEF




Monreal-DEF
Rose-DEF
Bailly-DEF
David Silva-MID
Sims-MID
Lovren-DEF
Hazard-MID
Forster-GKP
Brady-MID
Adam Smith-DEF
Boruc-GKP
Knockaert-MID




Mata-MID
Cleverley-MID
Barkley-MID
Janmaat-DEF
Gazzaniga-GKP
Chilwell-DEF
Christensen-DEF
Stekelenburg-GKP
Lucas Moura-MID
Fosu-Mensah-DEF




Pröpper-MID
Fernandinho-MID
Mignolet-GKP
Hudson-Odoi-MID
Defour-MID
Foyth-DEF
player Foyth-DEF failed
Pritchard-MID
Alderweireld-DEF




Aubameyang-FWD
Chalobah-MID
Dunk-DEF
Agüero-FWD
Danilo-DEF
Solanke-FWD
Dier-MID
Giroud-FWD
Taylor-DEF
Alli-MID
David Luiz-DEF
De Bruyne-MID
Benteke-FWD
Speroni-GKP
Sané-MID
Francis-DEF
Højbjerg-MID
Fuchs-DEF
Sterling-MID
Kabasele-DEF
Diamé-MID
Vardy-FWD
Foster-GKP
Iwobi-MID
Maguire-DEF
McCarthy-GKP
Wanyama-MID
Chamberlain-MID
Schelotto-DEF
Richarlison-MID
Pope-GKP
Willian-MID
Lewis Cook-MID
Mounie-FWD
Lacazette-FWD
Long-FWD
Chicharito-FWD
Obafemi-FWD
player Obafemi-FWD failed
Robertson-DEF
Sabiri-MID
Laporte-DEF
Schlupp-DEF
Murphy-MID
Henderson-MID
Kanté-MID
Lindelöf-DEF
Fraser-MID
Wijnaldum-MID
Jagielka-DEF
Gomez-DEF
Shaqiri-MID
Walters-MID
Ryan-GKP
Oriol Romeu-MID
Reid-DEF
Bernardo Silva-MID
Winks-MID
Stephens-DEF
Rojo-DEF
Welbeck-FWD
Kongolo-DEF
Quaner-FWD
Sigurdsson-MID
Willock-MID
player Willock-MID failed
Steve Cook-DEF
Hadergjonaj-DEF
Lascelles-DEF
Xhaka-MID
Grant-GKP
Barnes-FWD
Lindegaard-GKP
Romero-GKP




Gosling-MID
Bednarek-DEF
Loftus-Cheek-MID
Pérez-FWD
Shaw-DEF
Kelly-DEF
Heaton-GKP
Sako-MID
Gray-FWD
Butcher-MID




player Butcher-MID failed
Townsend-MID
Ibe-MID
Johnson-MID




player Johnson-MID failed
Kayal-MID
Lingard-MID
Quina-MID
Doucouré-MID




McCarthy-MID
Aurier-DEF
Duffy-DEF
Obiang-MID
Whiteman-GKP
Vertonghen-DEF




Depoitre-FWD
Sakho-DEF
Puncheon-MID
Izquierdo-MID
Bertrand-DEF
Locadia-FWD
Bong-DEF
Niasse-FWD
Camacho-MID
player Camacho-MID failed
Fernández-DEF
Foden-MID




Mkhitaryan-MID
Jones-DEF
Albrighton-MID
Westwood-MID
Sturridge-FWD
Rondón-FWD
Mariappa-DEF
Noble-MID
Amartey-MID
Hughes-MID
Gündogan-MID
Zappacosta-DEF
Sánchez-MID
Williams-MID
Gudmundsson-MID
Caballero-GKP
Carroll-FWD
King-FWD
Taylor-MID




player Taylor-MID failed
Long-DEF
Drinkwater-MID
Walker-Peters-DEF
Kouyaté-MID
Matic-MID
Morgan-DEF
van Dijk-DEF
Lemina-MID
Kiko Femenía-DEF
Hyndman-MID
Zanka-DEF
Martial-MID
Lowton-DEF
Chambers-DEF
Elliot-GKP
Evans-DEF
Dann-DEF
Groß-MID
Rashford-FWD
Ndidi-MID
Ward-DEF
Lejeune-DEF
Arter-MID
Schmeichel-GKP
Hayden-MID
Ritchie-MID
Alonso-DEF


In [70]:
# One row per successfully modelled player: player_id and its forecast.
predictions = pd.DataFrame.from_dict(models)

In [71]:
# Five highest forecasts.
predictions.sort_values('prediction', ascending=False).head(5)

Unnamed: 0,player_id,prediction
0,van Aanholt-DEF,14.353512
70,Bravo-GKP,11.047348
249,Pérez-FWD,8.509688
78,Salah-MID,7.973684
77,Tomkins-DEF,6.908856


In [72]:
# Gameweek-1 rows of the 2018 season, reduced to the first row per player_id:
# the player's price, team and position at that point.
starting_costs = (
    combined_df
    .loc[(combined_df.gw_gameweek == 1) & (combined_df.season == 2018),
         ['player_id', 'p_value', 'team', 'position']]
    .drop_duplicates(subset='player_id')
)

In [73]:
# Keep only players that have both a gameweek-1 price and a forecast.
candidates = pd.merge(starting_costs, predictions, how='inner', on='player_id')

In [74]:
# Rank candidates by forecast, best first. Rebinding instead of inplace=True
# avoids mutating a frame in place across cells; the resulting frame is the same.
candidates = candidates.sort_values(by='prediction', ascending=False)

In [75]:
# Running total of p_value going down the current row order of candidates.
candidates['p_value_sum'] = candidates['p_value'].cumsum()

In [76]:
# Show the first 15 candidates with their cumulative cost.
candidates.head(n=15)

Unnamed: 0,player_id,p_value,team,position,prediction,p_value_sum
36,van Aanholt-DEF,5.5,CRY,DEF,14.353512,5.5
309,Bravo-GKP,5.0,MCI,GKP,11.047348,10.5
35,Pérez-FWD,6.5,NEW,FWD,8.509688,17.0
0,Salah-MID,13.0,LIV,MID,7.973684,30.0
101,Tomkins-DEF,4.5,CRY,DEF,6.908856,34.5
23,Kane-FWD,12.5,TOT,FWD,6.412367,47.0
216,Lejeune-DEF,4.5,NEW,DEF,6.412039,51.5
17,Milivojevic-MID,6.5,CRY,MID,6.274975,58.0
2,Sterling-MID,11.0,MCI,MID,6.026316,69.0
40,Wood-FWD,6.5,BUR,FWD,5.823031,75.5
