# ML Pipeline

> This pipeline learns from the outcomes of individual matches to predict which players will make it to the round of 16

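- The model trains on tournaments from the years before the testing year (the `TRAIN_ON` range, end year excluded: 2015–2018 when testing on 2019), pairing each tournament with the players' statistics from the year before it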
- Post training, it predicts the first round of the tournament in two ways:
    - Walkovers (given manually) for players who compete with players who don't have statistics
    - model's predictions for matches where both players have statistics
- Testing data is reconstructed to predict winners of the second round
- Only relies on model for prediction of winners of second round
- Testing data is reconstructed to predict winners of the third round. This is also the last loop of testing.
- Only relies on model for prediction of winners of third round
- Compare with the actual results and winners of third round

# Import Libraries

In [46]:
import pandas as pd
from difflib import get_close_matches
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)


# Inputs

In [47]:
# Training years, consumed as range(TRAIN_ON[0], TRAIN_ON[1]) by the training-data cell,
# so the second value is an exclusive upper bound.
TRAIN_ON = [2015, 2019]
# Tournament year whose draw is predicted and evaluated.
TEST_ON = 2019
# Linear-kernel SVM; C is the inverse regularisation strength, max_iter caps the solver.
mod1 = SVC(kernel='linear',C=0.001, max_iter=1000000)

# Estimator picked up by the modeling cell; point this at another candidate to swap models.
MODEL = mod1

# Utils

In [48]:
# Per-player feature columns. For every match row these hold player_0's value
# minus player_1's value.
stats = ['aces_per_match', 'double_faults_per_match', 'break_points_faced_per_match',
         'break_points_opportunities_per_match', 'height (cm)',
         'matches_played', '1st_serve',
         'return_games_won', '1st_serve_points_won', '2nd_serve_points_won',
         'service_games_won', '1st_serve_return_points_won',
         '2nd_serve_return_points_won', 'pressure_rating']

# DO NOT CHANGE THIS: It contains the order of matches so that the match seeds are intact.
# The list is assigned as the index of the 64 test-year rows, which are then sorted by it.
actual_seed = [48, 43, 34, 42, 35, 53, 57, 40, 52, 49, 63, 54, 55, 59, 46,
               58, 39, 60, 41, 33, 61, 56, 50, 62, 51, 44, 38, 37, 45, 36,
               32, 47, 8, 10, 9, 15, 21, 25, 6, 7, 14, 24, 20, 18, 13, 11,
               26, 19, 16, 4, 3, 2, 28, 5, 12, 1, 17, 27, 30, 22, 29, 0,
               23, 31,]


def change_name(name: str) -> str:
    """Move the first word of a name to the end ("first last" -> "last first").

    Args:
        name (str): name of player as first last; any run of whitespace
            separates the words.

    Returns:
        str: the name with its first word moved to the end, words joined by
            single spaces. An empty or whitespace-only name yields "".
    """
    parts = name.split()
    # Slicing instead of pop(0) keeps an empty name from raising IndexError.
    return " ".join(parts[1:] + parts[:1])

def rank_gr(rank: float, group_lim: int = 15):
    """Bucket a ranking into groups whose upper bounds are multiples of ``group_lim``.

    Args:
        rank (float): ranking of the player.
        group_lim (int): step between the bucket bounds 0, group_lim, ... (below 100).

    Returns:
        int: index of the first bound that is >= ``rank``. The number of bounds
            is returned instead when ``rank`` is not a whole number in 1..999
            (NaN included) or lies above the last bound.
    """
    bounds = range(0, 100, group_lim)
    overflow_group = len(bounds)
    if rank not in range(1, 1000):
        return overflow_group

    return next(
        (position for position, upper in enumerate(bounds) if upper >= rank),
        overflow_group,
    )

def stat_df(year: int) -> pd.DataFrame:
    """Load one season's player statistics and derive the model's feature columns.

    Reads ``./data/aus-open-player-stats-{year}.csv``. ``year_df`` calls this
    with the year before a tournament, so the rows describe the players
    participating in the next year's AO.

    Args:
        year (int): year of stats needed

    Returns:
        pd.DataFrame: one row per player holding the per-match rates, the
            percentage columns converted to fractions, and ``name``, ``year``,
            ``height (cm)``, ``matches_played`` and ``pressure_rating``.
    """
    df = pd.read_csv(f'./data/aus-open-player-stats-{year}.csv')

    # Assign the result back: a chained inplace fillna on a column selection
    # is not guaranteed to reach the parent frame (pandas copy-on-write).
    df['pressure_rating'] = df['pressure_rating'].fillna(0)

    # Players with zero aces get -inf so that their aces_per_match (and any
    # difference built from it) is infinite; the prediction cells use that
    # infinity to assign the result by hand. The column must be float first,
    # otherwise -inf cannot be stored in an integer column.
    df['aces'] = df['aces'].astype(float)
    df.loc[df['aces'] == 0, 'aces'] = -np.inf

    count_cols = ['aces', 'double_faults',
                  'break_points_opportunities', 'break_points_faced']
    for col in count_cols:
        df[f'{col}_per_match'] = df[col] / df['matches_played']

    percent_cols = ["1st_serve", "1st_serve_points_won", "2nd_serve_points_won", "break_points_saved",
                    "service_games_won", "1st_serve_return_points_won", "2nd_serve_return_points_won",
                    "break_points_converted", "return_games_won", 'total_points_won', 'return_points_won']
    for col in percent_cols:
        # "63%" -> 0.63
        df[col] = df[col].apply(lambda x: round(float(x.strip('%')) / 100, 2))

    selected = [f'{col}_per_match' for col in count_cols]
    selected.extend(percent_cols)
    selected.extend(['name', 'year', 'height (cm)', 'matches_played', 'pressure_rating'])
    # Hand back an independent frame; callers add columns to it.
    return df[selected].copy()




In [49]:
def year_df(year: int) -> pd.DataFrame:
    """provides training data for the given year

    Matches are read from ``./data/m{year}.csv``. Rows alternate which slot
    the winner occupies (even rows: winner is ``player_0``, label 0; odd rows:
    winner is ``player_1``, label 1) so that both labels occur. Every column
    in ``stats`` holds ``player_0``'s value minus ``player_1``'s, taken from
    ``stat_df(year - 1)``.

    Args:
        year (int): year of the tournament

    Returns:
        pd.DataFrame: training data for the given year with the player's stats in the previous year

    Raises:
        IndexError: if a player in a match has no matching statistics row.
    """
    df = pd.read_csv(f'./data/m{year}.csv')
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True).dt.year
    df = df[['Round', 'Winner', 'Loser', 'WRank', 'LRank', 'Date']]

    # Even rows keep the winner in slot 0, odd rows put the winner in slot 1.
    even = df.iloc[::2].copy(deep=True).rename(
        columns={'Winner': 'player_0', 'Loser': 'player_1',
                 'WRank': 'rank_0', 'LRank': 'rank_1'})
    odd = df.iloc[1::2].copy(deep=True).rename(
        columns={'Winner': 'player_1', 'Loser': 'player_0',
                 'WRank': 'rank_1', 'LRank': 'rank_0'})
    even['winner'] = 0
    odd['winner'] = 1

    # sort_index restores the original match order after the split.
    final_df = pd.concat([even, odd]).sort_index()
    final_df['rank'] = final_df['rank_0'] - final_df['rank_1']

    names_to_lookup = set(final_df.player_0)
    names_to_lookup.update(set(final_df.player_1))

    df2 = stat_df(year - 1)

    # Reorder each stats name with change_name, then fuzzy-match it to a name
    # from the match file. A stats player with no close enough name maps to
    # None (and so never matches a draw name) instead of raising IndexError
    # on an empty result list.
    df2['other_name'] = df2['name'].apply(change_name).apply(
        lambda x: next(iter(get_close_matches(x, names_to_lookup, 1, cutoff=0.5)), None))

    def stat_gap(match):
        # First statistics row of each player; iloc[0] raises IndexError when none exists.
        first = df2.loc[df2.other_name == match.player_0, stats]
        second = df2.loc[df2.other_name == match.player_1, stats]
        return first.iloc[0] - second.iloc[0]

    final_df[stats] = final_df.apply(stat_gap, axis=1)
    return final_df


# Training data

In [50]:
# Build every year's frame first and concatenate once. This avoids seeding
# pd.concat with an empty DataFrame and re-copying the accumulated rows on
# each iteration. TRAIN_ON[1] is excluded by range().
yearly_frames = [year_df(year) for year in range(TRAIN_ON[0], TRAIN_ON[1])]
final_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()
final_df

Unnamed: 0,Round,player_0,player_1,rank_0,rank_1,Date,winner,rank,aces_per_match,double_faults_per_match,...,height (cm),matches_played,1st_serve,return_games_won,1st_serve_points_won,2nd_serve_points_won,service_games_won,1st_serve_return_points_won,2nd_serve_return_points_won,pressure_rating
0,1st Round,Berankis R.,Sijsling I.,85.0,84.0,2015,0,1.0,-5.551724,-1.706897,...,-16.0,32.0,0.00,-0.03,-0.05,0.00,-0.04,0.02,-0.08,39.4
1,1st Round,Brown D.,Dimitrov G.,90.0,11.0,2015,1,79.0,-1.868132,-0.817582,...,5.0,-22.0,0.04,-0.06,-0.04,-0.03,-0.02,-0.04,0.02,-68.1
2,1st Round,Anderson K.,Schwartzman D.,15.0,59.0,2015,0,-44.0,9.104343,2.211335,...,33.0,27.0,0.16,0.03,0.15,0.17,0.44,0.03,0.10,152.1
3,1st Round,Coric B.,Chardy J.,91.0,31.0,2015,1,60.0,-7.867821,-4.334382,...,0.0,10.0,-0.05,0.05,-0.02,0.02,0.01,0.02,0.02,-35.9
4,1st Round,Lacko L.,Gonzalez M.,99.0,98.0,2015,0,1.0,2.426471,0.558824,...,15.0,26.0,-0.02,0.09,0.03,-0.07,-0.02,0.07,0.04,162.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Quarterfinals,Chung H.,Sandgren T.,58.0,97.0,2018,0,-39.0,1.159184,1.942857,...,0.0,19.0,-0.03,0.09,-0.04,-0.03,-0.08,0.08,0.01,45.9
123,Quarterfinals,Berdych T.,Federer R.,20.0,2.0,2018,1,18.0,2.898785,2.747976,...,11.0,-33.0,-0.06,-0.04,-0.01,-0.08,-0.07,-0.06,0.03,-52.6
124,Semifinals,Cilic M.,Edmund K.,6.0,49.0,2018,0,-43.0,2.104312,-0.351981,...,10.0,5.0,0.00,0.01,0.02,-0.02,-0.03,0.04,-0.02,9.8
125,Semifinals,Chung H.,Federer R.,58.0,2.0,2018,1,56.0,-6.194662,0.469780,...,3.0,-3.0,-0.03,0.01,-0.09,-0.12,-0.16,0.00,0.02,-39.0


# Preprocessing

## Drop NA Values

In [51]:
# Infinite values come from the -inf sentinel written by stat_df. Turning them into NaN
# lets dropna() discard those matches along with any other row that has a missing value.
final_df = final_df.replace([np.inf, -np.inf], np.nan).dropna()

## One Hot Encoding

In [52]:
# Replace the categorical Round column with one indicator column per round value.
final_df = pd.get_dummies(final_df, columns=['Round'])
final_df

Unnamed: 0,player_0,player_1,rank_0,rank_1,Date,winner,rank,aces_per_match,double_faults_per_match,break_points_faced_per_match,...,1st_serve_return_points_won,2nd_serve_return_points_won,pressure_rating,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Semifinals,Round_The Final
0,Berankis R.,Sijsling I.,85.0,84.0,2015,0,1.0,-5.551724,-1.706897,-3.454907,...,0.02,-0.08,39.4,1,0,0,0,0,0,0
1,Brown D.,Dimitrov G.,90.0,11.0,2015,1,79.0,-1.868132,-0.817582,-1.215385,...,-0.04,0.02,-68.1,1,0,0,0,0,0,0
2,Anderson K.,Schwartzman D.,15.0,59.0,2015,0,-44.0,9.104343,2.211335,4.269068,...,0.03,0.10,152.1,1,0,0,0,0,0,0
3,Coric B.,Chardy J.,91.0,31.0,2015,1,60.0,-7.867821,-4.334382,-5.548387,...,0.02,0.02,-35.9,1,0,0,0,0,0,0
4,Lacko L.,Gonzalez M.,99.0,98.0,2015,0,1.0,2.426471,0.558824,1.632353,...,0.07,0.04,162.8,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Chung H.,Sandgren T.,58.0,97.0,2018,0,-39.0,1.159184,1.942857,3.654422,...,0.08,0.01,45.9,0,0,0,0,1,0,0
123,Berdych T.,Federer R.,20.0,2.0,2018,1,18.0,2.898785,2.747976,4.334008,...,-0.06,0.03,-52.6,0,0,0,0,1,0,0
124,Cilic M.,Edmund K.,6.0,49.0,2018,0,-43.0,2.104312,-0.351981,-1.005245,...,0.04,-0.02,9.8,0,0,0,0,0,1,0
125,Chung H.,Federer R.,58.0,2.0,2018,1,56.0,-6.194662,0.469780,1.195447,...,0.00,0.02,-39.0,0,0,0,0,0,1,0


# Modeling

In [53]:
# Features: everything except the label, the player names and the two raw ranks
# (their difference stays in as 'rank').
X_train = final_df.drop(columns=['winner','player_0','player_1','rank_0','rank_1'])
# Label: 0 when player_0 won the match, 1 when player_1 won.
y_train = final_df['winner']

In [54]:
# `model` refers to the same estimator object as MODEL (no copy is made).
model = MODEL
model.fit(X_train, y_train)

# Predicting

In [55]:
# We get data of players participating in the TEST_ON tournament (2019)
# and their stats from the year before (2018)
main_df = year_df(TEST_ON)
main_df = pd.get_dummies(main_df, columns=['Round'])
cols_to_have = main_df.columns
# .loc slices by label and includes the end label: rows 0..63, i.e. 64 matches.
main_df = main_df.loc[:63,].copy(deep=True)

In [56]:
# Relabel the 64 matches with actual_seed and sort, so the rows follow the seeded match order.
main_df.index = actual_seed
main_df.sort_index(inplace=True)
# Rows with an infinite feature (the -inf sentinel from stat_df) or any missing value
# are left out of the data handed to the model.
test = main_df.replace([np.inf,-np.inf],np.nan)
test = test.dropna()

# # test.drop(columns=['player_0','player_1','rank_0','rank_1'], inplace=True)
# Same feature/label split as for training.
X_test = test.drop(columns=['winner','player_0','player_1','rank_0','rank_1'])
y_test = test['winner']

First Round (Round of 64) prediction accuracy

In [57]:
# Mean accuracy on the test-year matches that survived the dropna() above.
model.score(X_test, y_test)

0.7678571428571429

# First Round Predictions

In [58]:
# One predicted label per row of X_test: 0 -> player_0 wins, 1 -> player_1 wins.
round_64 = model.predict(X_test)

In [59]:
# The seed index is dropped; predictions are attached positionally, in the same
# row order that produced X_test.
test = test.reset_index(drop=True)
test['prediction'] = round_64

In [60]:
# this dataframe helps in including the manual and model predictions
# Left join of the model predictions onto all 64 matches, keyed by both player names
# and both ranks; matches that were dropped from `test` get NaN as prediction.
middle_man = main_df.join(test[['player_0', 'player_1',
                                'rank_0', 'rank_1', 'prediction']]
                          .set_index(['player_0', 'player_1',
                                      'rank_0', 'rank_1']),
                          on=['player_0', 'player_1', 'rank_0', 'rank_1'])


## Force a certain prediction

In [61]:
# Matches whose aces_per_match difference is infinite were removed before the model
# scored anything, so their result is set by hand: a -inf difference gives 1
# (player_1 advances), a +inf difference gives 0 (player_0 advances).
middle_man.loc[np.isinf(middle_man.aces_per_match), 'prediction'] = (
    middle_man.aces_per_match == -np.inf
).astype(int)


In [62]:
# Predicted winner of every first-round match: player_0 when the prediction is 0,
# player_1 in every other case (including a missing prediction).
middle_man['w_name'] = np.where(middle_man.prediction == 0, middle_man.player_0, middle_man.player_1)
middle_man['w_rank'] = np.where(middle_man.prediction == 0, middle_man.rank_0, middle_man.rank_1)

# 2nd Round predictions (Round of 32)

## Preprocessing

In [63]:
# Predicted first-round winners (name and rank), still in seeded match order.
round_32 = middle_man[['w_name','w_rank']].copy(deep=True)

In [64]:
# Adjacent winners meet in the next round: rows at even positions and rows at odd
# positions are separated and renumbered so they can be placed side by side.
r_32_even = round_32.iloc[::2,].copy(deep=True).reset_index(drop=True)
r_32_odd = round_32.iloc[1::2,].copy(deep=True).reset_index(drop=True)

In [65]:
# Even-position winners take the player_0 slot, odd-position winners the player_1 slot.
r_32_even.columns = ['player_0','rank_0']
r_32_odd.columns = ['player_1','rank_1']

In [66]:
# Column-wise concat pairs row i of the even half with row i of the odd half.
r_32_merge = pd.concat([r_32_even,r_32_odd], axis=1)
r_32_merge['Round'] = '2nd Round'

In [67]:
# Statistics from the year before the tested tournament.
df2 = stat_df(TEST_ON - 1)

names_to_lookup = set(middle_man.player_0)
names_to_lookup.update(middle_man.player_1)

df2['new_name'] = df2['name'].apply(change_name)
# A stats player with no close enough name in the draw maps to None (which never
# equals a draw name) instead of raising IndexError on an empty result list.
df2['other_name'] = df2['new_name'].apply(
    lambda x: next(iter(get_close_matches(x, names_to_lookup, 1, cutoff=0.5)), None))

df2 = df2.drop(columns=['new_name'])

In [68]:
# Feature values per match: player_0's statistics minus player_1's, each taken from
# the first df2 row whose matched name equals the player.
r_32_merge[stats] = r_32_merge.apply(
    lambda match: (
        df2.loc[df2.other_name == match.player_0, stats].iloc[0]
        - df2.loc[df2.other_name == match.player_1, stats].iloc[0]
    ),
    axis=1,
)

In [69]:
# Same rank-difference feature as in training, then one-hot encode the single Round value.
r_32_merge['rank'] = r_32_merge['rank_0'] - r_32_merge['rank_1']
r_32_merge = pd.get_dummies(r_32_merge, columns=['Round'])

In [70]:
# A frame holding only second-round matches yields just one Round dummy; add the
# others as zeros so every training column exists.
r_32_merge[['Round_1st Round','Round_3rd Round', 'Round_4th Round', 'Round_Quarterfinals','Round_Semifinals','Round_The Final']] = 0
# Take the year from TEST_ON instead of a hard-coded '2019', so changing the tested
# tournament keeps the Date feature consistent.
r_32_merge['Date'] = TEST_ON

## Predicting

In [71]:
X_test = r_32_merge.drop(columns=['player_0', 'player_1', 'rank_0', 'rank_1'])
# Select the columns by X_train.columns so they are in the order the model was fitted on.
r_32_pred = model.predict(X_test[X_train.columns])
r_32_merge['prediction'] = r_32_pred

In [72]:
# Predicted winner of every second-round match: player_0 when the prediction is 0,
# player_1 otherwise.
r_32_merge['w_name'] = np.where(r_32_merge.prediction == 0, r_32_merge.player_0, r_32_merge.player_1)
r_32_merge['w_rank'] = np.where(r_32_merge.prediction == 0, r_32_merge.rank_0, r_32_merge.rank_1)

# 3rd Round predictions (Round of 16)

## Preprocessing

In [73]:
# Predicted second-round winners (name and rank), in match order.
r_16 = r_32_merge[['w_name','w_rank']].copy(deep=True)

In [74]:
# Adjacent winners meet in the next round: split rows at even and odd positions
# and renumber both halves so they can be placed side by side.
r_16_even = r_16.iloc[::2,].copy(deep=True).reset_index(drop=True)
r_16_odd = r_16.iloc[1::2,].copy(deep=True).reset_index(drop=True)

In [75]:
# Even-position winners take the player_0 slot, odd-position winners the player_1 slot.
r_16_even.columns = ['player_0','rank_0']
r_16_odd.columns = ['player_1','rank_1']

In [76]:
# Column-wise concat pairs row i of the even half with row i of the odd half.
r_16_merge = pd.concat([r_16_even,r_16_odd], axis=1)
r_16_merge['Round'] = '3rd Round'

In [77]:
# Feature values per match: player_0's statistics minus player_1's, each taken from
# the first df2 row whose matched name equals the player.
r_16_merge[stats] = r_16_merge.apply(
    lambda match: (
        df2.loc[df2.other_name == match.player_0, stats].iloc[0]
        - df2.loc[df2.other_name == match.player_1, stats].iloc[0]
    ),
    axis=1,
)

In [78]:
# Same rank-difference feature as in training, then one-hot encode the single Round value.
r_16_merge['rank'] = r_16_merge['rank_0'] - r_16_merge['rank_1']
r_16_merge = pd.get_dummies(r_16_merge, columns=['Round'])

In [79]:
# A frame holding only third-round matches yields just one Round dummy; add the
# others as zeros so every training column exists.
r_16_merge[['Round_1st Round','Round_2nd Round', 'Round_4th Round', 'Round_Quarterfinals','Round_Semifinals','Round_The Final']] = 0
# Take the year from TEST_ON instead of a hard-coded '2019', so changing the tested
# tournament keeps the Date feature consistent.
r_16_merge['Date'] = TEST_ON

In [80]:
X_test = r_16_merge.drop(columns=['player_0','player_1','rank_0','rank_1'])
# Select the columns by X_train.columns so they are in the order the model was fitted on.
r_16_pred = model.predict(X_test[X_train.columns])

In [81]:
# 0 -> player_0 is predicted to win, 1 -> player_1.
r_16_merge['prediction']= r_16_pred

# Final Contenders

In [82]:
# Predicted winner of every third-round match: player_0 when the prediction is 0,
# player_1 otherwise. The last expression displays the pairings next to the winners.
r_16_merge['w_name'] = np.where(r_16_merge.prediction == 0, r_16_merge.player_0, r_16_merge.player_1)
r_16_merge['w_rank'] = np.where(r_16_merge.prediction == 0, r_16_merge.rank_0, r_16_merge.rank_1)
r_16_merge[['player_0','player_1','rank_0','rank_1','w_name','w_rank']]

Unnamed: 0,player_0,player_1,rank_0,rank_1,w_name,w_rank
0,Djokovic N.,Shapovalov D.,1.0,27.0,Djokovic N.,1.0
1,Goffin D.,Medvedev D.,22.0,19.0,Medvedev D.,19.0
2,Fognini F.,Jaziri M.,13.0,43.0,Fognini F.,13.0
3,Kohlschreiber P.,Nishikori K.,32.0,9.0,Nishikori K.,9.0
4,Zverev A.,Simon G.,4.0,30.0,Zverev A.,4.0
5,Chung H.,Raonic M.,25.0,17.0,Chung H.,25.0
6,Coric B.,Cecchinato M.,12.0,18.0,Coric B.,12.0
7,Pouille L.,Thiem D.,31.0,8.0,Thiem D.,8.0
8,Cilic M.,Verdasco F.,7.0,28.0,Cilic M.,7.0
9,Bautista Agut R.,Khachanov K.,24.0,11.0,Khachanov K.,11.0


In [83]:
# One row per player of the tested tournament, flagged with whether the player
# actually won a third-round match and whether the pipeline predicted that.
test_df = pd.read_csv(f'./data/m{TEST_ON}.csv')
winners = test_df.loc[test_df.Round == '3rd Round','Winner']
pred_winners = r_16_merge['w_name']

names = set(test_df.Loser)
names.update(test_df.Winner)

conf_matrix = pd.DataFrame({'name': list(names)})
conf_matrix['actual'] = conf_matrix['name'].isin(winners).astype(int)
conf_matrix['pred'] = conf_matrix['name'].isin(pred_winners).astype(int)
conf_matrix

Unnamed: 0,name,actual,pred
0,Istomin D.,0,0
1,Krueger M.,0,0
2,Fucsovics M.,0,0
3,Vanni L.,0,0
4,Simon G.,0,0
...,...,...,...
123,Isner J.,0,1
124,Kubler J.,0,0
125,Polmans M.,0,0
126,Gulbis E.,0,0


In [84]:
# Rows are the actual class, columns the predicted class (0 first, then 1); counts are players.
confusion_matrix(conf_matrix['actual'],conf_matrix['pred'])

array([[105,   7],
       [  7,   9]])

In [85]:
# One row per player of the tested tournament, flagged with whether the player
# actually won a second-round match and whether the pipeline predicted that
# (r_16 holds the predicted second-round winners).
test_df = pd.read_csv(f'./data/m{TEST_ON}.csv')
winners = test_df.loc[test_df.Round == '2nd Round','Winner']
pred_winners = r_16['w_name']

names = set(test_df.Loser)
names.update(test_df.Winner)

conf_matrix = pd.DataFrame({'name': list(names)})
conf_matrix['actual'] = conf_matrix['name'].isin(winners).astype(int)
conf_matrix['pred'] = conf_matrix['name'].isin(pred_winners).astype(int)
conf_matrix

Unnamed: 0,name,actual,pred
0,Istomin D.,0,0
1,Krueger M.,0,0
2,Fucsovics M.,0,0
3,Vanni L.,0,0
4,Simon G.,0,1
...,...,...,...
123,Isner J.,0,1
124,Kubler J.,0,0
125,Polmans M.,0,0
126,Gulbis E.,0,0


In [86]:
# Rows are the actual class, columns the predicted class (0 first, then 1); counts are players.
confusion_matrix(conf_matrix['actual'],conf_matrix['pred'])

array([[87,  9],
       [ 9, 23]])

In [87]:
# Player-level metrics for whichever conf_matrix was built last (the second-round one
# in cell order); the positive class 1 means "won a match in that round".
acc = accuracy_score(conf_matrix['actual'],conf_matrix['pred'])
prec = precision_score(conf_matrix['actual'],conf_matrix['pred'])
rec = recall_score(conf_matrix['actual'],conf_matrix['pred'])
f1 = f1_score(conf_matrix['actual'],conf_matrix['pred'])
print(f'Accuracy: {acc}\nPrecision: {prec}\nRecall: {rec}\nF1: {f1}')

Accuracy: 0.859375
Precision: 0.71875
Recall: 0.71875
F1: 0.71875


In [88]:
# One row per player of the tested tournament, flagged with whether the player
# actually won a first-round match and whether the pipeline predicted that
# (middle_man holds both the model's and the hand-assigned first-round results).
test_df = pd.read_csv(f'./data/m{TEST_ON}.csv')
winners = test_df.loc[test_df.Round == '1st Round','Winner']
pred_winners = middle_man['w_name']

names = set(test_df.Loser)
names.update(test_df.Winner)

conf_matrix = pd.DataFrame({'name': list(names)})
conf_matrix['actual'] = conf_matrix['name'].isin(winners).astype(int)
conf_matrix['pred'] = conf_matrix['name'].isin(pred_winners).astype(int)
conf_matrix

Unnamed: 0,name,actual,pred
0,Istomin D.,0,0
1,Krueger M.,0,0
2,Fucsovics M.,1,1
3,Vanni L.,0,0
4,Simon G.,1,1
...,...,...,...
123,Isner J.,0,1
124,Kubler J.,0,0
125,Polmans M.,0,0
126,Gulbis E.,0,0


In [89]:
# Rows are the actual class, columns the predicted class (0 first, then 1); counts are players.
confusion_matrix(conf_matrix['actual'],conf_matrix['pred'])

array([[50, 14],
       [14, 50]])

In [90]:
# Player-level metrics for whichever conf_matrix was built last (the first-round one
# in cell order); the positive class 1 means "won a match in that round".
acc = accuracy_score(conf_matrix['actual'],conf_matrix['pred'])
prec = precision_score(conf_matrix['actual'],conf_matrix['pred'])
rec = recall_score(conf_matrix['actual'],conf_matrix['pred'])
f1 = f1_score(conf_matrix['actual'],conf_matrix['pred'])
print(f'Accuracy: {acc}\nPrecision: {prec}\nRecall: {rec}\nF1: {f1}')

Accuracy: 0.78125
Precision: 0.78125
Recall: 0.78125
F1: 0.78125
