In [38]:
import pandas as pd
import numpy as np
import seaborn as sn
import tensorflow as tf
import matplotlib.pyplot as plt
from math import log
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn import neighbors, metrics

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from time import sleep


## Betting odds Dataframe

Source: <a href="http://tennis-data.co.uk/" target="_blank">http://tennis-data.co.uk/</a>

In [117]:
# Download one results workbook per season (2002-2020) from tennis-data.co.uk.
# The frames are collected first and concatenated once: calling pd.concat inside
# the loop would re-copy the ever-growing frame on every iteration.
yearly_frames=[]
for i in range(2002,2021):
    url="http://tennis-data.co.uk/%d/%d.xlsx" %(i,i)
    df=pd.read_excel(url)
    yearly_frames.append(df)
df_betting=pd.concat(yearly_frames,ignore_index=True)

Create a dataframe with ATP matches since 2002

In [40]:
def checkName(Name):
    """Normalise a player name so the same player is always spelled the same way.

    The normalisation collapses a doubled dot ('..') into a single one, removes
    the space that follows a dot ('J. M.' -> 'J.M.') and makes sure the name
    ends with a dot.

    Parameters
    ----------
    Name : str
        Player name as found in the betting data.

    Returns
    -------
    The normalised name. Values that are not strings (e.g. NaN) and empty
    strings are returned unchanged instead of raising.
    """
    # Missing or empty names cannot be normalised; hand them back untouched so
    # that a later dropna()/filter can deal with them.
    if not isinstance(Name, str) or not Name:
        return Name

    Name=Name.replace('..','.').replace('. ','.')
    if not Name.endswith('.'):
        Name=Name+'.'
    return Name

In [41]:
# Normalised copies of the two name columns: a consistent spelling gives every
# player a single identifier that can be matched against the players table.
df_betting_players=pd.DataFrame({
    "Winner": df_betting["Winner"].apply(checkName),
    "Loser": df_betting["Loser"].apply(checkName),
})

## Players Dataframe

Source: <a href="https://www.kaggle.com/romanzdk/atp-players-overviews" target="_blank">https://www.kaggle.com/romanzdk/atp-players-overviews/</a>

In [42]:
# Load the Kaggle player overview export, discard columns 4-16 and then the
# last remaining column, and give the six columns that are left short names.
df_players=(
    pd.read_csv('player_overviews.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[4:17]))
    .pipe(lambda frame: frame.drop(columns=frame.columns[-1]))
)
df_players.columns=['AtpId','AtpName','FirstName','Name','Height','Handedness']
df_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,
1,a005,nelson-aerts,Nelson,Aerts,188.0,
2,a004,egan-adams,Egan,Adams,178.0,
3,a006,ronald-agenor,Ronald,Agenor,180.0,
4,a007,juan-aguilera,Juan,Aguilera,183.0,


### Change Name syntax to be similar to betting dataframe

For a future merge with the betting dataframe, we need to apply the same syntax to the Name column

In [43]:
def PlayerName(Name,FirstName):
    """Build the betting-style display name "<last name> <initials>." of a player.

    Parameters
    ----------
    Name : str
        Last name.
    FirstName : str
        First name(s). When the first name is hyphenated or made of several
        words, the initials of its first two parts are used
        ("Jo-Wilfried" -> "J.W.", "Juan Martin" -> "J.M.").

    Returns
    -------
    str or float
        The title-cased display name, or ``np.nan`` when one of the inputs is
        missing (not a string, or empty), e.g. the NaN placeholders stored when
        a name could not be scraped.
    """
    # Missing inputs cannot be formatted; report them as missing instead of raising.
    if not isinstance(Name, str) or not isinstance(FirstName, str):
        return np.nan
    if not Name or not FirstName:
        return np.nan

    if "-" in FirstName:
        # Ignore empty pieces such as the one left by a trailing hyphen.
        parts=[piece.strip() for piece in FirstName.split("-")]
        parts=[piece for piece in parts if piece]
        separator=' '
    elif " " in FirstName:
        parts=FirstName.split()
        separator=' '
    else:
        parts=[FirstName]
        # A last name that already ends with a dot is glued to the initial.
        separator='' if Name.endswith('.') else ' '

    if not parts:
        return np.nan

    # Always use initials, including when a stray space leaves a single word
    # (previously the whole first name was emitted in that case).
    initials=''.join(piece[0]+'.' for piece in parts[:2])
    return (Name+separator+initials).title()

In [44]:
# Betting-style display name ("Last F.") for every player of the Kaggle table.
df_players['NewName']=[
    PlayerName(last_name,first_name)
    for last_name,first_name in zip(df_players["Name"],df_players["FirstName"])
]

Some players share the same last name and the same first-name initial, so we have to distinguish them.

### Update players dataframe

The dataset is not up to date and some current players are missing. The idea is to scrape the missing data from the https://www.atptour.com/ website.

In [45]:
def AtpPlayers():
    """Scrape the ATP singles ranking page and every listed player's overview page.

    Player information is available on each player's ATP overview page, whose
    URL contains the player's ATP name and ATP id. A first request to the
    ranking page (snapshot of 2020-9-14, rank range 0-700) yields those URLs;
    the first and last names are then read from each overview page.

    Returns
    -------
    dict
        Parallel lists stored under the keys 'AtpId', 'AtpName', 'FirstName',
        'Name' and 'ActualRanking'. A name that cannot be found on the overview
        page is stored as NaN. 'ActualRanking' is the 1-based position of the
        player's link on the ranking page.
    """
    AtpIdList=[]
    AtpNameList=[]
    FirstNameList=[]
    LastNameList=[]
    ActualRankingList=[]
    atpRank='https://www.atptour.com/en/rankings/singles/?rankDate=2020-9-14&countryCode=all&rankRange=0-700'
    # NOTE(review): this request sends an empty User-Agent while the overview
    # requests send 'Mozilla/5.0' - confirm this is intentional.
    req = Request(atpRank, headers={'User-Agent': ''})
    webpageAtp = urlopen(req).read()
    soup = BeautifulSoup(webpageAtp, 'html.parser')

    # Query the page once, and iterate over the links actually present instead
    # of assuming that exactly 700 of them exist.
    player_links = soup.select('.player-cell a')[:700]
    for i, link in enumerate(player_links):
        url_overview='https://www.atptour.com' + link['href']
        url_parts=url_overview.split("/")
        AtpName=url_parts[5]
        AtpId=url_parts[6]

        req2 = Request(url_overview, headers={'User-Agent': 'Mozilla/5.0'})
        webpage_overview = urlopen(req2).read()
        soup2 = BeautifulSoup(webpage_overview, 'html.parser')
        # find() returns None when the element is absent, hence AttributeError.
        try:
            FirstName = soup2.find( attrs={'class': 'first-name'}).get_text()
        except AttributeError:
            FirstName=np.nan

        try:
            Name = soup2.find( attrs={'class': 'last-name'}).get_text()
        except AttributeError:
            Name=np.nan

        AtpIdList.append(AtpId)
        AtpNameList.append(AtpName)
        FirstNameList.append(FirstName)
        LastNameList.append(Name)
        ActualRankingList.append(i+1)

    # Built once, after the loop, so it also exists when no link was found.
    Dictionnary={'AtpId':AtpIdList,'AtpName':AtpNameList,'FirstName':FirstNameList,'Name':LastNameList,'ActualRanking':ActualRankingList}

    return Dictionnary

In [46]:
# Scrape the currently ranked players (dictionary of lists) and load them
# straight into a dataframe.
df_AtpPlayers=pd.DataFrame(AtpPlayers())
# Same display-name convention as the Kaggle players table.
df_AtpPlayers['NewName']=[
    PlayerName(last_name,first_name)
    for last_name,first_name in zip(df_AtpPlayers["Name"],df_AtpPlayers["FirstName"])
]

In [47]:
# Stack the Kaggle table and the scraped table, then keep one row per AtpId.
# keep="last" means the scraped row wins whenever a player is in both sources.
# NOTE(review): the scraped rows carry no Height/Handedness, so those values
# are empty for such players until they are scraped again - confirm intended.
df_total_players=(
    pd.concat([df_players, df_AtpPlayers], ignore_index=True)
    .drop_duplicates(subset="AtpId", keep="last")
)
print(df_total_players.shape)
df_total_players.head()

(11188, 8)


Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,,Abdullahi S.,
1,a005,nelson-aerts,Nelson,Aerts,188.0,,Aerts N.,
2,a004,egan-adams,Egan,Adams,178.0,,Adams E.,
3,a006,ronald-agenor,Ronald,Agenor,180.0,,Agenor R.,
4,a007,juan-aguilera,Juan,Aguilera,183.0,,Aguilera J.,


In [48]:
# Sort by last name then first name and rebuild a 0..n-1 index:
# duplicatedNames() addresses the rows with df.loc[i, ...] for i in range(len(df)).
df_total_players=df_total_players.sort_values(["Name","FirstName"]).reset_index(drop=True)

In [49]:
def duplicatedNames(df):
    """Make every display name in ``df["NewName"]`` unique.

    Rows are processed in order. When a row's display name is already taken,
    it is rebuilt as "<Name> <first k letters of FirstName>." with k = 1, 2, ...
    until it is free. If the whole first name is still not enough (two players
    with the same last and first name), a running number is appended
    ("<Name> <FirstName> 2.") so that the search always terminates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Name", "FirstName" and "NewName" and be
        indexed 0..len(df)-1, because rows are addressed with ``df.loc[i, ...]``.
        The "NewName" column is updated in place.

    Returns
    -------
    list
        The unique display names, in row order (empty list for an empty frame).
    """
    if len(df)==0:
        return []

    column=[df.loc[0,"NewName"]]
    seen=set(column)  # constant-time membership test instead of scanning the list
    for i in range(1,len(df)):
        candidate=df.loc[i,"NewName"]
        if candidate in seen:
            last_name=df.loc[i,"Name"]
            first_name=df.loc[i,"FirstName"]
            prefix_len=1
            counter=1
            while candidate in seen:
                if prefix_len<=len(first_name):
                    # Reveal one more letter of the first name.
                    candidate=last_name+' '+first_name[:prefix_len]+'.'
                    prefix_len+=1
                else:
                    # Full homonym: fall back to a numeric suffix.
                    counter+=1
                    candidate=last_name+' '+first_name+' '+str(counter)+'.'
            df.loc[i,"NewName"]=candidate

        column.append(candidate)
        seen.add(candidate)
    return column

In [50]:
# Overwrite the last-name column with the unique display names ("Last F.").
# Not idempotent: after this cell "Name" no longer holds the bare last name.
df_total_players["Name"]=duplicatedNames(df_total_players)

In [51]:
# The display names now live in "Name", so the helper column can be removed.
df_total_players=df_total_players.drop(columns=["NewName"])
df_total_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
0,a451,bob-abbott,Bob,Abbott B.,0.0,,
1,a755,hashim-abdal,Hashim,Abdal H.,0.0,,
2,ad60,robert-abdesselam,Robert,Abdesselam R.,,,
3,a742,ahmed-abdrabuh,Ahmed,Abdrabuh A.,0.0,,
4,a706,syrym-abdukhalikov,Syrym,Abdukhalikov S.,0.0,,


In [91]:
# Sanity check: players sharing a last name and an initial must get distinct display names.
df_total_players[df_total_players["Name"].str.contains("Kuznetsov")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
5512,k737,alex-kuznetsov,Alex,Kuznetsov A.,183.0,Right-Handed,
5513,kb54,andrey-kuznetsov,Andrey,Kuznetsov An.,,,569.0


In [53]:
# Size of the original Kaggle players table, for comparison with the merged table.
print(df_players.shape)

(10911, 7)


In [54]:
# Spot check on one known player.
df_total_players[df_total_players["Name"]=="Nadal R."]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
7064,n409,rafael-nadal,Rafael,Nadal R.,,,2.0


### Select only common players

In [61]:
# Keep only the players that appear in the betting data, as winner or as loser.
df_selected_players=(
    df_total_players[
        df_total_players["Name"].isin(
            pd.concat([df_betting_players['Winner'],df_betting_players['Loser']])
        )
    ]
    .reset_index(drop=True)
)

In [62]:
# Number of players found in both the players table and the betting data.
print(df_selected_players.shape)

(1201, 7)


Height, Handedness and Backhand are interesting criteria to analyse, but we saw in the model prediction notebook that Backhand had too many NaN values even after scraping. Thanks to the AtpId and AtpName, we can scrape the missing Height and Handedness values from the https://www.atptour.com/ website.

In [63]:
def _scrape_height(soup):
    """Return the height text (digits, in cm) found in the parsed page, or NaN."""
    node = soup.find( attrs={'class': 'table-height-cm-wrapper'})
    if node is None:
        return np.nan
    text = node.get_text().replace('(','').replace(')','').replace('cm','')
    # The site reports an unknown height as 0.
    return np.nan if text=='0' else text


def _scrape_handedness(soup):
    """Return the handedness text found in the parsed page, or NaN."""
    cells = soup.find_all( attrs={'class':"table-value"})
    # The third 'table-value' cell is read, so at least three are required.
    if len(cells)<=2:
        return np.nan
    # The cell text is split on ", " and only its first part is kept.
    text = cells[2].get_text().split(", ")[0]
    text = text.replace("\r","").replace("\n","")
    return np.nan if text=='' else text


def scraping(atpId,atpName,height,handedness):
    """Fill in a player's missing height and handedness from the ATP overview page.

    Parameters
    ----------
    atpId, atpName : str
        Identifier and name slug used to build the overview page URL.
    height : float
        Known height; NaN or 0.0 means "missing".
    handedness : str or float
        Known handedness; NaN means "missing".

    Returns
    -------
    tuple
        ``(height, handedness)``. Values that were already known are returned
        unchanged; a scraped height is returned as the text found on the page;
        values that could not be found are NaN.
    """
    need_height = (height!=height) or (height==0.0)  # NaN or 0 placeholder
    need_handedness = handedness!=handedness  # NaN

    # Nothing to look up: skip the HTTP request entirely.
    if not (need_height or need_handedness):
        return height,handedness

    if need_height:
        height=np.nan  # a 0.0 placeholder must never be returned as a real height

    try:
        url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
        req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = BeautifulSoup(webpage, 'html.parser')

        if need_height:
            height=_scrape_height(soup)
        if need_handedness:
            handedness=_scrape_handedness(soup)
    except Exception:
        # Best effort: an unreachable or unparsable page leaves the missing
        # values as NaN, but must not erase a value that was already known.
        pass

    return height,handedness

In [64]:
def UpdateRanking(atpId,atpName):
    """Read a player's ranking from the ATP overview page.

    Parameters
    ----------
    atpId, atpName : str
        Identifier and name slug used to build the overview page URL.

    Returns
    -------
    str or float
        The text of the 'data-number' element when it parses as an integer
        (returned as scraped, not converted), otherwise NaN - e.g. for
        inactive players or when the element is absent.
    """
    url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
    req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')

    # Check for the missing element explicitly instead of using a bare `except`.
    node = soup.find( attrs={'class': 'data-number'})
    if node is None:
        return np.nan
    Ranking = node.get_text()

    try:
        int(Ranking)  # validation only
    except ValueError: #For inactive players
        return np.nan
    return Ranking

In [65]:
# Column types before scraping the missing values.
df_selected_players.dtypes

AtpId             object
AtpName           object
FirstName         object
Name              object
Height           float64
Handedness        object
ActualRanking    float64
dtype: object

In [66]:
# Complete Height/Handedness row by row. scraping() sends HTTP requests to
# atptour.com, so this cell is slow and needs network access.
df_selected_players[['Height','Handedness']]=df_selected_players.apply(lambda x : scraping(x['AtpId'],x['AtpName'],x['Height'],x['Handedness']),axis=1, result_type="expand")
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
0,a618,mohammed-abdulla,Mohammed,Abdulla M.,,,
1,a391,maximilian-abel,Maximilian,Abel M.,,Right-Handed,
2,a389,jose-acasuso,Jose,Acasuso J.,191.0,Right-Handed,
3,a305,jacob-adaktusson,Jacob,Adaktusson J.,193.0,Right-Handed,
4,a310,emin-agaev,Emin,Agaev E.,,Right-Handed,


In [467]:
# Scraped heights come back as text; make the whole column numeric.
df_selected_players["Height"] = df_selected_players.Height.astype(float) # convert to float

In [67]:
# Numeric player identifier: the row position in the selected-players table.
df_selected_players['Id']=df_selected_players.index

In [68]:
# Snapshot of the scraped table, so it can be restored without scraping again.
df_copy=df_selected_players.copy()

In [69]:
# Preview the players table with its new Id column.
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking,Id
0,a618,mohammed-abdulla,Mohammed,Abdulla M.,,,,0
1,a391,maximilian-abel,Maximilian,Abel M.,,Right-Handed,,1
2,a389,jose-acasuso,Jose,Acasuso J.,191.0,Right-Handed,,2
3,a305,jacob-adaktusson,Jacob,Adaktusson J.,193.0,Right-Handed,,3
4,a310,emin-agaev,Emin,Agaev E.,,Right-Handed,,4


## Model Dataframe

### Clean Dataframe

Depending on the year, the bookmakers present in the data are not the same, so it is better to use the Max and Average of the betting odds.

In [118]:
#Fill NaN values
# Where the Max/Avg columns are empty, rebuild them from the individual
# bookmaker odds of the same row (rounded to 2 decimals).
bookmakers=['B365', 'CB', 'GB', 'IW','SB', 'B&W', 'EX', 'PS', 'UB', 'LB', 'SJ']
for side in ('W','L'):
    odds=df_betting[[bookmaker+side for bookmaker in bookmakers]]
    df_betting['Max'+side]=df_betting['Max'+side].fillna(round(odds.max(axis=1),2))
    df_betting['Avg'+side]=df_betting['Avg'+side].fillna(round(odds.mean(axis=1),2))

In [119]:
# Drop the data that is not needed
# Rows: keep completed matches for which both maximum odds are present.
has_max_odds=df_betting['MaxW'].notna() & df_betting['MaxL'].notna()
is_completed=df_betting["Comment"]=="Completed"
df_betting=df_betting[has_max_odds & is_completed]

# Columns: match metadata, individual bookmaker odds, set scores and points.
unused_columns=(
    ['Comment','Date','ATP','Best of']
    +['B365W', 'CBW', 'GBW', 'IWW','SBW', 'B&WW', 'EXW', 'PSW', 'UBW', 'LBW', 'SJW','B365L', 'CBL', 'GBL', 'IWL','SBL', 'B&WL', 'EXL', 'PSL', 'UBL', 'LBL', 'SJL']
    +['W1','L1','W2','L2','W3','L3','W4','L4','W5','L5','Wsets','Lsets','WPts','LPts']
)
df_betting=df_betting.drop(columns=unused_columns)

# Rows again: remove matches involving a non-ranked ('NR') player, then any
# row that still has a missing value.
unranked=(df_betting["WRank"]=='NR') | (df_betting["LRank"]=='NR')
df_betting=df_betting[~unranked].dropna()

# No more 'NR' entries, so the loser rank can be converted to float.
# NOTE(review): only LRank is cast; confirm WRank is already numeric.
df_betting["LRank"]=df_betting["LRank"].astype(float)

### Preprocessing data

In [120]:
# Use the same spelling convention for both name columns so that every player
# has a single identifier.
for name_column in ("Winner","Loser"):
    df_betting[name_column]=df_betting[name_column].apply(checkName)

In [121]:
#Some Series have been renamed over the years
# Assign the result back: an in-place replace on the selected column is a
# chained operation and does not update df_betting under pandas Copy-on-Write.
# NOTE(review): "Masters" is mapped to "ATP Finals" like "Masters Cup" -
# confirm that this is the intended target category.
df_betting['Series']=df_betting['Series'].replace({"International Gold":"ATP500","Masters Cup":"ATP Finals","Masters":"ATP Finals"})
#Delete trailing white space
df_betting['Location']=df_betting['Location'].str.rstrip()

In [122]:
# Rename the columns: Winner -> Player1 and Loser -> Player2 (same for rank and odds)
player_columns={
    'Winner':'Player1','Loser':'Player2',
    'WRank':'P1Rank','LRank':'P2Rank',
    'MaxW':'MaxP1','MaxL':'MaxP2',
    'AvgW':'AvgP1','AvgL':'AvgP2',
}
df_betting=df_betting.rename(columns=player_columns)
# Label to predict: 1 when Player1 won the match (true for every row at this point).
df_betting['P1Winner']=1

In the current format every winner is Player 1, which would be a problem for the model.
So, for a random half of the rows, the Player 1 and Player 2 data are swapped.

In [123]:
# Pick half of the matches at random; the fixed seed makes the selection (and
# therefore everything trained on it) reproducible across runs.
mask=df_betting.sample(frac=.5,random_state=42).index
# Swap Player1/Player2 one pair of columns at a time, so that text columns and
# numeric columns are never mixed in the same intermediate array.
for p1_column,p2_column in [('Player1','Player2'),('P1Rank','P2Rank'),('MaxP1','MaxP2'),('AvgP1','AvgP2')]:
    df_betting.loc[mask,[p1_column,p2_column]] = df_betting.loc[mask,[p2_column,p1_column]].values
# In the swapped rows Player1 is the original loser.
df_betting.loc[mask,['P1Winner']]=0

In [124]:
# Attach the player attributes twice, once per side of the match. After the
# random swap Player1/Player2 are no longer "winner"/"loser".
# In the second merge the overlapping player columns get the suffix 'P1' for
# the ones already present and 'P2' for the newly merged ones.
df_betting=df_betting.merge(df_selected_players,left_on='Player1',right_on="Name", how='left',suffixes=['P2','P1']) #first merge: attributes of Player1
df_betting=df_betting.merge(df_selected_players,left_on='Player2',right_on="Name", how='left',suffixes=['P1','P2'])#second merge: attributes of Player2

In [125]:
# Keep only the numeric player Id from the merged attributes, then remove
# rows with missing values and exact duplicate rows.
df_betting=(
    df_betting
    .drop(columns=['FirstNameP1','FirstNameP2','NameP1','NameP2',"AtpIdP1","AtpIdP2","ActualRankingP1","ActualRankingP2","AtpNameP1","AtpNameP2","Player1","Player2"])
    .dropna()
    .drop_duplicates()
)

In [132]:
# Make both height columns numeric.
df_betting=df_betting.astype({"HeightP1":float,"HeightP2":float})

In [126]:
# Preview of the model dataframe before label encoding.
df_betting.head(10)

Unnamed: 0,Location,Tournament,Series,Court,Surface,Round,P1Rank,P2Rank,MaxP1,MaxP2,AvgP1,AvgP2,P1Winner,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2
0,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,25.0,46.0,1.6,2.2,1.55,2.17,1,175,Left-Handed,37.0,185,Right-Handed,575.0
1,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,309.0,9.0,4.25,1.15,4.25,1.15,0,196,Right-Handed,342.0,185,Right-Handed,448.0
2,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,10.0,57.0,1.36,3.8,1.3,3.09,0,185,Right-Handed,941.0,188,Right-Handed,679.0
3,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,93.0,80.0,2.55,1.5,2.35,1.47,0,183,Right-Handed,959.0,183,Right-Handed,682.0
4,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,104.0,53.0,1.5,2.4,1.5,2.27,1,196,Right-Handed,839.0,191,Right-Handed,128.0
5,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,25.0,150.0,1.45,2.75,1.43,2.53,1,175,Left-Handed,37.0,185,Right-Handed,684.0
6,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,24.0,50.0,1.33,3.2,1.31,2.92,1,191,Right-Handed,305.0,188,Left-Handed,391.0
7,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,37.0,80.0,1.6,2.38,1.54,2.22,1,193,Right-Handed,632.0,183,Right-Handed,682.0
9,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,57.0,89.0,1.4,2.75,1.38,2.62,1,188,Right-Handed,679.0,191,Left-Handed,633.0
10,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,104.0,26.0,1.65,2.1,1.65,1.98,1,196,Right-Handed,839.0,193,Right-Handed,960.0


Columns label encoding

In [140]:
# Collects one {value: code} dictionary per encoded column; Labelizer() appends to it.
label_dictionnary=[] 

In [142]:
def Labelizer(column):
    """Encode the values of a column as integer codes.

    The distinct values are sorted and numbered from 0 in that order. The
    resulting {value: code} mapping is appended to the module-level
    ``label_dictionnary`` list so it can be reused for the validation test.

    Parameters
    ----------
    column : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        The column with every value replaced by its code.
    """
    classes=list(column.unique())
    classes.sort()
    mapping=dict(zip(classes,range(len(classes))))
    label_dictionnary.append(mapping) #save label encoder for validation test
    return column.map(mapping)

In [481]:
# Label-encode every object-typed column; numeric columns are left untouched.
# Re-running this cell appends further dictionaries to label_dictionnary.
df_betting=df_betting.apply(lambda x: Labelizer(x) if x.dtype==object else x,axis=0)

## Model

### Multilayers perceptron

In [None]:
# Features: every column except the label and the Tournament/Round columns.
X=df_betting.drop(['Tournament','Round','P1Winner'],axis=1)
# Label: 1 when Player1 won the match.
y=df_betting['P1Winner']


# Two hidden ReLU layers followed by a sigmoid output for binary classification.
# The input size comes from X (the original code referenced an undefined `z`,
# which fails on a fresh kernel).
Ml_model=tf.keras.Sequential([
    tf.keras.layers.Dense(16,input_dim=X.shape[1],activation="relu"),
    tf.keras.layers.Dense(8,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid"),
])

#Compile the model

Ml_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train the model

history=Ml_model.fit(X,y,epochs=200, batch_size=250,verbose=0)