In [574]:
import pandas as pd
import numpy as np
import seaborn as sn
import tensorflow as tf
import matplotlib.pyplot as plt
from math import log
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn import neighbors, metrics

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException 
import re
from datetime import date

import pickle


## Betting odds Dataframe

Source: <a href="http://tennis-data.co.uk/alldata.php" target="_blank">http://tennis-data.co.uk/alldata.php</a>

In [265]:
# Download one results workbook per season (2002..2020) and concatenate them once.
# Growing a DataFrame with pd.concat inside the loop re-copies every previously
# loaded row at each iteration; collecting the frames first avoids that.
yearly_frames=[]
for year in range(2002,2021):
    url="http://tennis-data.co.uk/%d/%d.xlsx" %(year,year)
    yearly_frames.append(pd.read_excel(url))
df_betting=pd.concat(yearly_frames,ignore_index=True)

In [266]:
#Correction for names not respecting the syntax (applied to both player columns)
name_fixes={"Kwon S.W.":"Kwon S."}
for side in ("Winner","Loser"):
    df_betting[side]=df_betting[side].replace(name_fixes)

Create a dataframe with ATP matches since 2002

In [148]:
def checkName(Name):
    """Normalise the dot syntax of a player name.

    Double dots are collapsed into one, the space following a dot is removed
    ("J. I." -> "J.I.") and a final dot is appended when it is missing.

    Parameters
    ----------
    Name : str
        Player name as written in the source data.

    Returns
    -------
    str
        The normalised name. An empty string is returned unchanged (the
        previous implementation raised IndexError on it).
    """
    if not Name:
        return Name
    # str.replace is a no-op when the pattern is absent, so no membership test is needed
    Name=Name.replace('..','.').replace('. ','.')
    if not Name.endswith('.'):
        Name=Name+'.'
    return Name

In [149]:
#Normalised copies of the two player columns, so that each player keeps a single spelling
df_betting_players=pd.DataFrame()
for side in ("Winner","Loser"):
    df_betting_players[side]=df_betting[side].apply(checkName)
for side in ("Winner","Loser"):
    #Everything before the last space-separated token is kept as the family name
    df_betting_players[side+"FName"]=df_betting_players[side].apply(lambda full: " ".join(full.split(" ")[:-1]))

In [150]:
df_betting_players.head()

Unnamed: 0,Winner,Loser,WinnerFName,LoserFName
0,Arazi H.,Kratochvil M.,Arazi,Kratochvil
1,Carlsen K.,Black W.,Carlsen,Black
2,Chela J.I.,Delgado R.,Chela,Delgado
3,Draper S.,Sa A.,Draper,Sa
4,Enqvist T.,Rios M.,Enqvist,Rios


## Players Dataframe

Source: <a href="https://www.kaggle.com/romanzdk/atp-players-overviews" target="_blank">https://www.kaggle.com/romanzdk/atp-players-overviews/</a>

In [151]:
df_players=pd.read_csv('player_overviews.csv')
#Remove the columns at positions 4..16, then whichever column is last afterwards
df_players=df_players.drop(columns=df_players.columns[4:17])
df_players=df_players.drop(columns=df_players.columns[-1])
#Short names for the six remaining columns
df_players.columns=[
    'AtpId',
    'AtpName',
    'FirstName',
    'Name',
    'Height',
    'Handedness',
]
df_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,
1,a005,nelson-aerts,Nelson,Aerts,188.0,
2,a004,egan-adams,Egan,Adams,178.0,
3,a006,ronald-agenor,Ronald,Agenor,180.0,
4,a007,juan-aguilera,Juan,Aguilera,183.0,


### Change Name syntax to be similar to betting dataframe

For a future merge with the betting dataframe, we need to apply the same syntax to the Name column

In [152]:
def PlayerName(Name,FirstName):
    """Build a betting-style player name: family name followed by initials.

    A hyphenated first name is split on "-", otherwise a first name containing
    a space is split on whitespace; in both cases the initials of the first two
    parts are used ("Daniel Elahi" -> "D.E."). A simple first name gives a
    single initial. The result is title-cased.

    Parameters
    ----------
    Name : str
        Family name.
    FirstName : str
        First name(s).

    Returns
    -------
    str
        e.g. "Galan D.E.". When FirstName holds no usable part, the title-cased
        family name alone is returned.
    """
    if "-" in FirstName:
        # drop empty pieces produced by a leading/trailing hyphen ("Jean-")
        parts=[piece for piece in FirstName.split("-") if piece]
        separator=' '
    elif " " in FirstName:
        parts=FirstName.split()
        separator=' '
    else:
        parts=[FirstName] if FirstName else []
        # a family name already ending with a dot is glued to the initial, as before
        separator='' if Name.endswith('.') else ' '

    if not parts:
        return Name.title()

    # Always take the first letter of each part: the former single-part branch
    # appended the whole first name ("Juan " -> "Ayala Juan.").
    initials=''.join(piece[0]+'.' for piece in parts[:2])
    return (Name+separator+initials).title()

In [153]:
df_players['NewName']=df_players.apply(lambda x: PlayerName(x["Name"],x["FirstName"]), axis=1)

Some players share the same last name and the same first-name initial. We have to distinguish them.

### Update players dataframe

The dataset is not up to date and some current players are missing. The idea is to scrape data from the https://www.atptour.com/ website.

In [237]:
#Player information is accessible through each player's ATP overview webpage.
#To get this page we need the name and the id given by the website.
#A first scrape of https://www.atptour.com/en/rankings/singles/ gives the URL of each overview page.
#Then, for each player, we scrape the first name and last name from their overview webpage and build a dictionary.

def _atp_soup(url, user_agent):
    """Download ``url`` with the given User-Agent header and parse it with html.parser."""
    req = Request(url, headers={'User-Agent': user_agent})
    return BeautifulSoup(urlopen(req).read(), 'html.parser')


def _atp_text(soup, class_name):
    """Text of the first element carrying ``class_name``, or NaN when there is none."""
    node = soup.find(attrs={'class': class_name})
    return np.nan if node is None else node.get_text()


def _atp_photo(soup):
    """Absolute URL of the player photo (hero image first, small headshot second), or NaN."""
    for class_name in ('player-profile-hero-image', 'small-headshot'):
        container = soup.find(attrs={'class': class_name})
        if container is None:
            continue
        images = container.select('img')
        # a container without <img>/src falls through to the next candidate instead of raising
        if images and images[0].get('src'):
            return "https://www.atptour.com" + images[0]['src']
    return np.nan


def AtpPlayers():
    """Scrape the players listed on the latest ATP singles ranking page.

    The ranking page is requested for the range 0-900 at the most recent
    ranking date; each player's overview page is then downloaded to read the
    first name, last name and photo URL.

    Returns
    -------
    dict
        Lists of equal length under the keys 'AtpId', 'AtpName', 'FirstName',
        'Name', 'ActualRanking' and 'Photo'. 'ActualRanking' is the 1-based
        position of the player's link on the ranking page. Fields missing from
        an overview page are NaN.

    Notes
    -----
    At most 900 players are processed; if the page lists fewer links, only
    those are returned (the previous version raised IndexError).
    """
    AtpIdList=[]
    AtpNameList=[]
    FirstNameList=[]
    LastNameList=[]
    ActualRankingList=[]
    UrlPhotoList=[]

    ##Find last ranking update date
    ranking_home = _atp_soup('https://www.atptour.com/en/rankings/singles', '')
    update_date = ranking_home.find(attrs={'class':'dropdown-default-label'}).get_text()
    update_date = update_date.replace('.','-').strip() #clean string

    ##Access the ranking with the first 900 players
    ranking_url = 'https://www.atptour.com/en/rankings/singles/?rankDate='+update_date+'&countryCode=all&rankRange=0-900'
    ranking_page = _atp_soup(ranking_url, '')

    # Run the CSS selection once instead of once per player
    player_links = ranking_page.select('.player-cell a')[:900]

    for rank, link in enumerate(player_links, start=1):
        ##Access the player overview and scrape their AtpId, AtpName, FirstName and Name
        url_overview = 'https://www.atptour.com' + link['href']
        #AtpName and AtpId are present in the href
        url_parts = url_overview.split("/")
        AtpName = url_parts[5]
        AtpId = url_parts[6]

        overview = _atp_soup(url_overview, 'Mozilla/5.0')

        FirstName = _atp_text(overview, 'first-name')
        Name = _atp_text(overview, 'last-name')
        if isinstance(Name, str):
            Name = Name.title()

        AtpIdList.append(AtpId)
        AtpNameList.append(AtpName)
        FirstNameList.append(FirstName)
        LastNameList.append(Name)
        ActualRankingList.append(rank) #create an Actual ranking list useful for validation process
        UrlPhotoList.append(_atp_photo(overview))

    # Built once, after the loop
    return {'AtpId':AtpIdList,'AtpName':AtpNameList,'FirstName':FirstNameList,'Name':LastNameList,'ActualRanking':ActualRankingList,"Photo":UrlPhotoList}

In [238]:
#Scrape the current ranking and turn the resulting dictionary into a dataframe
df_AtpPlayers=pd.DataFrame(AtpPlayers())
#Same "Lastname I." syntax as the other player tables
df_AtpPlayers['NewName']=df_AtpPlayers.apply(lambda row: PlayerName(row["Name"],row["FirstName"]), axis=1)
#df_AtpPlayers=df_AtpPlayers.drop(["FirstName"],axis=1) #column not needed anymore

In [239]:
# Stack the scraped players under the original ones; when an AtpId appears
# more than once, the last row (the scraped one) is kept.
df_total_players=(
    pd.concat([df_players, df_AtpPlayers], ignore_index=True)
    .drop_duplicates(subset="AtpId", keep="last")
)
print(df_total_players.shape)
df_total_players.head()

(11345, 9)


Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking,Photo
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,,Abdullahi S.,,
1,a005,nelson-aerts,Nelson,Aerts,188.0,,Aerts N.,,
2,a004,egan-adams,Egan,Adams,178.0,,Adams E.,,
3,a006,ronald-agenor,Ronald,Agenor,180.0,,Agenor R.,,
4,a007,juan-aguilera,Juan,Aguilera,183.0,,Aguilera J.,,


In [240]:
#df_total_players=df_total_players.sort_values(["Name","FirstName"]).reset_index(drop=True)

In [241]:
#df_total_players["Name"]=duplicatedNames(df_total_players)

In [242]:
#df_total_players=df_total_players.drop(["NewName"],axis=1) #column not needed anymore
#df_total_players.head()

In [243]:
print(df_total_players.shape)

(11345, 9)


### Select only common players

In [245]:
#Keep the players whose family name appears among the winners or the losers of the betting data
name_is_winner=df_total_players["Name"].isin(df_betting_players["WinnerFName"])
name_is_loser=df_total_players["Name"].isin(df_betting_players["LoserFName"])
df_selected_players=df_total_players[name_is_winner|name_is_loser].reset_index(drop=True)

In [246]:
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking,Photo
0,a012,marc-albert,Marc,Albert,0.0,,Albert M.,,
1,a021,anand-amritraj,Anand,Amritraj,185.0,,Amritraj A.,,
2,a022,vijay-amritraj,Vijay,Amritraj,193.0,,Amritraj V.,,
3,a030,roberto-arguello,Roberto,Arguello,173.0,,Arguello R.,,
4,a051,juan-carlos-ayala,Juan Carlos,Ayala,,,Ayala J.C.,,


In [247]:
def SimilarNames(df):
    """Make the "NewName" values of ``df`` unique.

    Rows are visited in order. When a row's NewName was already used by an
    earlier row, it is replaced by "<Name> <prefix>." where the prefix is the
    first 1, 2, 3... letters of FirstName, until an unused value is found.
    If every prefix up to the full first name is taken, a number is appended
    ("<Name> <FirstName>.2", ".3", ...); the previous implementation looped
    forever in that situation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns "Name", "FirstName" and "NewName" and be indexed
        0..len(df)-1 (rows are read with ``df.loc[i, ...]``). Changed names are
        also written back into ``df``, as before.

    Returns
    -------
    list
        The unique names, one per row. Empty list for an empty frame.
    """
    if len(df)==0:
        return []

    column=[df.loc[0,"NewName"]]
    taken={column[0]}  # set for O(1) membership tests

    for i in range(1,len(df)):
        FirstName=df.loc[i,"FirstName"]
        Name=df.loc[i,"Name"]
        original=df.loc[i,"NewName"]

        candidate=original
        prefix_len=1
        suffix=2
        # max(..., 1) keeps the historical first attempt even for an empty first name
        longest_prefix=max(len(FirstName),1)
        while candidate in taken:
            if prefix_len<=longest_prefix:
                candidate=Name+' '+FirstName[0:prefix_len]+'.'
                prefix_len+=1
            else:
                # all prefixes exhausted: fall back to a numeric suffix
                candidate=Name+' '+FirstName+'.'+str(suffix)
                suffix+=1

        if candidate!=original:
            df.loc[i,"NewName"]=candidate
        column.append(candidate)
        taken.add(candidate)
    return column

In [248]:
def UniquePlayers(df_selected_players, df_betting_players):
    """Resolve players of ``df_selected_players`` that share the same "NewName".

    For each player, the rows of ``df_selected_players`` whose NewName contains
    that player's NewName are counted, as are the distinct Winner/Loser
    spellings of ``df_betting_players`` containing it. When the betting data
    knows exactly one such spelling but several players match, only the
    matching players with an 'ActualRanking' value are kept (the player itself
    is kept if none has one). Otherwise the player is kept as is.

    The kept rows are de-duplicated on "AtpId", sorted by Name/FirstName,
    re-indexed from 0, and SimilarNames is applied to make NewName unique.

    Parameters
    ----------
    df_selected_players : pandas.DataFrame
        Needs at least "AtpId", "Name", "FirstName", "NewName", "ActualRanking".
    df_betting_players : pandas.DataFrame
        Needs the "Winner" and "Loser" columns.

    Returns
    -------
    pandas.DataFrame
        The filtered, sorted players with unique "NewName" values.
    """
    # Distinct spellings used in the betting data. Computed once: it does not
    # depend on the player being examined. pd.concat replaces Series.append,
    # which no longer exists in pandas 2.x.
    betting_names=pd.Series(
        pd.concat(
            [df_betting_players["Winner"],df_betting_players["Loser"]],
            ignore_index=True,
        ).dropna().unique()
    )

    kept_labels=[]
    for label in df_selected_players.index:
        new_name=df_selected_players.loc[label,"NewName"]

        # regex=False: the name is matched literally ('.' is not a wildcard)
        occurrence=df_selected_players[df_selected_players["NewName"].str.contains(new_name,regex=False)]
        betting_unique=betting_names[betting_names.str.contains(new_name,regex=False)]

        if len(betting_unique)==1 and occurrence.shape[0]!=1:
            # one betting spelling for several candidate players
            currentPlayers=occurrence[occurrence['ActualRanking'].notna()]
            if len(currentPlayers)!=0:
                kept_labels.extend(currentPlayers.index) # only rows with an ActualRanking value
            else:
                kept_labels.append(label)
        else:
            kept_labels.append(label)

    # Select by label so the column dtypes of the input are preserved
    newdf=df_selected_players.loc[kept_labels]
    newdf=newdf.drop_duplicates(subset="AtpId",keep="last") # drop duplicated Atp players
    newdf=newdf.sort_values(["Name","FirstName"]).reset_index(drop=True) # sort players by their Name and firstname
    if len(newdf)>0:
        newdf["NewName"]=SimilarNames(newdf) # apply SimilarNames function

    return newdf

In [249]:
df_selected_players=UniquePlayers(df_selected_players,df_betting_players)

In [444]:
df_selected_players[df_selected_players["NewName"].str.contains("Galan")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking,Photo,Id
567,ge33,daniel-elahi-galan,Daniel Elahi,Galan,191.0,Right-Handed,Galan D.E.,114.0,https://www.atptour.com/-/media/tennis/players...,567


In [443]:
df_betting_players[df_betting_players["LoserFName"].str.contains("Galan")]

Unnamed: 0,Winner,Loser,WinnerFName,LoserFName
37563,Karlovic I.,Galan D.E.,Karlovic,Galan
45594,Polansky P.,Galan D.E.,Polansky,Galan
47197,Ruud C.,Galan D.E.,Ruud,Galan
48114,Travaglia S.,Galan D.E.,Travaglia,Galan
49119,Tabilo A.,Galan D.E.,Tabilo,Galan
49371,Paul T.,Galan D.E.,Paul,Galan
49939,Djokovic N.,Galan D.,Djokovic,Galan


In [252]:
df_total_players[df_total_players["NewName"].str.contains("Minaur")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking,Photo
10935,dh58,alex-de-minaur,Alex,De Minaur,,,De Minaur A.,25.0,https://www.atptour.com/-/media/tennis/players...


In [253]:
df_players[df_players["NewName"].str.contains("Andujar")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName
155,a596,pablo-andujar,Pablo,Andujar,180.0,Right-Handed,Andujar P.


In [254]:
print(df_selected_players.shape)

(1961, 9)


Height, Handedness and Backhand are interesting criteria to analyse, but we saw in the model prediction notebook that Backhand had too many 'NaN' values even after scraping. Using the AtpId and AtpName, we scrape the missing Height and Handedness values from the website https://www.atptour.com/

In [255]:
def scraping(atpId,atpName,height,handedness):
    """Fill a player's missing height / handedness from their ATP overview page.

    Parameters
    ----------
    atpId, atpName : str
        Identifiers used to build the overview URL.
    height : float
        Known height; NaN or 0.0 means "missing".
    handedness : str or float
        Known handedness; NaN means "missing".

    Returns
    -------
    tuple
        ``(height, handedness)``. Values that were already known are always
        returned unchanged. A missing value that cannot be scraped is NaN.
        A scraped height is returned as a float.

    Notes
    -----
    The page is only downloaded when at least one value is missing. A download
    or parsing problem no longer erases the values that were passed in.
    """
    need_height=(height!=height) or (height==0.0) #NaN or 0
    need_handedness=handedness!=handedness #NaN
    if not (need_height or need_handedness):
        return height,handedness

    try:
        url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
        req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = BeautifulSoup(webpage, 'html.parser')
    except Exception:
        # best effort: page unreachable, keep what we know and mark the rest as missing
        return (np.nan if need_height else height),(np.nan if need_handedness else handedness)

    if need_height:
        height=np.nan
        node = soup.find( attrs={'class': 'table-height-cm-wrapper'})
        if node is not None:
            text=node.get_text().replace('(','').replace(')','').replace('cm','')
            try:
                value=float(text)
            except ValueError:
                value=np.nan
            if value!=0: # the site reports 0 for an unknown height
                height=value

    if need_handedness:
        handedness=np.nan
        cells= soup.find_all( attrs={'class':"table-value"})
        if len(cells)>2: # index 2 is read below (the former guard was len>=2)
            text=cells[2].get_text()
            if len(text)>1:
                text=text.split(", ")[0] # keep the part before the first ", "
            text=text.replace("\r","").replace("\n","")
            if text!='':
                handedness=text

    return height,handedness

In [256]:
def UpdateRanking(atpId,atpName):
    """Read a player's ranking from their ATP overview page.

    Parameters
    ----------
    atpId, atpName : str
        Identifiers used to build the overview URL.

    Returns
    -------
    str or float
        The text of the first 'data-number' element when it parses as an
        integer (returned as scraped, not converted), otherwise NaN
        (e.g. inactive players or a missing element).
    """
    url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
    req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')

    # Explicit "element not found" check instead of a bare except
    node = soup.find( attrs={'class': 'data-number'})
    if node is None:
        return np.nan

    Ranking = node.get_text()
    try:
        int(Ranking)
    except ValueError: #For inactive players
        return np.nan
    return Ranking

In [589]:
print(df_selected_players.dtypes)

AtpId             object
AtpName           object
FirstName         object
Name              object
Height           float64
Handedness        object
NewName           object
ActualRanking    float64
Photo             object
Id                 int64
dtype: object


In [258]:
#Complete Height and Handedness row by row; scraping returns one (height, handedness) pair per player
scraped_attributes=df_selected_players.apply(
    lambda row: scraping(row['AtpId'],row['AtpName'],row['Height'],row['Handedness']),
    axis=1,
    result_type="expand",
)
df_selected_players[['Height','Handedness']]=scraped_attributes
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking,Photo
0,a618,mohammed-abdulla,Mohammed,Abdulla,,,Abdulla M.,,
1,a391,maximilian-abel,Maximilian,Abel,,Right-Handed,Abel M.,,
2,a389,jose-acasuso,Jose,Acasuso,191.0,Right-Handed,Acasuso J.,,
3,a305,jacob-adaktusson,Jacob,Adaktusson,193.0,Right-Handed,Adaktusson J.,,
4,a310,emin-agaev,Emin,Agaev,,Right-Handed,Agaev E.,,


In [259]:
df_selected_players["Height"] = df_selected_players.Height.astype(float) # convert to float

In [260]:
df_selected_players['Id']=df_selected_players.index

In [261]:
df_copy=df_selected_players.copy()

In [262]:
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking,Photo,Id
0,a618,mohammed-abdulla,Mohammed,Abdulla,,,Abdulla M.,,,0
1,a391,maximilian-abel,Maximilian,Abel,,Right-Handed,Abel M.,,,1
2,a389,jose-acasuso,Jose,Acasuso,191.0,Right-Handed,Acasuso J.,,,2
3,a305,jacob-adaktusson,Jacob,Adaktusson,193.0,Right-Handed,Adaktusson J.,,,3
4,a310,emin-agaev,Emin,Agaev,,Right-Handed,Agaev E.,,,4


In [576]:
df_selected_players.to_pickle("webapp/static/dataframes/df_selected_players.pkl")

## Model Dataframe

### Clean Dataframe

We can notice that the betting brokers are not the same from year to year, so it is better to use the Min/Max and Average of the betting odds.

In [267]:
#Fill NaN values: a missing Max/Avg odd is derived from the individual bookmakers' odds
bookmakers=['B365', 'CB', 'GB', 'IW','SB', 'B&W', 'EX', 'PS', 'UB', 'LB', 'SJ']
for stat,aggregate in (('Max',pd.DataFrame.max),('Avg',pd.DataFrame.mean)):
    for side in ('W','L'):
        broker_odds=df_betting[[name+side for name in bookmakers]]
        df_betting[stat+side]=df_betting[stat+side].fillna(round(aggregate(broker_odds,axis=1),2))

In [268]:
# drop useless data
has_odds=df_betting['MaxW'].notna() & df_betting['MaxL'].notna()
df_betting=df_betting[has_odds] # select rows where betting odds are present
df_betting=df_betting[df_betting["Comment"]=="Completed"] # keep only completed matches

bookmaker_cols=['B365W', 'CBW', 'GBW', 'IWW','SBW', 'B&WW', 'EXW', 'PSW', 'UBW', 'LBW', 'SJW','B365L', 'CBL', 'GBL', 'IWL','SBL', 'B&WL', 'EXL', 'PSL', 'UBL', 'LBL', 'SJL']
score_cols=['W1','L1','W2','L2','W3','L3','W4','L4','W5','L5','Wsets','Lsets','WPts','LPts']
df_betting=df_betting.drop(columns=['Comment','Date','ATP','Best of']+bookmaker_cols+score_cols) # drop useless columns

is_unranked=(df_betting["WRank"]=='NR') | (df_betting["LRank"]=='NR')
df_betting=df_betting[~is_unranked] # drop matches with non-ranked players
df_betting=df_betting.dropna()
# No more 'NR' values on either side, so both rank columns can be numeric
# (previously only LRank was converted).
df_betting=df_betting.astype({"WRank":float,"LRank":float})

### Preprocessing data

In [269]:
#Keep the same syntax for player names in order to have a unique ID for each player
for side in ("Winner","Loser"):
    df_betting[side]=df_betting[side].apply(checkName)

In [270]:
#Some Series have been renamed over years
# Assign the result back instead of calling replace(inplace=True) on a selected
# column: the in-place form acts on an intermediate object and does not update
# df_betting under pandas copy-on-write.
# NOTE(review): "Masters" is mapped to the same label as "Masters Cup"; confirm this is
# intended rather than "Masters 1000" (the label TournamentsData assigns to 1000 events).
df_betting['Series']=df_betting['Series'].replace({"International Gold":"ATP500","Masters Cup":"ATP Finals","Masters":"ATP Finals"})
#Delete trailing white space
df_betting['Location']=df_betting['Location'].str.rstrip()

In [271]:
#Rename columns: Winner -> Player1 & Loser -> Player2 (ranks and odds follow the same scheme)
player_column_names={
    'Winner':'Player1',
    'Loser':'Player2',
    'WRank':'P1Rank',
    'LRank':'P2Rank',
    'MaxW':'MaxP1',
    'MaxL':'MaxP2',
    'AvgW':'AvgP1',
    'AvgL':'AvgP2',
}
df_betting=df_betting.rename(columns=player_column_names)
#Label to predict: 1 when Player1 won (true for every row at this stage)
df_betting['P1Winner']=1

In the current format all winners are Player 1, which would be a problem for the model.
So, for a random half of the rows, the Player 1 and Player 2 data are switched.

In [272]:
# Swap Player1/Player2 on a random half of the rows so the label is not constant.
# The sample is seeded so that re-running the notebook rebuilds the same training set.
SWAP_SEED=42
mask=df_betting.sample(frac=.5,random_state=SWAP_SEED).index
df_betting.loc[mask,['Player1','Player2','P1Rank','P2Rank','MaxP1','MaxP2','AvgP1','AvgP2']] = df_betting.loc[mask,['Player2','Player1','P2Rank','P1Rank','MaxP2','MaxP1','AvgP2','AvgP1']].values
df_betting.loc[mask,'P1Winner']=0 # on swapped rows Player1 is the loser

In [273]:
#Attach the player attributes twice: first keyed on Player1, then on Player2.
#Presumably column names only collide in the second merge, where the left (Player1)
#columns receive the 'P1' suffix and the right (Player2) columns the 'P2' suffix.
df_betting=df_betting.merge(
    df_selected_players,
    left_on='Player1',
    right_on="NewName",
    how='left',
    suffixes=['P2','P1'],
)
df_betting=df_betting.merge(
    df_selected_players,
    left_on='Player2',
    right_on="NewName",
    how='left',
    suffixes=['P1','P2'],
)

In [274]:
#Columns that were only needed for the merge or that the model does not use; the player Id is kept
unused_columns=['MaxP1','MaxP2','NewNameP1','NewNameP2','Tournament','Round','FirstNameP1','FirstNameP2','NameP1','NameP2',"AtpIdP1","AtpIdP2","ActualRankingP1","ActualRankingP2","AtpNameP1","AtpNameP2","Player1","Player2","PhotoP1","PhotoP2"]
df_betting=(
    df_betting
    .drop(columns=unused_columns)
    .dropna()            # delete rows with NaN values
    .drop_duplicates()   # delete duplicated rows
)

In [275]:
#Both height columns as float
df_betting=df_betting.astype({"HeightP1":float,"HeightP2":float})

In [276]:
#move column P1Winner at the end 
df_betting=df_betting[['Location', 'Series', 'Court', 'Surface', 'P1Rank', 'P2Rank', 'AvgP1', 'AvgP2','HeightP1', 'HandednessP1', 'IdP1', 'HeightP2', 'HandednessP2', 'IdP2','P1Winner']]
df_betting.head()

Unnamed: 0,Location,Series,Court,Surface,P1Rank,P2Rank,AvgP1,AvgP2,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2,P1Winner
0,Adelaide,International,Outdoor,Hard,25.0,46.0,1.55,2.17,175.0,Left-Handed,65.0,185.0,Right-Handed,953.0,1
1,Adelaide,International,Outdoor,Hard,9.0,309.0,1.15,4.25,185.0,Right-Handed,730.0,196.0,Right-Handed,559.0,1
2,Adelaide,International,Outdoor,Hard,10.0,57.0,1.3,3.09,185.0,Right-Handed,1509.0,188.0,Right-Handed,1102.0,0
3,Adelaide,International,Outdoor,Hard,93.0,80.0,2.35,1.47,183.0,Right-Handed,1531.0,183.0,Right-Handed,1113.0,0
4,Adelaide,International,Outdoor,Hard,53.0,104.0,2.27,1.5,191.0,Right-Handed,203.0,196.0,Right-Handed,1335.0,0


Columns label encoding

In [277]:
label_dictionnary=[] 

In [278]:
def Labelizer(column):
    """Integer-encode a column and remember the encoding.

    The distinct values of ``column`` are sorted and numbered from 0. The
    value -> code dictionary is appended to the module-level
    ``label_dictionnary`` list (one entry per call) and the mapped column is
    returned.
    """
    ordered_values=list(column.unique())
    ordered_values.sort()
    mapping=dict(zip(ordered_values,range(len(ordered_values))))
    label_dictionnary.append(mapping) #save label encoder for validation test
    return column.map(mapping)

In [279]:
df_betting=df_betting.apply(lambda x: Labelizer(x) if x.dtype==object else x,axis=0)

In [583]:
#save labelizer dictionnary
# "with" closes the file even if pickle.dump raises
with open("webapp/static/dataframes/label_dictionnary.pkl","wb") as f:
    pickle.dump(label_dictionnary,f)


In [588]:
print(len(label_dictionnary))

6


In [280]:
df_betting.head()

Unnamed: 0,Location,Series,Court,Surface,P1Rank,P2Rank,AvgP1,AvgP2,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2,P1Winner
0,2,4,1,3,25.0,46.0,1.55,2.17,175.0,0,65.0,185.0,1,953.0,1
1,2,4,1,3,9.0,309.0,1.15,4.25,185.0,1,730.0,196.0,1,559.0,1
2,2,4,1,3,10.0,57.0,1.3,3.09,185.0,1,1509.0,188.0,1,1102.0,0
3,2,4,1,3,93.0,80.0,2.35,1.47,183.0,1,1531.0,183.0,1,1113.0,0
4,2,4,1,3,53.0,104.0,2.27,1.5,191.0,1,203.0,196.0,1,1335.0,0


In [281]:
print(label_dictionnary)

[{"'s-Hertogenbosch": 0, 'Acapulco': 1, 'Adelaide': 2, 'Amersfoort': 3, 'Antalya': 4, 'Antwerp': 5, 'Atlanta': 6, 'Auckland': 7, 'Bangkok': 8, 'Barcelona': 9, 'Basel': 10, 'Bastad': 11, 'Beijing': 12, 'Belgrade': 13, 'Bogota': 14, 'Brisbane': 15, 'Bucharest': 16, 'Budapest': 17, 'Buenos Aires': 18, 'Casablanca': 19, 'Chengdu': 20, 'Chennai': 21, 'Cincinnati': 22, 'Cologne': 23, 'Copenhagen': 24, 'Cordoba': 25, 'Costa Do Sauipe': 26, 'Delray Beach': 27, 'Doha': 28, 'Dubai': 29, 'Dusseldorf': 30, 'Eastbourne': 31, 'Estoril': 32, 'Geneva': 33, 'Gstaad': 34, 'Halle': 35, 'Hamburg': 36, 'Ho Chi Min City': 37, 'Hong Kong': 38, 'Houston': 39, 'Indian Wells': 40, 'Indianapolis': 41, 'Istanbul': 42, 'Johannesburg': 43, 'Kitzbuhel': 44, 'Kuala Lumpur': 45, 'Las Vegas': 46, 'London': 47, 'Long Island': 48, 'Los Angeles': 49, 'Los Cabos': 50, 'Lyon': 51, 'Madrid': 52, 'Mallorca': 53, 'Marrakech': 54, 'Marseille': 55, 'Melbourne': 56, 'Memphis': 57, 'Metz': 58, 'Miami': 59, 'Milan': 60, 'Monte Carl

## Model

### Multilayers perceptron

In [569]:
#Features: every column except the label
X=df_betting.drop(columns=['P1Winner'])
y=df_betting['P1Winner']

#Feed-forward binary classifier: two hidden ReLU layers (16 and 8 units) and a sigmoid output
Ml_model=tf.keras.Sequential()
Ml_model.add(tf.keras.layers.Dense(16,input_dim=X.shape[1],activation="relu"))
Ml_model.add(tf.keras.layers.Dense(8,activation="relu"))
Ml_model.add(tf.keras.layers.Dense(1,activation="sigmoid"))

#Compile the model
Ml_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

#Train the model on the whole dataframe (silent)
history=Ml_model.fit(
    X,
    y,
    epochs=200,
    batch_size=250,
    verbose=0,
)

In [635]:
Ml_model.save("webapp/model")

INFO:tensorflow:Assets written to: webapp/model/assets


## Validation 

The idea is to scrape https://www.oddsportal.com/matches/tennis/ and find ATP matches of the day.

In [445]:
def PlayerSyntax(playerName):
    """Convert a scraped player name to the syntax used in the betting dataframe.

    Steps:
    1. ``checkName`` normalises the dots (and appends the final one).
    2. A hyphen that directly follows an uppercase letter is turned into a dot
       ("J-W." -> "J.W."); other hyphens are kept.
    3. A few known spellings are replaced by their betting-data equivalent.

    Parameters
    ----------
    playerName : str
        Name as scraped.

    Returns
    -------
    str
        The converted name.
    """
    playerName=checkName(playerName)#first syntax check for dots

    chars=list(playerName)
    # start at 1: position 0 has no preceding character
    for pos in range(1,len(chars)):
        if chars[pos]=="-" and chars[pos-1].isupper():
            chars[pos]="."
    playerName="".join(chars)

    # Known exceptions. Keys are written as they look AFTER checkName, i.e. with
    # the trailing dot: the former test against "Kwon Soonwoo" could never match.
    aliases={
        "Kwon Soonwoo.":"Kwon S.",
        "Ramos A.":"Ramos-Vinolas A.",
        "McDonald M.":"Mcdonald M.",
        "Galan Riveros D.E.":"Galan D.E.",
    }
    # Candidates left disabled by the author:
    #   "Andujar-Alba P." -> "Andujar P."
    #   "Carreno-Busta P." -> "Carreno Busta P."
    #   Harris G.

    return aliases.get(playerName,playerName)


In [630]:
def _parse_match_row(match, className):
    """Read one match row of a tournament table.

    Returns ``(players, odd1, odd2, winnerP1)`` where ``players`` is the raw
    "A - B" text, the odds are the scraped texts, and ``winnerP1`` is 1, 0 or
    NaN (not played, not finished, retired or cancelled).
    """
    if 'deactivate' in className: #for matches completed or currently playing (live matches)
        players=match.find_element_by_xpath('./td[2]/a').text
        if players=="": #happens for live matches
            players=match.find_element_by_xpath('./td[2]/a[2]').text

        #find odds
        odd1=match.find_element_by_xpath('./td[4]/a').text
        odd1_className=match.find_element_by_xpath('./td[4]').get_attribute("class")
        odd2=match.find_element_by_xpath('./td[5]/a').text
        odd2_className=match.find_element_by_xpath('./td[5]').get_attribute("class")

        #find winner of the match
        score=match.find_element_by_xpath('./td[3]').text
        if ("ret" in score) or ("canc" in score): #no winner for canceled or retirement matches
            winnerP1=np.nan
        elif ("result-ok" not in odd1_className) and ("result-ok" not in odd2_className): #match not finished
            winnerP1=np.nan
        elif int(score[0])<int(score[2]):
            winnerP1=0
        else:
            winnerP1=1
    else: # for matches not played yet
        players=match.find_element_by_xpath('./td[2]/a[not(contains(@class,"ico-tv-tournament"))]').text
        odd1=match.find_element_by_xpath('./td[3]/a').text
        odd2=match.find_element_by_xpath('./td[4]/a').text
        winnerP1=np.nan
    return players,odd1,odd2,winnerP1


def DayMatches(driver_path=r'/Users/pierremecchia/Desktop/chromedriver'):
    """Scrape location, players, odds and winner of today's ATP singles matches.

    The oddsportal tennis page is read to collect the links of ATP (non
    doubles) tournaments; each tournament page is then opened and the rows
    listed under a "Today" header are parsed.

    Parameters
    ----------
    driver_path : str, optional
        Path of the chromedriver executable. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns "Location", "Player1", "Player2", "AvgP1", "AvgP2" (odds as
        scraped text) and "P1Winner" (1, 0 or NaN). Rows that cannot be parsed
        are skipped.
    """
    url_odds="https://www.oddsportal.com/matches/tennis/"
    options = webdriver.ChromeOptions()
    options.add_argument('headless')

    # Row classes that never describe a match. The former test
    # `('dark' or 'nob-border' or 'table-dummyrow') in className` only checked 'dark'.
    non_match_classes=('dark','nob-border','table-dummyrow')

    ## find all ATP tournament and keep their link
    list_href=[]
    driver = webdriver.Chrome(executable_path=driver_path,options=options)
    try:
        driver.get(url_odds)
        sleep(2)
        table=driver.find_elements_by_xpath("//*[@id='table-matches']/table/tbody/tr[contains(@class,'dark')]")
        for row in table:
            if ("ATP" in row.text) and ("Doubles" not in row.text): #only interested in single atp matches
                href=row.find_element_by_xpath('./th[1]/a[2]').get_attribute('href')
                if href not in list_href:
                    list_href.append(href)
    finally:
        driver.quit() # always release the browser, even on error

    records=[]
    for href in list_href:
        driver = webdriver.Chrome(executable_path=driver_path,options=options)
        try:
            driver.get(href)
            sleep(2)
            ##find location of the atp tournament
            h1=driver.find_element_by_xpath('//*[@id="col-content"]/h1').text

            #clean location string
            location=h1.replace('Betting Odds','')
            location=location.replace('ATP ','')
            location=re.sub(r'\([^)]*\)', '', location)#delete parenthesised text
            location=location.strip() #delete white space at the beginning and end of string

            matches=driver.find_elements_by_xpath("//*[@id='tournamentTable']/tbody/tr") #List of matches in the tournament
            doScrap=False
            for match in matches:
                try:
                    className=match.get_attribute("class") or ""
                    if className=='center nob-border': #day header: scrape only the rows that follow "Today"
                        day_label=match.find_element_by_xpath('./th[1]/span').text
                        doScrap='Today' in day_label
                    if not doScrap or any(tag in className for tag in non_match_classes):
                        continue

                    try:
                        players,odd1,odd2,winnerP1=_parse_match_row(match,className)

                        #Clean players names
                        names=players.split(" - ")
                        player1=PlayerSyntax(names[0])
                        player2=PlayerSyntax(names[1])
                    except Exception:
                        # best effort: skip rows that do not have the expected layout
                        continue

                    records.append({"Location":location,"Player1":player1,"Player2":player2,"AvgP1":odd1,"AvgP2":odd2,"P1Winner":winnerP1})
                except NoSuchElementException:
                    continue
        finally:
            driver.quit()

    #convert the collected rows into a pandas dataframe (columns fixed so an empty result keeps its schema)
    df_matchs=pd.DataFrame(records,columns=["Location","Player1","Player2","AvgP1","AvgP2","P1Winner"])

    return df_matchs
            
    

In [631]:
def TournamentsData(locations_list, driver_path=r'/Users/pierremecchia/Desktop/chromedriver'):
    """Scrape series, court and surface of the tournaments listed in ``locations_list``.

    Opens the ATP tournaments page in Chrome, reads the rows of the first expanded
    accordion section (the tournaments of the current month) and keeps the rows
    whose town (text before the first comma of the location cell) is found in
    ``locations_list``.

    Parameters
    ----------
    locations_list : collection of str
        Town names to look for; membership is tested with ``in``.
    driver_path : str, optional
        Path of the chromedriver executable. Defaults to the path that used to be
        hard-coded, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``Location``, ``Series``, ``Court`` and ``Surface``, one row per
        matching tournament. ``Series`` is NaN when the category icon is missing or
        not recognised; ``Court`` and ``Surface`` are both NaN when the playground
        cell cannot be read or does not contain two space-separated words.
    """
    url_tourn="https://www.atptour.com/en/tournaments/"

    # (fragment of the category icon URL, series label), tested in this order
    series_by_icon=(
        ("250.png","ATP250"),
        ("500.png","ATP500"),
        ("1000.png","Masters 1000"),
        ("grandslam.png","Grand Slam"),
        ("finals.svg","ATP Finals"),
    )

    list_series=[]
    list_court=[]
    list_location=[]
    list_surface=[]

    driver = webdriver.Chrome(executable_path=driver_path)
    try:
        driver.get(url_tourn)
        sleep(4) # leave the page time to render before querying it

        #list all tournaments in the current month
        month_tournament=driver.find_elements_by_xpath("//*[@id='contentAccordionWrapper']/div[contains(@class,'expand')][1]/div[2]/div/table/tbody/tr")

        for tournament in month_tournament:
            try:
                #scrape location for each tournament in the list
                location=tournament.find_element_by_xpath('./td[2]/span[1]').text
            except Exception:
                # best effort: a row without a readable location is skipped
                continue

            town=location.split(",")[0]
            if town not in locations_list: #only tournaments played today are of interest
                continue

            try:
                #define the series of the tournament thanks to the image source
                icon_src=tournament.find_element_by_xpath('./td[1]/img').get_attribute("src")
                series=next((label for fragment,label in series_by_icon if fragment in icon_src),np.nan)
            except Exception:
                series=np.nan

            try:
                #scrape court and surface of the tournament ("<court> <surface>")
                playground=tournament.find_element_by_xpath('./td[3]/table/tbody/tr/td[2]/div/div').text.split(" ")
                court,surface=playground[0],playground[1]
            except Exception:
                court=np.nan
                surface=np.nan

            list_series.append(series)
            list_surface.append(surface)
            list_court.append(court)
            list_location.append(town)
    finally:
        # always release the browser, even when loading or querying the page fails
        driver.quit()

    loc_dict={"Location":list_location,"Series":list_series,"Court":list_court,"Surface":list_surface} #create a dictionary
    df_tournaments=pd.DataFrame(loc_dict) # convert dictionary into pandas dataframe

    return df_tournaments

In [632]:
df_matchs=DayMatches() #Scrape today's ATP matches: location, both players, their odds and the result when it is known

In [633]:
#(number of scraped matches, number of columns)
df_matchs.shape

(16, 6)

In [619]:
locations_list=df_matchs['Location'].unique() #distinct tournament locations among today's matches
df_tournaments=TournamentsData(locations_list) #scrape series, court and surface of those tournaments
df_tournaments.head()

Unnamed: 0,Location,Series,Court,Surface
0,Madrid,Masters 1000,Outdoor,Clay


In [592]:
#Attach the tournament attributes to every match. merge() defaults to an inner join,
#so matches whose Location is absent from df_tournaments are dropped here.
df_validation=df_matchs.merge(df_tournaments,on="Location")
df_validation.head(20)

Unnamed: 0,Location,Player1,Player2,AvgP1,AvgP2,P1Winner,Series,Court,Surface
0,Madrid,Hurkacz H.,Millman J.,1.44,2.81,0.0,Masters 1000,Outdoor,Clay
1,Madrid,Pella G.,Sinner J.,8.49,1.07,,Masters 1000,Outdoor,Clay
2,Madrid,Auger-Aliassime F.,Ruud C.,1.85,1.97,0.0,Masters 1000,Outdoor,Clay
3,Madrid,Herbert P.,Davidovich Fokina A.,2.38,1.59,0.0,Masters 1000,Outdoor,Clay
4,Madrid,Khachanov K.,Nishikori K.,2.02,1.8,0.0,Masters 1000,Outdoor,Clay
5,Madrid,Nishioka Y.,Krajinovic F.,3.86,1.29,1.0,Masters 1000,Outdoor,Clay
6,Madrid,Norrie C.,Krajinovic F.,2.72,1.46,,Masters 1000,Outdoor,Clay
7,Madrid,Paul T.,Rublev A.,5.9,1.14,,Masters 1000,Outdoor,Clay
8,Madrid,Popyrin Al.,Struff J.L.,2.11,1.73,,Masters 1000,Outdoor,Clay
9,Madrid,Bublik A.,Shapovalov D.,2.74,1.45,,Masters 1000,Outdoor,Clay


In [593]:
def PlayerNotFound(Player,df_selected_players):
    """Return the spelling of ``Player`` used in ``df_selected_players["NewName"]``.

    ``Player`` is expected as "<surname> <initials>" (the last space-separated
    token is treated as the first-name part). If the exact name is already known
    it is returned unchanged. Otherwise known names are searched for
    "<surname> <first-name prefix>", with a growing prefix, until exactly one
    known name matches. Three surname variants are tried in order: as given,
    with hyphens replaced by spaces, and the part before the first hyphen. Only
    the first variant that has any candidate is explored.

    Parameters
    ----------
    Player : str
        Scraped player name. Non-string values (e.g. NaN) are returned as is.
    df_selected_players : pandas.DataFrame
        Must contain a ``NewName`` column holding the reference spellings.

    Returns
    -------
    str
        The unique matching reference name, or ``Player`` itself when nothing
        matches or the match stays ambiguous.
    """
    known_names=df_selected_players["NewName"]

    # `Player in Series` would look at the index labels, so compare with the values.
    if not isinstance(Player,str) or known_names.eq(Player).any():
        return Player

    tokens=Player.split(" ")
    firstName=tokens[-1]
    Name=" ".join(tokens[0:-1])
    if not firstName: # empty name or trailing space: no initial to search with
        return Player

    def candidates(fragment):
        # Literal substring search: the "." of an initial must not act as a regex
        # wildcard, and missing reference names never match.
        return known_names[known_names.str.contains(fragment,regex=False,na=False)]

    for surname in (Name,Name.replace("-"," "),Name.split("-")[0]):
        if candidates(surname+" "+firstName[0]).empty:
            continue # no known player for this surname variant, try the next one

        # Lengthen the first-name prefix until a single known player remains.
        for end in range(1,len(firstName)+1):
            similarPlayer=candidates(surname+" "+firstName[0:end])
            if len(similarPlayer)==1:
                return similarPlayer.iloc[0] # the name itself, not a 1-row Series
        return Player # still ambiguous (or no match) with the full first-name part

    return Player

In [594]:
#Replace each scraped player name by the spelling PlayerNotFound resolves against df_selected_players["NewName"]
df_validation['Player1']=df_validation.apply(lambda x: PlayerNotFound(x["Player1"],df_selected_players),axis=1)
df_validation['Player2']=df_validation.apply(lambda x: PlayerNotFound(x["Player2"],df_selected_players),axis=1)
df_validation.head(20)

Unnamed: 0,Location,Player1,Player2,AvgP1,AvgP2,P1Winner,Series,Court,Surface
0,Madrid,Hurkacz H.,Millman J.,1.44,2.81,0.0,Masters 1000,Outdoor,Clay
1,Madrid,Pella G.,Sinner J.,8.49,1.07,,Masters 1000,Outdoor,Clay
2,Madrid,Auger-Aliassime F.,Ruud C.,1.85,1.97,0.0,Masters 1000,Outdoor,Clay
3,Madrid,Herbert P.H.,Davidovich Fokina A.,2.38,1.59,0.0,Masters 1000,Outdoor,Clay
4,Madrid,Khachanov K.,Nishikori K.,2.02,1.8,0.0,Masters 1000,Outdoor,Clay
5,Madrid,Nishioka Y.,Krajinovic F.,3.86,1.29,1.0,Masters 1000,Outdoor,Clay
6,Madrid,Norrie C.,Krajinovic F.,2.72,1.46,,Masters 1000,Outdoor,Clay
7,Madrid,Paul T.,Rublev A.,5.9,1.14,,Masters 1000,Outdoor,Clay
8,Madrid,Popyrin A.,Struff J.L.,2.11,1.73,,Masters 1000,Outdoor,Clay
9,Madrid,Bublik A.,Shapovalov D.,2.74,1.45,,Masters 1000,Outdoor,Clay


In [595]:
df_validation=df_validation.merge(df_selected_players,left_on='Player1',right_on="NewName", how='left',suffixes=['P2','P1']) #first merge: attributes of Player1 (not necessarily the winner); suffixes only apply to column names present on both sides
df_validation=df_validation.merge(df_selected_players,left_on='Player2',right_on="NewName", how='left',suffixes=['P1','P2'])#second merge: attributes of Player2; the columns added for Player1 get suffix P1, the new ones P2

In [596]:
# Player attributes brought in by the two merges that are not needed afterwards
unused_columns = [
    'FirstNameP1', 'FirstNameP2',
    'NameP1', 'NameP2',
    "AtpIdP1", "AtpIdP2",
    "AtpNameP1", "AtpNameP2",
]
df_validation = df_validation.drop(columns=unused_columns)

# Final column order: match context, rankings and odds, Player1 block, Player2 block, outcome
ordered_columns = [
    'Location', 'Player1', 'Player2',
    'Series', 'Court', 'Surface',
    'ActualRankingP1', 'ActualRankingP2',
    'AvgP1', 'AvgP2',
    'HeightP1', 'HandednessP1', 'NewNameP1', 'IdP1', 'PhotoP1',
    'HeightP2', 'HandednessP2', 'NewNameP2', 'IdP2', 'PhotoP2',
    'P1Winner',
]
df_validation = df_validation[ordered_columns]

In [597]:
#Preview the matches with tournament and player attributes attached
df_validation.head(20)

Unnamed: 0,Location,Player1,Player2,Series,Court,Surface,ActualRankingP1,ActualRankingP2,AvgP1,AvgP2,...,HandednessP1,NewNameP1,IdP1,PhotoP1,HeightP2,HandednessP2,NewNameP2,IdP2,PhotoP2,P1Winner
0,Madrid,Hurkacz H.,Millman J.,Masters 1000,Outdoor,Clay,16.0,43.0,1.44,2.81,...,Right-Handed,Hurkacz H.,764,https://www.atptour.com/-/media/tennis/players...,183.0,Right-Handed,Millman J.,1153,https://www.atptour.com/-/media/tennis/players...,0.0
1,Madrid,Pella G.,Sinner J.,Masters 1000,Outdoor,Clay,50.0,19.0,8.49,1.07,...,Left-Handed,Pella G.,1319,https://www.atptour.com/-/media/tennis/players...,188.0,Right-Handed,Sinner J.,1595,https://www.atptour.com/-/media/tennis/players...,
2,Madrid,Auger-Aliassime F.,Ruud C.,Masters 1000,Outdoor,Clay,20.0,24.0,1.85,1.97,...,Right-Handed,Auger-Aliassime F.,78,https://www.atptour.com/-/media/tennis/players...,183.0,Right-Handed,Ruud C.,1489,https://www.atptour.com/-/media/tennis/players...,0.0
3,Madrid,Herbert P.H.,Davidovich Fokina A.,Masters 1000,Outdoor,Clay,78.0,48.0,2.38,1.59,...,Right-Handed,Herbert P.H.,734,https://www.atptour.com/-/media/tennis/players...,183.0,Right-Handed,Davidovich Fokina A.,398,https://www.atptour.com/-/media/tennis/players...,0.0
4,Madrid,Khachanov K.,Nishikori K.,Masters 1000,Outdoor,Clay,23.0,39.0,2.02,1.8,...,Right-Handed,Khachanov K.,886,https://www.atptour.com/-/media/tennis/players...,178.0,Right-Handed,Nishikori K.,1256,https://www.atptour.com/-/media/tennis/players...,0.0
5,Madrid,Nishioka Y.,Krajinovic F.,Masters 1000,Outdoor,Clay,60.0,33.0,3.86,1.29,...,Left-Handed,Nishioka Y.,1257,https://www.atptour.com/-/media/tennis/players...,185.0,Right-Handed,Krajinovic F.,952,https://www.atptour.com/-/media/tennis/players...,1.0
6,Madrid,Norrie C.,Krajinovic F.,Masters 1000,Outdoor,Clay,58.0,33.0,2.72,1.46,...,Left-Handed,Norrie C.,1262,https://www.atptour.com/-/media/tennis/players...,185.0,Right-Handed,Krajinovic F.,952,https://www.atptour.com/-/media/tennis/players...,
7,Madrid,Paul T.,Rublev A.,Masters 1000,Outdoor,Clay,57.0,7.0,5.9,1.14,...,Right-Handed,Paul T.,1311,https://www.atptour.com/-/media/tennis/players...,188.0,Right-Handed,Rublev A.,1478,https://www.atptour.com/-/media/tennis/players...,
8,Madrid,Popyrin A.,Struff J.L.,Masters 1000,Outdoor,Clay,79.0,40.0,2.11,1.73,...,Right-Handed,Popyrin A.,1360,https://www.atptour.com/-/media/tennis/players...,193.0,Right-Handed,Struff J.L.,1687,https://www.atptour.com/-/media/tennis/players...,
9,Madrid,Bublik A.,Shapovalov D.,Masters 1000,Outdoor,Clay,42.0,14.0,2.74,1.45,...,Right-Handed,Bublik A.,266,https://www.atptour.com/-/media/tennis/players...,185.0,Left-Handed,Shapovalov D.,1557,https://www.atptour.com/-/media/tennis/players...,


In [598]:
#Keep a full copy (names, photo URLs) for the exported file, before those columns are dropped from the model input
df_export=df_validation.copy()
#The CSV itself is written from df_export at the end of the notebook

In [599]:
df_validation=df_validation.drop(["Player1","Player2","NewNameP1","NewNameP2","PhotoP1","PhotoP2"],axis=1) # names and photo URLs are dropped; the players stay identified by IdP1 and IdP2

In [600]:
# The scraped odds are cast to float, one column at a time
for odds_column in ('AvgP1', 'AvgP2'):
    df_validation[odds_column] = df_validation[odds_column].astype(float)

In [601]:
#Work on a copy so that df_validation is not modified by the filtering and encoding that follow
df_val_test=df_validation.copy()

In [602]:
#Keep only the matches where both players have an Id and a ranking.
#.copy() makes the result an independent frame: it is modified afterwards (label encoding,
#"Prediction" column), which on a filtered slice triggers SettingWithCopyWarning.
required_columns=['IdP1','IdP2','ActualRankingP1','ActualRankingP2']
df_val_test=df_val_test.dropna(subset=required_columns).copy()
df_val_test.head(15)

Unnamed: 0,Location,Series,Court,Surface,ActualRankingP1,ActualRankingP2,AvgP1,AvgP2,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2,P1Winner
0,Madrid,Masters 1000,Outdoor,Clay,16.0,43.0,1.44,2.81,196.0,Right-Handed,764,183.0,Right-Handed,1153,0.0
1,Madrid,Masters 1000,Outdoor,Clay,50.0,19.0,8.49,1.07,183.0,Left-Handed,1319,188.0,Right-Handed,1595,
2,Madrid,Masters 1000,Outdoor,Clay,20.0,24.0,1.85,1.97,193.0,Right-Handed,78,183.0,Right-Handed,1489,0.0
3,Madrid,Masters 1000,Outdoor,Clay,78.0,48.0,2.38,1.59,188.0,Right-Handed,734,183.0,Right-Handed,398,0.0
4,Madrid,Masters 1000,Outdoor,Clay,23.0,39.0,2.02,1.8,198.0,Right-Handed,886,178.0,Right-Handed,1256,0.0
5,Madrid,Masters 1000,Outdoor,Clay,60.0,33.0,3.86,1.29,170.0,Left-Handed,1257,185.0,Right-Handed,952,1.0
6,Madrid,Masters 1000,Outdoor,Clay,58.0,33.0,2.72,1.46,188.0,Left-Handed,1262,185.0,Right-Handed,952,
7,Madrid,Masters 1000,Outdoor,Clay,57.0,7.0,5.9,1.14,185.0,Right-Handed,1311,188.0,Right-Handed,1478,
8,Madrid,Masters 1000,Outdoor,Clay,79.0,40.0,2.11,1.73,196.0,Right-Handed,1360,193.0,Right-Handed,1687,
9,Madrid,Masters 1000,Outdoor,Clay,42.0,14.0,2.74,1.45,196.0,Right-Handed,266,185.0,Left-Handed,1557,


In [603]:
def Validation_Labelizer(df, label_maps=None):
    """Encode every object-dtype column of ``df`` with its label mapping.

    Columns are visited in order; the k-th object-dtype column is mapped with
    ``label_maps[k]``. The mappings must therefore be stored in the same order
    as the object columns appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    label_maps : indexable of mappings, optional
        One value -> code mapping per object column. Defaults to the
        module-level ``label_dictionnary`` (presumably built when the training
        data was encoded - verify against where it is defined).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the object columns replaced by their codes. Values
        missing from a mapping become NaN (``Series.map`` behaviour).
    """
    if label_maps is None:
        label_maps=label_dictionnary

    dict_idx=0
    for column in list(df.columns):
        if df[column].dtype==object:
            # Assign by label so the column is replaced and takes the dtype of the codes;
            # `df.iloc[:, i] = ...` writes into the existing object column on pandas >= 2.0.
            df[column]=df[column].map(label_maps[dict_idx])
            dict_idx+=1
    return df

In [604]:
#Encode the object-dtype columns of the model input with the mappings stored in label_dictionnary
df_val_test=Validation_Labelizer(df_val_test)
df_val_test.head(10)

Unnamed: 0,Location,Series,Court,Surface,ActualRankingP1,ActualRankingP2,AvgP1,AvgP2,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2,P1Winner
0,52,5,1,1,16.0,43.0,1.44,2.81,196.0,1,764,183.0,1,1153,0.0
1,52,5,1,1,50.0,19.0,8.49,1.07,183.0,0,1319,188.0,1,1595,
2,52,5,1,1,20.0,24.0,1.85,1.97,193.0,1,78,183.0,1,1489,0.0
3,52,5,1,1,78.0,48.0,2.38,1.59,188.0,1,734,183.0,1,398,0.0
4,52,5,1,1,23.0,39.0,2.02,1.8,198.0,1,886,178.0,1,1256,0.0
5,52,5,1,1,60.0,33.0,3.86,1.29,170.0,0,1257,185.0,1,952,1.0
6,52,5,1,1,58.0,33.0,2.72,1.46,188.0,0,1262,185.0,1,952,
7,52,5,1,1,57.0,7.0,5.9,1.14,185.0,1,1311,188.0,1,1478,
8,52,5,1,1,79.0,40.0,2.11,1.73,196.0,1,1360,193.0,1,1687,
9,52,5,1,1,42.0,14.0,2.74,1.45,196.0,1,266,185.0,0,1557,


In [605]:
#Features = every column except the outcome. "Prediction" is removed as well when it is
#already there, so re-running this cell does not feed the model its own previous output.
X_validation=df_val_test.drop(columns=['P1Winner']).drop(columns=['Prediction'],errors='ignore')
#P1Winner is NaN for matches that are not finished, so no y_validation is built here
df_val_test["Prediction"]=Ml_model.predict(X_validation)

In [621]:
#Encoded model input together with the model output in the "Prediction" column
df_val_test.head(20)

Unnamed: 0,Location,Series,Court,Surface,ActualRankingP1,ActualRankingP2,AvgP1,AvgP2,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2,P1Winner,Prediction
0,52,5,1,1,16.0,43.0,1.44,2.81,196.0,1,764,183.0,1,1153,0.0,0.610119
1,52,5,1,1,50.0,19.0,8.49,1.07,183.0,0,1319,188.0,1,1595,,0.02459
2,52,5,1,1,20.0,24.0,1.85,1.97,193.0,1,78,183.0,1,1489,0.0,0.692636
3,52,5,1,1,78.0,48.0,2.38,1.59,188.0,1,734,183.0,1,398,0.0,0.359437
4,52,5,1,1,23.0,39.0,2.02,1.8,198.0,1,886,178.0,1,1256,0.0,0.433861
5,52,5,1,1,60.0,33.0,3.86,1.29,170.0,0,1257,185.0,1,952,1.0,0.139613
6,52,5,1,1,58.0,33.0,2.72,1.46,188.0,0,1262,185.0,1,952,,0.228149
7,52,5,1,1,57.0,7.0,5.9,1.14,185.0,1,1311,188.0,1,1478,,0.061199
8,52,5,1,1,79.0,40.0,2.11,1.73,196.0,1,1360,193.0,1,1687,,0.358401
9,52,5,1,1,42.0,14.0,2.74,1.45,196.0,1,266,185.0,0,1557,,0.585266


In [620]:
# Attach the model output to the readable table; the assignment aligns rows on the index
df_export['PredictP1'] = df_val_test['Prediction']

# Keep the rows where both Ids and both rankings are present
complete_rows = df_export[['IdP1', 'IdP2', 'ActualRankingP1', 'ActualRankingP2']].notna().all(axis=1)
df_export = df_export[complete_rows]
df_export.head(20)

Unnamed: 0,Location,Player1,Player2,Series,Court,Surface,ActualRankingP1,ActualRankingP2,AvgP1,AvgP2,...,NewNameP1,IdP1,PhotoP1,HeightP2,HandednessP2,NewNameP2,IdP2,PhotoP2,P1Winner,PredictP1
0,Madrid,Hurkacz H.,Millman J.,Masters 1000,Outdoor,Clay,16.0,43.0,1.44,2.81,...,Hurkacz H.,764,https://www.atptour.com/-/media/tennis/players...,183.0,Right-Handed,Millman J.,1153,https://www.atptour.com/-/media/tennis/players...,0.0,0.610119
1,Madrid,Pella G.,Sinner J.,Masters 1000,Outdoor,Clay,50.0,19.0,8.49,1.07,...,Pella G.,1319,https://www.atptour.com/-/media/tennis/players...,188.0,Right-Handed,Sinner J.,1595,https://www.atptour.com/-/media/tennis/players...,,0.02459
2,Madrid,Auger-Aliassime F.,Ruud C.,Masters 1000,Outdoor,Clay,20.0,24.0,1.85,1.97,...,Auger-Aliassime F.,78,https://www.atptour.com/-/media/tennis/players...,183.0,Right-Handed,Ruud C.,1489,https://www.atptour.com/-/media/tennis/players...,0.0,0.692636
3,Madrid,Herbert P.H.,Davidovich Fokina A.,Masters 1000,Outdoor,Clay,78.0,48.0,2.38,1.59,...,Herbert P.H.,734,https://www.atptour.com/-/media/tennis/players...,183.0,Right-Handed,Davidovich Fokina A.,398,https://www.atptour.com/-/media/tennis/players...,0.0,0.359437
4,Madrid,Khachanov K.,Nishikori K.,Masters 1000,Outdoor,Clay,23.0,39.0,2.02,1.8,...,Khachanov K.,886,https://www.atptour.com/-/media/tennis/players...,178.0,Right-Handed,Nishikori K.,1256,https://www.atptour.com/-/media/tennis/players...,0.0,0.433861
5,Madrid,Nishioka Y.,Krajinovic F.,Masters 1000,Outdoor,Clay,60.0,33.0,3.86,1.29,...,Nishioka Y.,1257,https://www.atptour.com/-/media/tennis/players...,185.0,Right-Handed,Krajinovic F.,952,https://www.atptour.com/-/media/tennis/players...,1.0,0.139613
6,Madrid,Norrie C.,Krajinovic F.,Masters 1000,Outdoor,Clay,58.0,33.0,2.72,1.46,...,Norrie C.,1262,https://www.atptour.com/-/media/tennis/players...,185.0,Right-Handed,Krajinovic F.,952,https://www.atptour.com/-/media/tennis/players...,,0.228149
7,Madrid,Paul T.,Rublev A.,Masters 1000,Outdoor,Clay,57.0,7.0,5.9,1.14,...,Paul T.,1311,https://www.atptour.com/-/media/tennis/players...,188.0,Right-Handed,Rublev A.,1478,https://www.atptour.com/-/media/tennis/players...,,0.061199
8,Madrid,Popyrin A.,Struff J.L.,Masters 1000,Outdoor,Clay,79.0,40.0,2.11,1.73,...,Popyrin A.,1360,https://www.atptour.com/-/media/tennis/players...,193.0,Right-Handed,Struff J.L.,1687,https://www.atptour.com/-/media/tennis/players...,,0.358401
9,Madrid,Bublik A.,Shapovalov D.,Masters 1000,Outdoor,Clay,42.0,14.0,2.74,1.45,...,Bublik A.,266,https://www.atptour.com/-/media/tennis/players...,185.0,Left-Handed,Shapovalov D.,1557,https://www.atptour.com/-/media/tennis/players...,,0.585266


In [608]:
#Write the day's matches with their predictions to MatchesDay.csv (index column left out)
df_export.to_csv('MatchesDay.csv', index=False)