In [231]:
import pandas as pd
import numpy as np
import seaborn as sn
import tensorflow as tf
import matplotlib.pyplot as plt
from math import log
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn import neighbors, metrics

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException 
import re
from datetime import date



## Betting odds Dataframe

Source: <a href="http://tennis-data.co.uk/" target="_blank">http://tennis-data.co.uk/</a>

In [232]:
df_betting=pd.DataFrame()
for i in range(2002,2021):
    
    url="http://tennis-data.co.uk/%d/%d.xlsx" %(i,i)
    df=pd.read_excel(url)
    df_betting= pd.concat([df_betting,df],ignore_index=True)

In [233]:
#Correction for Names not respecting the syntax 
df_betting["Winner"]=df_betting["Winner"].replace({"Kwon S.W.":"Kwon S."})
df_betting["Loser"]=df_betting["Loser"].replace({"Kwon S.W.":"Kwon S."})

Create a datframe with Atp matches since 2002

In [234]:
def checkName(Name):
    if '..' in Name:
        Name=Name.replace('..','.')
    else:
        Name=Name
        
    if '. ' in Name:
        Name=Name.replace('. ','.')
    else:
        Name=Name    
        
    length=len(Name)
    if Name[length-1]!='.':
        Name=Name+'.'
    else:
        Name=Name
    return Name

In [235]:
df_betting_players=pd.DataFrame()
#Keep same syntax for player Name in order to have an unique ID for each player
df_betting_players["Winner"]=df_betting["Winner"].apply(lambda x: checkName(x))
df_betting_players["Loser"]=df_betting["Loser"].apply(lambda x: checkName(x))

## Players Dataframe

Source: <a href="https://www.kaggle.com/romanzdk/atp-players-overviews" target="_blank">https://www.kaggle.com/romanzdk/atp-players-overviews/</a>

In [236]:
df_players=pd.read_csv('player_overviews.csv')
df_players=df_players.drop(df_players.columns[4:17],axis=1)
df_players=df_players.drop(df_players.columns[-1],axis=1)
df_players.columns=['AtpId','AtpName','FirstName','Name','Height','Handedness']
df_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,
1,a005,nelson-aerts,Nelson,Aerts,188.0,
2,a004,egan-adams,Egan,Adams,178.0,
3,a006,ronald-agenor,Ronald,Agenor,180.0,
4,a007,juan-aguilera,Juan,Aguilera,183.0,


### Change Name syntax to be similar to betting dataframe

For a future merge with the betting dataframe, we need to apply the same syntax to the Name column

In [237]:
def PlayerName(Name,FirstName):
    length= len(Name)
    if "-" in FirstName:
        FirstName=FirstName.split("-")
        player=Name+' '+FirstName[0][0]+'.'+FirstName[1][0]+'.'
    elif " " in FirstName:
        FirstName=FirstName.split()
        if len(FirstName)>1:
            player=Name+' '+FirstName[0][0]+'.'+FirstName[1][0]+'.'
        else:
            player=Name+' '+FirstName[0]+'.'
    elif Name[length-1]=='.':
        player=Name+FirstName[0]+'.'
        
    else:
        player=Name+' '+FirstName[0]+'.'
    return player.title()

In [238]:
df_players['NewName']=df_players.apply(lambda x: PlayerName(x["Name"],x["FirstName"]), axis=1)

Some players have their name and first letter of their firstname similar. We have to distinguished them 

### Update players dataframe

The dataset is not udpated and some current players are missing. The idea is to scrape datas from the https://www.atptour.com/ website.

In [239]:
#Player informations' are accessible through their atp overview webpage. 
#To get this page we need the name and the id given by the website.
#A first scrape on 'https://www.atptour.com/en/rankings/singles/ gives the url of the overview page
#Then for each player, we scrape First name and Last name in their overview webpage and create a dictionnary

def AtpPlayers():
    AtpIdList=[]
    AtpNameList=[]
    FirstNameList=[]
    LastNameList=[]
    ActualRankingList=[]
    
    ##Find last ranking update date
    url='https://www.atptour.com/en/rankings/singles'
    req = Request(url, headers={'User-Agent': ''})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    update_date=soup.find(attrs={'class':'dropdown-default-label'}).get_text()
    update_date=update_date.replace('.','-').rstrip().lstrip() #clean string 

    ##Access the ranking with the first 700 players
    url='https://www.atptour.com/en/rankings/singles/?rankDate='+update_date+'&countryCode=all&rankRange=0-700'
    req = Request(url, headers={'User-Agent': ''})
    webpage = urlopen(req).read()
    soup1 = BeautifulSoup(webpage, 'html.parser')
    for i in range(0,700):
        
        ##Access the player overview and scrape their AtpId, AtpName, FirstName and Name
        
        url_overview='https://www.atptour.com' + soup1.select('.player-cell a')[i]['href']
        #AtpName and AtpId are present in the href
        AtpName=url_overview.split("/")[5]
        AtpId=url_overview.split("/")[6]
        
        req = Request(url_overview, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup2 = BeautifulSoup(webpage, 'html.parser')
        try:
            FirstName = soup2.find( attrs={'class': 'first-name'}).get_text()
        except AttributeError:
            FirstName=np.nan
        
        try:
            Name = soup2.find( attrs={'class': 'last-name'}).get_text()
        except AttributeError:
            Name=np.nan
            
        
            
        AtpIdList.append(AtpId)
        AtpNameList.append(AtpName)
        FirstNameList.append(FirstName)
        LastNameList.append(Name)
        ActualRankingList.append(i+1) #create an Actual ranking list useful for validation process
        
        Dictionnary={'AtpId':AtpIdList,'AtpName':AtpNameList,'FirstName':FirstNameList,'Name':LastNameList,'ActualRanking':ActualRankingList}

    return Dictionnary

In [240]:
df_AtpPlayers=AtpPlayers() #get a dictionnary with scraped datas
df_AtpPlayers=pd.DataFrame(df_AtpPlayers) #convert dictionnary into dataframe
df_AtpPlayers['NewName']=df_AtpPlayers.apply(lambda x: PlayerName(x["Name"],x["FirstName"]), axis=1)
#df_AtpPlayers=df_AtpPlayers.drop(["FirstName"],axis=1) #column not needed anymore

In [241]:
df_total_players=pd.concat([df_players, df_AtpPlayers], ignore_index=True) # concat the scraping dataframe with the original one
df_total_players.drop_duplicates(subset ="AtpId",keep = "last", inplace = True) # KEEP UNIQUE VALUES
print(df_total_players.shape)
df_total_players.head()

(11192, 8)


Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName,ActualRanking
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,,Abdullahi S.,
1,a005,nelson-aerts,Nelson,Aerts,188.0,,Aerts N.,
2,a004,egan-adams,Egan,Adams,178.0,,Adams E.,
3,a006,ronald-agenor,Ronald,Agenor,180.0,,Agenor R.,
4,a007,juan-aguilera,Juan,Aguilera,183.0,,Aguilera J.,


In [242]:
df_total_players=df_total_players.sort_values(["Name","FirstName"]).reset_index(drop=True)

In [243]:
def duplicatedNames(df):
    column=[df.loc[0,"NewName"]]
    for i in range(1,len(df)):
        
        FirstName=df.loc[i,"FirstName"]
        Name=df.loc[i,"Name"]
        
        j=0
        while df.loc[i,"NewName"]in column:
            
            df.loc[i,"NewName"]=Name+' '+FirstName[0:j+1]+'.'
            j+=1
        
        
        column.append(df.loc[i,"NewName"])
    return column

In [244]:
df_total_players["Name"]=duplicatedNames(df_total_players)

In [245]:
#df_total_players=df_total_players.drop(["FirstName"],axis=1) #column not needed anymore
df_total_players=df_total_players.drop(["NewName"],axis=1) #column not needed anymore
#df_players.rename(columns={'NewName':'Name'}, inplace=True) #column renamed
df_total_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
0,a451,bob-abbott,Bob,Abbott B.,0.0,,
1,a755,hashim-abdal,Hashim,Abdal H.,0.0,,
2,ad60,robert-abdesselam,Robert,Abdesselam R.,,,
3,a742,ahmed-abdrabuh,Ahmed,Abdrabuh A.,0.0,,
4,a706,syrym-abdukhalikov,Syrym,Abdukhalikov S.,0.0,,


In [319]:
df_total_players[df_total_players["Name"].str.contains("Popyrin")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
7977,p09z,alexei-popyrin,Alexei,Popyrin A.,,,82.0


In [320]:
df_players[df_players["Name"].str.contains("Popyrin")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,NewName


In [247]:
print(df_players.shape)

(10911, 7)


### Select only commun players

In [248]:
df_selected_players=df_total_players[df_total_players["Name"].isin(df_betting_players['Winner'])| df_total_players["Name"].isin(df_betting_players['Loser'])].reset_index(drop=True) 

In [321]:
df_selected_players[df_selected_players["Name"].str.contains("Popyrin")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking,Id
855,p09z,alexei-popyrin,Alexei,Popyrin A.,196.0,Right-Handed,82.0,855


In [250]:
print(df_selected_players.shape)

(1205, 7)


Height, Handedness and Backhand are interesting criterias to analyse but we saw in the model prediction notebook that backhand had too many 'Nan' values even after scraping. Thanks to the AtId and AtpNAme we scraped on the website https://www.atptour.com/ 

In [251]:
def scraping(atpId,atpName,height,handedness):
    
    try:
        url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
        req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = BeautifulSoup(webpage, 'html.parser')

        if ((height!=height) or (height==0.0)): #check NaN value or height equals to 0
            try:
                height = soup.find( attrs={'class': 'table-height-cm-wrapper'}).get_text()
                height= height.replace('(','').replace(')','').replace('cm','')
                if str(height)=='0':
                    height=np.nan
                else:
                    height=height
            except AttributeError:
                height=np.nan
        else:
            height=height

        if handedness!=handedness: #check NaN value
            try:
                plays= soup.find_all( attrs={'class':"table-value"})
                if len(plays)>=2:
                    plays=plays[2].get_text()
                    if len(plays)>1:
                        plays=plays.split(", ")

                        handedness=plays[0].replace("\r","")
                        handedness=handedness.replace("\n","")  
                    else:
                        handedness=plays.replace("\r","")
                        handedness=handedness.replace("\n","")
                    if handedness=='':
                        handedness=np.nan
                    else:
                        handedness=handedness
                else:
                    handedness=np.nan
            except AttributeError:
                handedness=np.nan
        else:
            handedness=handedness

        
    except:
        height=handedness=np.nan
    
    
    
    return height,handedness

In [252]:
def UpdateRanking(atpId,atpName):
    url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
    req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    try:
        Ranking = soup.find( attrs={'class': 'data-number'}).get_text()
    except:
        Ranking=np.nan
    try:
        int(Ranking)
        return Ranking
    except ValueError: #For inactive players 
        return np.nan

In [253]:
df_selected_players.dtypes

AtpId             object
AtpName           object
FirstName         object
Name              object
Height           float64
Handedness        object
ActualRanking    float64
dtype: object

In [254]:
df_selected_players[['Height','Handedness']]=df_selected_players.apply(lambda x : scraping(x['AtpId'],x['AtpName'],x['Height'],x['Handedness']),axis=1, result_type="expand")
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking
0,a618,mohammed-abdulla,Mohammed,Abdulla M.,,,
1,a391,maximilian-abel,Maximilian,Abel M.,,Right-Handed,
2,a389,jose-acasuso,Jose,Acasuso J.,191.0,Right-Handed,
3,a305,jacob-adaktusson,Jacob,Adaktusson J.,193.0,Right-Handed,
4,a310,emin-agaev,Emin,Agaev E.,,Right-Handed,


In [255]:
df_selected_players["Height"] = df_selected_players.Height.astype(float) # convert to float

In [256]:
df_selected_players['Id']=df_selected_players.index

In [257]:
df_copy=df_selected_players.copy()

In [258]:
df_selected_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking,Id
0,a618,mohammed-abdulla,Mohammed,Abdulla M.,,,,0
1,a391,maximilian-abel,Maximilian,Abel M.,,Right-Handed,,1
2,a389,jose-acasuso,Jose,Acasuso J.,191.0,Right-Handed,,2
3,a305,jacob-adaktusson,Jacob,Adaktusson J.,193.0,Right-Handed,,3
4,a310,emin-agaev,Emin,Agaev E.,,Right-Handed,,4


In [259]:
df_selected_players[df_selected_players["Name"].str.contains("Cecch")]

Unnamed: 0,AtpId,AtpName,FirstName,Name,Height,Handedness,ActualRanking,Id
172,cf01,marco-cecchinato,Marco,Cecchinato M.,185.0,Right-Handed,93.0,172


## Model Dataframe

### Clean Dataframe

We can notice that depending the year, Betting Broker are not the same. So it's better to use the Min/Max and Average of betting odds. 

In [260]:
#Fill NaN values
df_betting['MaxW']=df_betting['MaxW'].fillna(round(df_betting[['B365W', 'CBW', 'GBW', 'IWW','SBW', 'B&WW', 'EXW', 'PSW', 'UBW', 'LBW', 'SJW']].max(axis=1),2))
df_betting['MaxL']=df_betting['MaxL'].fillna(round(df_betting[['B365L', 'CBL', 'GBL', 'IWL','SBL', 'B&WL', 'EXL', 'PSL', 'UBL', 'LBL', 'SJL']].max(axis=1),2))
df_betting['AvgW']=df_betting['AvgW'].fillna(round(df_betting[['B365W', 'CBW', 'GBW', 'IWW','SBW', 'B&WW', 'EXW', 'PSW', 'UBW', 'LBW', 'SJW']].mean(axis=1),2))
df_betting['AvgL']=df_betting['AvgL'].fillna(round(df_betting[['B365L', 'CBL', 'GBL', 'IWL','SBL', 'B&WL', 'EXL', 'PSL', 'UBL', 'LBL', 'SJL']].mean(axis=1),2))

In [261]:
# drop useless datas
df_betting=df_betting[df_betting['MaxW'].notna() & df_betting['MaxL'].notna()] # select rows where betting odds are present
df_betting=df_betting.drop(df_betting[df_betting["Comment"]!="Completed"].index) #keep only completed matches
df_betting=df_betting.drop(['Comment','Date','ATP','Best of'],axis=1) # drop useless columns
df_betting=df_betting.drop(['B365W', 'CBW', 'GBW', 'IWW','SBW', 'B&WW', 'EXW', 'PSW', 'UBW', 'LBW', 'SJW','B365L', 'CBL', 'GBL', 'IWL','SBL', 'B&WL', 'EXL', 'PSL', 'UBL', 'LBL', 'SJL'],axis=1)
df_betting=df_betting.drop(['W1','L1','W2','L2','W3','L3','W4','L4','W5','L5','Wsets','Lsets','WPts','LPts'],axis=1)
df_betting=df_betting.drop(df_betting[(df_betting["WRank"]=='NR') |  (df_betting["LRank"]=='NR')].index) # drop matches with No ranked players
df_betting=df_betting.dropna()
df_betting["LRank"] = df_betting.LRank.astype(float)# no more 'NR' players so we can convert into float

### Preprocessing data

In [262]:
#Keep same syntax for player Name in order to have an unique ID for each player
df_betting["Winner"]=df_betting["Winner"].apply(lambda x: checkName(x))
df_betting["Loser"]=df_betting["Loser"].apply(lambda x: checkName(x))

In [263]:
#Some Series have been renamed over years
df_betting['Series'].replace({"International Gold":"ATP500","Masters Cup":"ATP Finals","Masters":"ATP Finals"},inplace =True)
#Delete ended white space
df_betting['Location']=df_betting['Location'].apply(lambda x : x.rstrip())

In [264]:
#Rename columns: Winner -> Player1 & Loser -> Player2
df_betting.rename(columns={'Winner':'Player1','Loser':'Player2','WRank':'P1Rank','LRank':'P2Rank','MaxW':'MaxP1','MaxL':'MaxP2','AvgW':'AvgP1','AvgL':'AvgP2'}, inplace=True)
#Create the column for the label prediction: Player1 Win
df_betting['P1Winner']=1 

In the actual format all winners are Player 1, it will be a problem for the model.
So for random rows, Players 1 and Player 2 datas are switched

In [265]:
mask=df_betting.sample(frac=.5).index
df_betting.loc[mask,['Player1','Player2','P1Rank','P2Rank','MaxP1','MaxP2','AvgP1','AvgP2']] = df_betting.loc[mask,['Player2','Player1','P2Rank','P1Rank','MaxP2','MaxP1','AvgP2','AvgP1']].values
df_betting.loc[mask,['P1Winner']]=0

In [266]:
df_betting=df_betting.merge(df_selected_players,left_on='Player1',right_on="Name", how='left',suffixes=['P2','P1']) #first merge for winner player
df_betting=df_betting.merge(df_selected_players,left_on='Player2',right_on="Name", how='left',suffixes=['P1','P2'])#second merge for loser player

In [267]:
df_betting=df_betting.drop(['FirstNameP1','FirstNameP2','NameP1','NameP2',"AtpIdP1","AtpIdP2","ActualRankingP1","ActualRankingP2","AtpNameP1","AtpNameP2","Player1","Player2"],axis=1) # keep only the player ID
df_betting=df_betting.dropna() # delete rows with NaN values
df_betting=df_betting.drop_duplicates() #delete duplicated rows

In [268]:
df_betting["HeightP1"] = df_betting.HeightP1.astype(float) # convert to float
df_betting["HeightP2"] = df_betting.HeightP2.astype(float) # convert to float

In [269]:
df_betting.head(10)

Unnamed: 0,Location,Tournament,Series,Court,Surface,Round,P1Rank,P2Rank,MaxP1,MaxP2,AvgP1,AvgP2,P1Winner,HeightP1,HandednessP1,IdP1,HeightP2,HandednessP2,IdP2
0,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,46.0,25.0,2.2,1.6,2.17,1.55,0,185.0,Right-Handed,576.0,175.0,Left-Handed,37.0
1,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,9.0,309.0,1.15,4.25,1.15,4.25,1,185.0,Right-Handed,448.0,196.0,Right-Handed,342.0
2,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,10.0,57.0,1.36,3.8,1.3,3.09,0,185.0,Right-Handed,945.0,188.0,Right-Handed,683.0
3,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,93.0,80.0,2.55,1.5,2.35,1.47,0,183.0,Right-Handed,963.0,183.0,Right-Handed,686.0
4,Adelaide,AAPT Championships,International,Outdoor,Hard,1st Round,104.0,53.0,1.5,2.4,1.5,2.27,1,196.0,Right-Handed,843.0,191.0,Right-Handed,128.0
5,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,150.0,25.0,2.75,1.45,2.53,1.43,0,185.0,Right-Handed,688.0,175.0,Left-Handed,37.0
6,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,24.0,50.0,1.33,3.2,1.31,2.92,1,191.0,Right-Handed,305.0,188.0,Left-Handed,391.0
7,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,80.0,37.0,2.38,1.6,2.22,1.54,0,183.0,Right-Handed,686.0,193.0,Right-Handed,635.0
9,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,57.0,89.0,1.4,2.75,1.38,2.62,1,188.0,Right-Handed,683.0,191.0,Left-Handed,636.0
10,Adelaide,AAPT Championships,International,Outdoor,Hard,2nd Round,104.0,26.0,1.65,2.1,1.65,1.98,1,196.0,Right-Handed,843.0,193.0,Right-Handed,964.0


In [270]:
df_betting["Series"].unique()

array(['International', 'Grand Slam', 'ATP500', 'ATP Finals', 'ATP250',
       'Masters 1000'], dtype=object)

Columns label encoding

In [271]:
label_dictionnary=[] 

In [272]:
def Labelizer(column):
    label=sorted(column.unique())
    dictionnary = { val : idx for idx,val in enumerate(label) }
    label_dictionnary.append(dictionnary) #save label encoder for validation test
    colLab=column.map(dictionnary)
    return colLab

In [273]:
df_betting=df_betting.apply(lambda x: Labelizer(x) if x.dtype==object else x,axis=0)

## Model

### Multilayers perceptron

In [274]:
X=df_betting.drop(['Tournament','Round','P1Winner','MaxP1','MaxP2',],axis=1)
y=df_betting['P1Winner']


Ml_model=tf.keras.Sequential([
    tf.keras.layers.Dense(16,input_dim=X.shape[1],activation="relu"),
    #tf.keras.layers.Dense(12,activation="relu"),
    tf.keras.layers.Dense(8,activation="relu"),
    #tf.keras.layers.Dense(4,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid"),
])

#Compile the model

Ml_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train the model

history=Ml_model.fit(X,y,epochs=200, batch_size=250,verbose=0)

## Validation 

The idea is to scrape https://www.oddsportal.com/matches/tennis/ and find ATP matches of the day.

In [306]:
def PlayerSyntax(playerName):
    
    playerName=checkName(playerName)#first syntax check for dots
    
    char_positions=[pos for pos, char in enumerate(playerName) if char == "-"] #array of character position in string
    if char_positions: ##  if not empty array
        for i in char_positions:
            if playerName[i-1].isupper(): #check if the character before is an uppercase
                #convert to list in order to change specific character 
                new=list(playerName)
                new[i]="."
                playerName="".join(new)
                
    if playerName == "Kwon Soonwoo":
        playerName="Kwon S."
    elif playerName == "Ramos A.":
        playerName="Ramos-Vinolas A."
    elif playerName == "Carreno-Busta P.":
        playerName="Carreno Busta P."

    return playerName    


In [342]:
def DayMatches():
    #############################################################################
    ## Scrape Location, Players, Odds and Winner of each ATP matches of the day ##
    #############################################################################
    url_odds="https://www.oddsportal.com/matches/tennis/"

    driver = webdriver.Chrome(executable_path=r'/Users/pierremecchia/Desktop/chromedriver')
    driver.get(url_odds)
    sleep(2)

    table=driver.find_elements_by_xpath("//*[@id='table-matches']/table/tbody/tr[contains(@class,'dark')]")

    list_href=[]
    list_location=[]
    list_player1=[]
    list_player2=[]
    list_odd1=[]
    list_odd2=[]
    list_winnerP1=[]    

    ## find all ATP tournament and keep their link
    for row in table:
        if ("ATP" in row.text) and (not "Doubles" in row.text): #only interested in single atp matches
            href=row.find_element_by_xpath('./th[1]/a[2]').get_attribute('href')
            if href not in list_href:
                list_href.append(href)
    driver.quit()
    

    for href in list_href:

        driver = webdriver.Chrome(executable_path=r'/Users/pierremecchia/Desktop/chromedriver')
        driver.get(href)
        sleep(2)
        ##find location of the atp tournament
        h1=driver.find_element_by_xpath('//*[@id="col-content"]/h1').text
        
        #clean location string
        location=h1.replace('Betting Odds','')
        location=location.replace('ATP ','')
        location=re.sub(r'\([^)]*\)', '', location)#delete special characters 
        location=location.rstrip().lstrip() #delete white space at the beginning and end of string
        
        matches=driver.find_elements_by_xpath("//*[@id='tournamentTable']/tbody/tr") #List of matches in the tournament
        doScrap=False
        for match in matches:
            try:
                className=match.get_attribute("class")
                if className=='center nob-border': #create a counter in order to scrape only matches of the day
                    date=match.find_element_by_xpath('./th[1]/span').text
                    if 'Today' in date:
                        doScrap=True
                    else:
                        doScrap=False
                if doScrap==True:
                    if not('dark' or 'nob-border' or 'table-dummyrow')in className:

                        try:
                            if 'deactivate' in className: #for matches completed or currently playing (live matches)
                                players=match.find_element_by_xpath('./td[2]/a').text

                                if players=="": #append for live matches
                                    players=match.find_element_by_xpath('./td[2]/a[2]').text

                                #find odds
                                odd1=match.find_element_by_xpath('./td[4]/a').text
                                odd1_className=match.find_element_by_xpath('./td[4]').get_attribute("class")

                                odd2=match.find_element_by_xpath('./td[5]/a').text
                                odd2_className=match.find_element_by_xpath('./td[5]').get_attribute("class")
                                
                                #find winner of the match
                                winnerP1=match.find_element_by_xpath('./td[3]').text

                                if ("ret" in winnerP1) or ("canc" in winnerP1): #no winner for canceled or retirement matches
                                    winnerP1=np.nan
                                elif (("result-ok" not in odd1_className) & ("result-ok" not in odd2_className)): #match not finished
                                    winnerP1=np.nan
                                elif int(winnerP1[0])<int(winnerP1[2]):
                                    winnerP1=0
                                else:
                                    winnerP1=1
                            else: # for matches not played yet
                                players=match.find_element_by_xpath('./td[2]/a[not(contains(@class,"ico-tv-tournament"))]').text
                                odd1=match.find_element_by_xpath('./td[3]/a').text
                                odd2=match.find_element_by_xpath('./td[4]/a').text
                                winnerP1=np.nan
                            
                            #Clean players names
                            player1=players.split(" - ")[0]
                            player1=PlayerSyntax(player1)
                            player2=players.split(" - ")[1]
                            player2=PlayerSyntax(player2)
                            
                        
                            list_location.append(location)
                            list_player1.append(player1)
                            list_player2.append(player2)
                            list_odd1.append(odd1)
                            list_odd2.append(odd2)
                            list_winnerP1.append(winnerP1)
                            
                        except :
                            continue

            except NoSuchElementException:
                continue
                
        driver.quit()
            
    dictionnary={"Location":list_location,"Player1":list_player1,"Player2":list_player2,"AvgP1":list_odd1,"AvgP2":list_odd2,"WinnerP1":list_winnerP1} #create a dictionnary
    df_matchs=pd.DataFrame(dictionnary) #convert dictionnary into pandas dataframe
    
    return df_matchs
            
    

In [343]:
def TournamentsData(locations_list):
    
    ##################################################################################
    ## Scrape Series, Surface and Court of tournaments present in the locations_list ##
    ##################################################################################
    
    
    url_tourn="https://www.atptour.com/en/tournaments/"

    driver = webdriver.Chrome(executable_path=r'/Users/pierremecchia/Desktop/chromedriver')
    driver.get(url_tourn)
    sleep(2)
    
    #list all tournaments in the current month
    month_tournament=driver.find_elements_by_xpath("//*[@id='contentAccordionWrapper']/div[contains(@class,'expand')][1]/div[2]/div/table/tbody/tr")

    list_series=[]
    list_court=[]
    list_location=[]
    list_surface=[]
    
    for tournament in month_tournament:
        try:
            #scrape location for each tournament in the list
            location=tournament.find_element_by_xpath('./td[2]/span[1]').text 
            town=location.split(",")[0]
            
            if town in locations_list: #identify tournaments that we have to scrape
                try:
                    #define the serie of the tournament thanks to the image source
                    series=tournament.find_element_by_xpath('./td[1]/img').get_attribute("src")
                    if "250.png" in series:
                        series="ATP250"
                    elif "500.png" in series:
                        series="ATP500"
                    elif "1000.png" in series:
                        series="Masters 1000"
                    elif "grandslam.png" in series:
                        series="Grand Slam"
                    elif "finals.svg" in series:
                        series="ATP Finals"
                    else:
                        series=np.nan
                except:
                    series=np.nan
                try:
                    #scrape court and surface of the tournament
                    playground=tournament.find_element_by_xpath('./td[3]/table/tbody/tr/td[2]/div/div').text
                    playground=playground.split(" ")
                    court=playground[0]
                    surface=playground[1]
                    
                except:
                    court=np.nan
                    surface=np.nan
                    
                    
                list_series.append(series)
                list_surface.append(surface)
                list_court.append(court)
                list_location.append(town)
        except:
            continue
    driver.quit()

    loc_dict={"Location":list_location,"Series":list_series,"Court":list_court,"Surface":list_surface} #create a dictionnary
    df_tournaments=pd.DataFrame(loc_dict) # convert dictionnary into pandas dataframe
    
    return df_tournaments

In [344]:
df_matchs=DayMatches() #Scrape atp matches of the day
locations_list=df_matchs['Location'].unique() #location list of tournaments of the day
df_tournaments=TournamentsData(locations_list) # scrape additionnals datas of the tournaments of the day

In [345]:
df_validation=df_matchs.merge(df_tournaments,on="Location")
df_validation.head(10)

Unnamed: 0,Location,Player1,Player2,AvgP1,AvgP2,WinnerP1,Series,Court,Surface
0,Monte Carlo,Tomic B.,Caruso S.,3.27,1.32,0,Masters 1000,Outdoor,Clay
1,Monte Carlo,Koepfer D.,Londero J.I.,1.55,2.4,1,Masters 1000,Outdoor,Clay
2,Monte Carlo,Martinez P.,Delbonis F.,2.07,1.72,0,Masters 1000,Outdoor,Clay
3,Monte Carlo,Travaglia S.,Majchrzak K.,1.49,2.56,1,Masters 1000,Outdoor,Clay
4,Monte Carlo,Goffin D.,Cilic M.,1.74,2.11,1,Masters 1000,Outdoor,Clay
5,Monte Carlo,Laaksonen H.,Cecchinato M.,2.28,1.6,0,Masters 1000,Outdoor,Clay
6,Monte Carlo,Hanfmann Y.,Popyrin Al.,1.79,1.97,0,Masters 1000,Outdoor,Clay
7,Monte Carlo,Sousa J.,Fabbiano T.,1.48,2.57,0,Masters 1000,Outdoor,Clay
8,Monte Carlo,Thompson J.,Paire B.,1.72,2.14,1,Masters 1000,Outdoor,Clay
9,Cagliari,Djere L.,Sonego L.,1.65,2.27,0,ATP250,Outdoor,Clay


In [346]:
df_ex=df_validation.copy()

In [347]:
df_ex=df_ex.merge(df_selected_players,left_on='Player1',right_on="Name", how='left',suffixes=['P2','P1']) #first merge for winner player
df_ex=df_ex.merge(df_selected_players,left_on='Player2',right_on="Name", how='left',suffixes=['P1','P2'])#second merge for loser player

In [348]:
df_ex=df_ex.drop(['FirstNameP1','FirstNameP2','NameP1','NameP2',"AtpIdP1","AtpIdP2","AtpNameP1","AtpNameP2"],axis=1) # drop useless features 

In [349]:
df_ex.head(20)

Unnamed: 0,Location,Player1,Player2,AvgP1,AvgP2,WinnerP1,Series,Court,Surface,HeightP1,HandednessP1,ActualRankingP1,IdP1,HeightP2,HandednessP2,ActualRankingP2,IdP2
0,Monte Carlo,Tomic B.,Caruso S.,3.27,1.32,0,Masters 1000,Outdoor,Clay,196.0,Right-Handed,211.0,1073,185.0,Right-Handed,87.0,170.0
1,Monte Carlo,Koepfer D.,Londero J.I.,1.55,2.4,1,Masters 1000,Outdoor,Clay,180.0,Left-Handed,54.0,556,180.0,Right-Handed,94.0,640.0
2,Monte Carlo,Martinez P.,Delbonis F.,2.07,1.72,0,Masters 1000,Outdoor,Clay,185.0,Right-Handed,100.0,684,193.0,Left-Handed,85.0,244.0
3,Monte Carlo,Travaglia S.,Majchrzak K.,1.49,2.56,1,Masters 1000,Outdoor,Clay,185.0,Right-Handed,69.0,1077,180.0,Right-Handed,118.0,662.0
4,Monte Carlo,Goffin D.,Cilic M.,1.74,2.11,1,Masters 1000,Outdoor,Clay,180.0,Right-Handed,14.0,386,198.0,Right-Handed,43.0,192.0
5,Monte Carlo,Laaksonen H.,Cecchinato M.,2.28,1.6,0,Masters 1000,Outdoor,Clay,185.0,Right-Handed,135.0,600,185.0,Right-Handed,93.0,172.0
6,Monte Carlo,Hanfmann Y.,Popyrin Al.,1.79,1.97,0,Masters 1000,Outdoor,Clay,193.0,Right-Handed,105.0,434,,,,
7,Monte Carlo,Sousa J.,Fabbiano T.,1.48,2.57,0,Masters 1000,Outdoor,Clay,185.0,Right-Handed,107.0,1024,170.0,Right-Handed,172.0,318.0
8,Monte Carlo,Thompson J.,Paire B.,1.72,2.14,1,Masters 1000,Outdoor,Clay,183.0,Right-Handed,62.0,1067,196.0,Right-Handed,33.0,817.0
9,Cagliari,Djere L.,Sonego L.,1.65,2.27,0,ATP250,Outdoor,Clay,188.0,Right-Handed,57.0,263,191.0,Right-Handed,34.0,1022.0
