In [10]:
import pandas as pd
import numpy as np
import seaborn as sn
import tensorflow as tf
import matplotlib.pyplot as plt
from math import log
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn import neighbors, metrics

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from time import sleep


## Players Dataframe

Source: <a href="https://www.kaggle.com/romanzdk/atp-players-overviews" target="_blank">https://www.kaggle.com/romanzdk/atp-players-overviews/</a>

In [3]:
df_players=pd.read_csv('player_overviews.csv')
df_players=df_players.drop(df_players.columns[4:17],axis=1)
df_players.columns=['AtpId','AtpName','FirstName','Name','height','handedness','backhand']
df_players.head()

Unnamed: 0,AtpId,AtpName,FirstName,Name,height,handedness,backhand
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,,
1,a005,nelson-aerts,Nelson,Aerts,188.0,,
2,a004,egan-adams,Egan,Adams,178.0,,
3,a006,ronald-agenor,Ronald,Agenor,180.0,,
4,a007,juan-aguilera,Juan,Aguilera,183.0,,


The dataset is not udpated and some current players are missing. The idea is to scrap datas from the https://www.atptour.com/ website.

In [34]:
#Player informations' are accessible through their atp overview webpage. 
#To get this page we need the name and the id given by the website.
#A first scrap on 'https://www.atptour.com/en/rankings/singles/ gives the url of the overview page
#Then for each player, we scrap First name and Last name in their overview webpage and create a dictionnary

def AtpPlayers():
    AtpIdList=[]
    AtpNameList=[]
    FirstNameList=[]
    LastNameList=[]
    ActualRankingList=[]
    atpRank='https://www.atptour.com/en/rankings/singles/?rankDate=2020-9-14&countryCode=all&rankRange=0-700'
    req = Request(atpRank, headers={'User-Agent': ''})
    webpageAtp = urlopen(req).read()
    soup = BeautifulSoup(webpageAtp, 'html.parser')
    for i in range(0,700):
        url_overview='https://www.atptour.com' + soup.select('.player-cell a')[i]['href']
        AtpName=url_overview.split("/")[5]
        AtpId=url_overview.split("/")[6]
        
        req2 = Request(url_overview, headers={'User-Agent': 'Mozilla/5.0'})
        webpage_overview = urlopen(req2).read()
        soup2 = BeautifulSoup(webpage_overview, 'html.parser')
        try:
            FirstName = soup2.find( attrs={'class': 'first-name'}).get_text()
        except AttributeError:
            FirstName=np.nan
        
        try:
            Name = soup2.find( attrs={'class': 'last-name'}).get_text()
        except AttributeError:
            Name=np.nan
            
        
            
        AtpIdList.append(AtpId)
        AtpNameList.append(AtpName)
        FirstNameList.append(FirstName)
        LastNameList.append(Name)
        ActualRankingList.append(i+1)
        
        Dictionnary={'AtpId':AtpIdList,'AtpName':AtpNameList,'FirstName':FirstNameList,'Name':LastNameList,'ActualRanking':ActualRankingList}

    return Dictionnary

In [35]:
df_AtpPlayers=AtpPlayers() #get a dictionnary with scraped datas
df_AtpPlayers=pd.DataFrame(df_AtpPlayers) #convert dictionnary into dataframe
df_total_players=pd.concat([df_players, df_AtpPlayers], ignore_index=True) # concat the scraping dataframe with the original one
df_total_players.drop_duplicates(subset ="AtpId",keep = "last", inplace = True) # KEEP UNIQUE VALUES
print(df_total_players.shape)
df_total_players.head()

(11188, 8)


Unnamed: 0,AtpId,AtpName,FirstName,Name,height,handedness,backhand,ActualRanking
0,a001,sadiq-abdullahi,Sadiq,Abdullahi,0.0,,,
1,a005,nelson-aerts,Nelson,Aerts,188.0,,,
2,a004,egan-adams,Egan,Adams,178.0,,,
3,a006,ronald-agenor,Ronald,Agenor,180.0,,,
4,a007,juan-aguilera,Juan,Aguilera,183.0,,,


In [17]:
df_total_players["AtpName"].isna().sum()

0

Height, Handedness and Backhand are interesting criterias to analyse but we saw in the model prediction notebook that backhand had too many 'Nan' values even after scraping. Thanks to the AtId and AtpNAme we scraped on the website https://www.atptour.com/ 

In [18]:
def scraping(atpId,atpName,height,handedness):
    
    try:
        url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
        req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = BeautifulSoup(webpage, 'html.parser')
        sleep(1)

        if ((height!=height) or (height==0.0)): #check NaN value or height equals to 0
            try:
                height = soup.find( attrs={'class': 'table-height-cm-wrapper'}).get_text()
                height= height.replace('(','').replace(')','').replace('cm','')
                if str(height)=='0':
                    height=np.nan
                else:
                    height=height
            except AttributeError:
                height=np.nan
        else:
            height=height

        if handedness!=handedness: #check NaN value
            try:
                plays= soup.find_all( attrs={'class':"table-value"})
                if len(plays)>=2:
                    plays=plays[2].get_text()
                    if len(plays)>1:
                        plays=plays.split(", ")

                        handedness=plays[0].replace("\r","")
                        handedness=handedness.replace("\n","")  
                    else:
                        handedness=plays.replace("\r","")
                        handedness=handedness.replace("\n","")
                    if handedness=='':
                        handedness=np.nan
                    else:
                        handedness=handedness
                else:
                    handedness=np.nan
            except AttributeError:
                handedness=np.nan
        else:
            handedness=handedness



        ##Ranking scraping
        try:
            ranking = soup.find( attrs={'class': 'data-number'}).get_text()
        except:
            ranking=np.nan
        try:
            int(ranking)
            return ranking
        except ValueError: #For inactive players 
            return np.nan
        
        sleep(5)
        
    except:
        height=handedness=ranking=np.nan
    
    
    
    return height,handedness,ranking

In [19]:
def PlayersRanking(atpId,atpName):
    url_atp = 'https://www.atptour.com/en/players/'+atpName+'/'+atpId+'/overview'
    req = Request(url_atp, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    try:
        Ranking = soup.find( attrs={'class': 'data-number'}).get_text()
    except:
        Ranking=np.nan
    try:
        int(Ranking)
        return Ranking
    except ValueError: #For inactive players 
        return np.nan

In [20]:
df_total_players[['ScrapHeight','ScrapHand','ScrapRank']]=df_total_players.apply(lambda x : scraping(x['AtpId'],x['AtpName'],x['height'],x['handedness']),axis=1,result_type="expand")
df_total_players.head()

ValueError: Must have equal len keys and value when setting with an iterable

### Change Name syntax to be similar to global_dateset

For a future merge with the betting dataframe, we need to apply the same syntax to the Name column

In [None]:
def PlayerName(Name,FirstName):
    length= len(Name)
    if "-" in FirstName:
        FirstName=FirstName.split("-")
        player=Name+' '+FirstName[0][0]+'.'+FirstName[1][0]+'.'
    elif " " in FirstName:
        FirstName=FirstName.split()
        if len(FirstName)>1:
            player=Name+' '+FirstName[0][0]+'.'+FirstName[1][0]+'.'
        else:
            player=Name+' '+FirstName[0]+'.'
    elif Name[length-1]=='.':
        player=Name+FirstName[0]+'.'
        
    else:
        player=Name+' '+FirstName[0]+'.'
    return player.title()

In [None]:
df_total_players['Name']=df_total_players.apply(lambda x: PlayerName(x["Name"],x["FirstName"]), axis=1)
df_total_players=df_total_players.drop(["FirstName"],axis=1) #column not needed anymore
df_total_players.head()