# Importing Libraries

In [2]:
import pandas as pd

#importando selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException   
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import UnexpectedAlertPresentException
from selenium.webdriver.chrome.options import Options

import numpy as np
import os
import time

# Creating a function

> This function scrapes the team statistics table of a season from the site [WhoScored](https://www.whoscored.com)

#### Inputs

- `weburl` : the url of the championship year

    - [Example](https://www.whoscored.com/Regions/31/Tournaments/95/Brazil-Brasileirão) 2020 championship homepage


- `pathc`  : Chrome Driver Path

- `year`   : The year you wish to get data

#### Outputs

- `csv` table with each team's statistics in each season

- The order of the teams represents the standings in that year

#### Columns

`equipe` - team

`gols`- total number of goals scored in the season

`cartao_amarelo` - total number of yellow cards in the season

`cartao_vermelho` - total number of red cards in the season

`posse_de_bola`- percentage of ball possession

`passes_certos` - percentage of pass accuracy

`disputa_aerea`- avg. of aerial disputes won

`ranking` - team's score for the season according to WhoScored


In [99]:
# Creating the function

def get_team_stats(weburl, pathc, year):
    """Scrape the team-statistics summary table of one season and save it as CSV.

    Opens ``weburl`` in Chrome, clicks the third entry of the page's
    sub-navigation to reach the team statistics page, reads every row of the
    ``top-team-stats-summary-content`` table and writes the result to
    ``team_statistics_brasileirao_<year>.csv`` in the current working directory.

    Parameters
    ----------
    weburl : str
        URL of the season page to open.
    pathc : str
        Path to the ChromeDriver executable, passed to ``webdriver.Chrome``.
    year : int or str
        Label interpolated into the output file name.

    Returns
    -------
    None
        The table is written to disk; nothing is returned.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the statistics link, the table rows, or one of the expected cells
        cannot be found on the page.
    """
    rows_xpath = '//*[@id="top-team-stats-summary-content"]/tr'

    # Output column -> cell location relative to a table row. The dict order
    # is the column order of the CSV. Column td[3] (shots) is not part of the
    # output, so it is not read at all.
    cell_xpaths = {
        "equipe": "./td[1]",
        "gols": "./td[2]",
        "cartao_amarelo": "./td[4]/span[1]",
        "cartao_vermelho": "./td[4]/span[2]",
        "posse_de_bola": "./td[5]",
        "passes_certos": "./td[6]",
        "disputa_aerea": "./td[7]",
        "ranking": "./td[8]",
    }

    # NOTE(review): passing the driver path positionally is the call style of
    # older Selenium releases -- confirm it matches the installed version.
    driver = webdriver.Chrome(pathc)         # setting the driver
    print('Driver Path OK...')

    try:
        driver.get(weburl)                   # getting the driver to read the url
        print('Accessing the website...')
        time.sleep(3)

        # going to the team statistics page
        print('Going to statistics page')
        driver.find_element(By.XPATH, '//*[@id="sub-navigation"]/ul/li[3]/a').click()
        time.sleep(3)

        print('Getting the values of the table...')

        # Read however many rows the table has instead of assuming 20 teams.
        rows = driver.find_elements(By.XPATH, rows_xpath)
        if not rows:
            # Same exception type the fixed-index lookup would have raised.
            raise NoSuchElementException(
                'No rows found in the team statistics table at {}'.format(weburl)
            )

        records = [
            {
                column: row.find_element(By.XPATH, xpath).text
                for column, xpath in cell_xpaths.items()
            }
            for row in rows
        ]
    finally:
        # Always shut the browser down, even when a lookup above fails,
        # so a failed season does not leave a Chrome process behind.
        driver.quit()

    print('Crating Dataframe...')

    # building the dataframe from the scraped rows
    df = pd.DataFrame(records, columns=list(cell_xpaths))

    print(df.columns)

    print('Saving Dataframe...')
    df.to_csv('team_statistics_brasileirao_{}.csv'.format(year), index = False)

    print('Completed Successfully!')

# Testing the function

In [None]:
# Testing the function on a single season

weburl = 'https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/8158/Stages/18472/TeamStatistics/Brazil-Brasileirão-2020'
# Raw string: the backslashes of the Windows path must not be read as escapes.
pathc = r'D:\Program Files (x86)\chromedriver.exe'
# Pass the `weburl` defined above (the name `url` does not exist in this notebook).
get_team_stats(weburl = weburl, pathc = pathc, year = '2020_Teste')

# Each season link that I got with Selenium `2020-2001`

In [61]:
# Link of each Brasileirao season (2020-2001)
web = 'https://www.whoscored.com'

# Every season path differs only by its WhoScored season id, so the paths are
# generated from the ids, kept in the same order as the original list.
season_path_template = "/Regions/31/Tournaments/95/Seasons/{}/Brazil-Brasileir%C3%A3o"
season_ids = [
    8158, 7683, 7243, 6700, 6242,
    5713, 4185, 3753, 3254, 2848,
    2232, 1721, 1434, 1161, 937,
    739, 507, 351, 2373, 3079,
]

year_option = [season_path_template.format(season_id) for season_id in season_ids]

In [66]:
# Build the absolute URL of every season by prefixing the site root,
# then echo each one so the result can be checked by eye.
urls = [web + season_path for season_path in year_option]
for season_url in urls:
    print(season_url)

https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/8158/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/7683/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/7243/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/6700/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/6242/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/5713/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/4185/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/3753/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/3254/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/2848/Brazil-Brasileir%C3%A3o
https://www.whoscored.com/Regions/31/Tournaments/95/Seasons/2232/Brazil-Brasileir%C3%A3o
https://www.whoscored

# Webscraping data from all seasons (2020-2001)

In [79]:
# Path to the ChromeDriver executable.
# Raw string: the backslashes of the Windows path must not be read as escapes.
pathc = r'D:\Program Files (x86)\chromedriver.exe'

# Years in decreasing order (2020 down to 2013), matching the order of the links
lista_de_anos = list(range(2020, 2012, -1))

In [None]:
# Scrape one season per (url, year) pair; zip stops at the shorter of the
# two lists, so only as many seasons as there are years are visited.
for season_url, season_year in zip(urls, lista_de_anos):
    get_team_stats(weburl = season_url, pathc = pathc, year = season_year)
    print('Just Got data from {}'.format(season_year))
    print('#' * 15)

## PS

- Even though the season links (`year_option`) go from 2001 to 2020, WhoScored only has data from 2013 to 2020, so the year list (`lista_de_anos`) only covers 2013 to 2020.

- After scraping 2013, the loop raised an error, but that is because of the lack of data from 2001 to 2012.

- If you are going to use this yourself, make sure to check which seasons actually have data.