# Web scrapping 
## Data obtained from the games of the main football leagues in Europe

The data was obtained from the spanish website as.com

I created a function that obtains the data from the football games of a given season and then computes the number of goals scored by local and away teams. Here it is the function:

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def goles_loc_vis(url,num_jor):
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    ### jor_1= jornadas[1]
    str_jor = []
    jornadas = []
    jornadas2 = []
    for i in range(0, num_jor):
        str_jor.append("jornada-%s" %(i+1))
    
    goles_locales = 0
    goles_visitantes = 0
    ########################
    for i in range(0,num_jor):
        jor_1 = soup.find(id=str_jor[i])
        games_jor_1 = jor_1.find_all('tr', itemscope="")
    #games_jor_1---- list of strings that match the search criteria

        game_1_jor_1 = games_jor_1[1] #Starting by 1, game 1

        datos_tags = jor_1.select("tr > td ")
        datos = [gt.get_text() for gt in datos_tags]

        Locales_1 = []
        Resultados_1 = []
        Visitantes_1 = []
        for idx in range(0, len(datos)):
            if idx % 3 == 0:
                Locales_1.append(datos[idx].replace ('\n', ''))
            elif (idx - 1) % 3 == 0:
                Resultados_1.append(datos[idx].replace ('\n', '').replace('  ',''))
            else: 
                Visitantes_1.append(datos[idx].replace ('\n', ''))
    
      
        #### crear dataframe con resultado como bloque
        jor1 = pd.DataFrame({
            "Local": Locales_1, 
            "Resultado": Resultados_1, 
            "Visitante": Visitantes_1
        })
    
        jornadas.append(jor1)
        

        ######----- crear data-frame con goles loc y vis separados: jor2
    
        local_goals = jornadas[i]["Resultado"].str.extract("(?P<local_goals>\d+)", expand=False)
        local_goals = local_goals.astype('int')
    
        away_goals = jornadas[i]["Resultado"].str.extract("(?P<away_goals>\ \d+)", expand=False)

        away_goals = away_goals.astype('int')
    
    
        goles_locales = goles_locales + pd.Series.sum(local_goals)
    
        goles_visitantes = goles_visitantes + pd.Series.sum(away_goals)
    
        jor2 = pd.DataFrame({
            "Local": Locales_1, 
            "GL": local_goals, 
            "GV": away_goals,
            "Visitante": Visitantes_1
        })
    
        jornadas2.append(jor2)
    
    
    #Change the order of the columns from jornadas2
    for i in range(0,len(jornadas2)):
        jornadas2[i] = jornadas2[i][['Local', 'GL', 'GV', 'Visitante']] 
    
    
    print ('Number of (Local goals, away goals): \n')
    print (goles_locales,goles_visitantes)
        
    return goles_locales,goles_visitantes, jornadas2

Here are some examples to ilustrate the use of the function:

In [3]:
#Function applied to the Spanish Division 2016-17 League

(gl1, gv1, jor1) = goles_loc_vis('https://resultados.as.com/resultados/futbol/primera/2016_2017/calendario/',38)

Number of (Local goals, away goals): 

632 486


In [4]:
#Function applied to the Spanish Division 2015-16 League

(gl2, gv2, jor2) = goles_loc_vis('https://resultados.as.com/resultados/futbol/primera/2015_2016/calendario/', 38)

Number of (Local goals, away goals): 

615 428


In [5]:
#Function applied to the Premier League 2015-16 

(gl3, gv3, jor3) = goles_loc_vis('https://resultados.as.com/resultados/futbol/inglaterra/2015_2016/calendario/',38)

Number of (Local goals, away goals): 

567 459


In [6]:
#Function applied to the Serie A 2016-17 


(gl4, gv4, jor4) = goles_loc_vis('https://resultados.as.com/resultados/futbol/italia/2016_2017/calendario/',38)

Number of (Local goals, away goals): 

633 490


In [7]:
#Function applied to the Ligue 1 2015-16 

(gl5,gv4, jor5) = goles_loc_vis('https://resultados.as.com/resultados/futbol/francia/2015_2016/calendario/',38)

Number of (Local goals, away goals): 

546 414


In [8]:
#Function applied to the Spanish Division 2017-18 League (only 30 matchdays)

(gl6,gv6,jor6) = goles_loc_vis ('https://resultados.as.com/resultados/futbol/primera/calendario/',30)


Number of (Local goals, away goals): 

461 337


Next, we present another function. This one calculates the number of wins from home and away teams and the number of draws in a particular season. The input of this function is a list of lists (as obtained with the previous function).

In [9]:
def count_loc_aw_wins(lst):
    home_wins = 0
    away_wins = 0
    draws = 0
    num_jor = len(lst) 
    leng = len(lst[0].index) # number of games per matchday
    for i in range(0, num_jor-1):
         for j in range(0,leng):
            if lst[i].at[j,'GL'] > lst[i].at[j,'GV']:
                home_wins = home_wins + 1
            elif lst[i].at[j,'GL'] < lst[i].at[j,'GV']:
                away_wins = away_wins + 1
            else:
                draws = draws + 1
    print ('\n The number of Home Wins is ', home_wins)
    print ('\n The number of Away Wins is ', away_wins)
    print ('\n The number of Draws is ', draws)
    
    return
    
                

In [10]:
len(jor3[1].index)

10

In [11]:
#Function applied to the Spanish Division 2016-17 League

count_loc_aw_wins(jor1)


 The number of Home Wins is  177

 The number of Away Wins is  107

 The number of Draws is  86


In [12]:
#Function applied to the Spanish Division 2015-16 League

count_loc_aw_wins(jor2)


 The number of Home Wins is  176

 The number of Away Wins is  102

 The number of Draws is  92


In [13]:
#Function applied to the Premier League 2015-16 

count_loc_aw_wins(jor3)


 The number of Home Wins is  151

 The number of Away Wins is  116

 The number of Draws is  103


In [14]:
#Function applied to the Serie A 2016-17 

count_loc_aw_wins(jor4)


 The number of Home Wins is  178

 The number of Away Wins is  113

 The number of Draws is  79


In [15]:
#Function applied to the Ligue 1 2015-16 

count_loc_aw_wins(jor5)


 The number of Home Wins is  155

 The number of Away Wins is  108

 The number of Draws is  107


In [16]:
#Function applied to the Spanish Division 2017-18 League (only 30 matchdays)

count_loc_aw_wins(jor6)


 The number of Home Wins is  139

 The number of Away Wins is  86

 The number of Draws is  65


Next, we have another function that counts the top scorer team from each of the matchdays, both considering and without considering the home and away topscorer for each matchday. 

Then, we can see the teams that were the topscorers in more matchdays.

We considered the h and a topscorers to represent the cases when a team scores a lot of goals away but another team scored more at home.

In [17]:
from collections import Counter


def top_scorers(lst):
    num_jor = len(lst)
    top_home_season = []
    top_away_season = []
    total_h_a = []
    total = []
    for i in range(0, num_jor-1):
        df = lst[i]
        top_home = df.loc[df['GL'].idxmax()][0]
        top_away = df.loc[df['GV'].idxmax()][3]
        top_home_season.append(top_home)
        top_away_season.append(top_away)
        #----total_h_a: considering the topscorer home and away from each matchday-----
        total_h_a.append(top_home)
        total_h_a.append(top_away)
        
        #----total: without considering home or away.---
        if top_home > top_away:
            total.append(top_home)
        elif top_home < top_away:
            total.append(top_away)
        else:
            total.append(top_home)
            total.append(top_away)
   
    #print ('\n Top scorers home: \n',top_home_season)
    #print ('\n Top scorers away: \n',top_away_season) 
    #print ('\n\n Top scorers Total: \n',total)     
    
    print ('\nThese teams were the top scorers: \n ', Counter(total))
    print ('\n\n \nThese teams were the top scorers(considering home and away topscorers each matchday): \n ', Counter(total_h_a))
    print('\n Home topscorers', Counter(top_home_season))
    print('\n Away topscorers', Counter(top_away_season))

    return 

In [18]:
#Function applied to the Spanish Division 2016-17 League

top_scorers(jor1)


These teams were the top scorers: 
  Counter({' Real Madrid': 12, ' Eibar': 5, ' Sevilla': 4, ' Málaga': 3, ' Barcelona': 2, ' R. Sociedad': 2, ' Villarreal': 2, ' Espanyol': 1, ' Celta': 1, ' Sporting': 1, ' Deportivo': 1, ' Granada': 1, ' Las Palmas': 1, ' Valencia': 1})


 
These teams were the top scorers(considering home and away topscorers each matchday): 
  Counter({' Barcelona': 17, ' Real Madrid': 13, ' Atlético': 6, ' Sevilla': 6, ' Eibar': 5, ' Málaga': 4, ' Celta': 4, ' Espanyol': 3, ' Athletic': 3, ' Las Palmas': 2, ' R. Sociedad': 2, ' Deportivo': 2, ' Villarreal': 2, ' Sporting': 1, ' Betis': 1, ' Granada': 1, ' Leganés': 1, ' Valencia': 1})

 Home topscorers Counter({' Barcelona': 9, ' Real Madrid': 4, ' Atlético': 3, ' Málaga': 3, ' Sevilla': 3, ' R. Sociedad': 2, ' Athletic': 2, ' Celta': 2, ' Espanyol': 2, ' Eibar': 2, ' Las Palmas': 1, ' Betis': 1, ' Deportivo': 1, ' Granada': 1, ' Leganés': 1})

 Away topscorers Counter({' Real Madrid': 9, ' Barcelona': 8, ' Atlét

In [19]:
#Function applied to the Spanish Division 2015-16 League

top_scorers(jor2)


These teams were the top scorers: 
  Counter({' Real Madrid': 13, ' Espanyol': 3, ' R. Sociedad': 3, ' Granada': 3, ' Sevilla': 2, ' Sporting': 2, ' Barcelona': 2, ' Levante': 2, ' Villarreal': 2, ' Valencia': 1, ' Deportivo': 1, ' Málaga': 1, ' Rayo': 1, ' Las Palmas': 1})


 
These teams were the top scorers(considering home and away topscorers each matchday): 
  Counter({' Barcelona': 18, ' Real Madrid': 13, ' Atlético': 7, ' Espanyol': 4, ' R. Sociedad': 4, ' Málaga': 4, ' Eibar': 3, ' Rayo': 3, ' Granada': 3, ' Athletic': 2, ' Celta': 2, ' Sevilla': 2, ' Sporting': 2, ' Levante': 2, ' Villarreal': 2, ' Valencia': 1, ' Deportivo': 1, ' Las Palmas': 1})

 Home topscorers Counter({' Barcelona': 10, ' Real Madrid': 9, ' Málaga': 3, ' Sevilla': 2, ' Atlético': 2, ' Granada': 2, ' Espanyol': 1, ' Athletic': 1, ' Celta': 1, ' Eibar': 1, ' R. Sociedad': 1, ' Sporting': 1, ' Levante': 1, ' Villarreal': 1, ' Las Palmas': 1})

 Away topscorers Counter({' Barcelona': 8, ' Atlético': 5, ' Rea

In [20]:
#Function applied to the Premier League 2015-16 

top_scorers(jor3)


These teams were the top scorers: 
  Counter({' Liverpool': 6, ' Tottenham': 5, ' Leicester': 4, ' M. City': 3, ' West Ham': 3, ' Southampton': 3, ' M. United': 2, ' Watford': 2, ' Sunderland': 2, ' Swansea City': 2, ' Everton': 1, ' Newcastle': 1, ' Stoke City': 1, ' Norwich City': 1, ' Chelsea': 1})


 
These teams were the top scorers(considering home and away topscorers each matchday): 
  Counter({' Everton': 7, ' Liverpool': 7, ' M. City': 6, ' Chelsea': 6, ' Arsenal': 6, ' Southampton': 5, ' Tottenham': 5, ' Leicester': 4, ' M. United': 4, ' Sunderland': 4, ' Crystal Palace': 3, ' West Ham': 3, ' Bournemouth': 3, ' Newcastle': 3, ' Aston Villa': 2, ' Watford': 2, ' Swansea City': 2, ' Stoke City': 1, ' Norwich City': 1})

 Home topscorers Counter({' M. City': 6, ' Everton': 4, ' Southampton': 3, ' Chelsea': 3, ' Newcastle': 3, ' Sunderland': 3, ' Liverpool': 3, ' Tottenham': 2, ' M. United': 2, ' Arsenal': 2, ' Leicester': 1, ' West Ham': 1, ' Crystal Palace': 1, ' Norwich City'

In [21]:
#Function applied to the Serie A 2016-17 

top_scorers(jor4)


These teams were the top scorers: 
  Counter({' Roma': 10, ' Torino': 6, ' Nápoles': 4, ' Milan': 4, ' Lazio': 3, ' Juventus': 3, ' Sassuolo': 2, ' Palermo': 1, ' Genoa': 1, ' Pescara': 1, ' Sampdoria': 1, ' Udinese': 1})


 
These teams were the top scorers(considering home and away topscorers each matchday): 
  Counter({' Roma': 11, ' Nápoles': 8, ' Juventus': 7, ' Lazio': 6, ' Torino': 6, ' Fiorentina': 5, ' Milan': 5, ' Atalanta': 4, ' Inter': 4, ' Cagliari': 3, ' Genoa': 2, ' Crotone': 2, ' Sassuolo': 2, ' Pescara': 2, ' Bolonia': 2, ' Chievo': 1, ' Empoli': 1, ' Palermo': 1, ' Sampdoria': 1, ' Udinese': 1})

 Home topscorers Counter({' Juventus': 7, ' Roma': 5, ' Torino': 4, ' Lazio': 3, ' Cagliari': 3, ' Milan': 2, ' Inter': 2, ' Nápoles': 2, ' Crotone': 2, ' Bolonia': 2, ' Atalanta': 1, ' Pescara': 1, ' Sampdoria': 1, ' Fiorentina': 1, ' Sassuolo': 1})

 Away topscorers Counter({' Nápoles': 6, ' Roma': 6, ' Fiorentina': 4, ' Lazio': 3, ' Atalanta': 3, ' Milan': 3, ' Genoa': 2,

In [22]:
#Function applied to the Ligue 1 2015-16 

top_scorers(jor5)


These teams were the top scorers: 
  Counter({' PSG': 11, ' Toulouse': 6, ' Rennes': 4, ' Niza': 3, ' Troyes': 2, ' Stade de Reims': 2, ' Marsella': 2, ' Saint-Etienne': 2, ' Montpellier': 2, ' Bastia': 1, ' Lyon': 1, ' Mónaco': 1})


 
These teams were the top scorers(considering home and away topscorers each matchday): 
  Counter({' PSG': 15, ' Lyon': 7, ' Bastia': 6, ' Niza': 6, ' Toulouse': 6, ' Rennes': 4, ' Marsella': 3, ' Guingamp': 3, ' Montpellier': 3, ' Girondins': 3, ' Angers': 2, ' Troyes': 2, ' Stade de Reims': 2, ' Caen': 2, ' Mónaco': 2, ' Lorient': 2, ' Nantes': 2, ' Saint-Etienne': 2, ' Lille': 2})

 Home topscorers Counter({' PSG': 9, ' Bastia': 5, ' Lyon': 4, ' Niza': 3, ' Toulouse': 3, ' Marsella': 2, ' Stade de Reims': 2, ' Guingamp': 2, ' Troyes': 1, ' Lorient': 1, ' Nantes': 1, ' Saint-Etienne': 1, ' Girondins': 1, ' Angers': 1, ' Montpellier': 1})

 Away topscorers Counter({' PSG': 6, ' Rennes': 4, ' Niza': 3, ' Lyon': 3, ' Toulouse': 3, ' Caen': 2, ' Mónaco': 

In [23]:
#Function applied to the Spanish Division 2017-18 League (only 30 matchdays)

top_scorers(jor6)


These teams were the top scorers: 
  Counter({' Real Madrid': 9, ' R. Sociedad': 4, ' Valencia': 4, ' Eibar': 3, ' Villarreal': 2, ' Barcelona': 2, ' Sevilla': 2, ' Espanyol': 1, ' Girona': 1, ' Betis': 1})


 
These teams were the top scorers(considering home and away topscorers each matchday): 
  Counter({' Real Madrid': 10, ' Barcelona': 7, ' R. Sociedad': 6, ' Atlético': 6, ' Betis': 5, ' Girona': 5, ' Valencia': 4, ' Celta': 3, ' Eibar': 3, ' Villarreal': 2, ' Sevilla': 2, ' Espanyol': 1, ' Deportivo': 1, ' Getafe': 1, ' Alavés': 1, ' Athletic': 1})

 Home topscorers Counter({' Real Madrid': 6, ' Barcelona': 4, ' R. Sociedad': 3, ' Eibar': 3, ' Betis': 2, ' Valencia': 2, ' Sevilla': 2, ' Girona': 2, ' Atlético': 2, ' Celta': 1, ' Espanyol': 1, ' Villarreal': 1})

 Away topscorers Counter({' Atlético': 4, ' Real Madrid': 4, ' R. Sociedad': 3, ' Betis': 3, ' Girona': 3, ' Barcelona': 3, ' Celta': 2, ' Valencia': 2, ' Villarreal': 1, ' Deportivo': 1, ' Getafe': 1, ' Alavés': 1, ' At