# Data Scrapping from SOFIFA.COM

### Import the necessary libraries in Python

In [1]:
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import re
import sqlite3

### Connect to SQLite database

In [2]:
db = sqlite3.connect(os.path.join('..', 'dataset', 'soccer2017.db'))

Function for Creating BeautifulSoup object

In [3]:
def get_beautifulsoup(url):
    
    ## Grab the HTML from URL webpage and create a beautifulSoup object.    
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data,'html.parser')
    return soup

Function to get Table rows from Soup object

In [4]:
def get_data_rows(soup):
    
    ## we want to grab the first table from the webpage. 
    ## Since we want information from the first table only, we need to specify the table we want by using index [0]
    table = soup.find_all('table')[0]
    
    ## Next we find all the tr elements, which are all the rows of table first. 
    ## We want to skip the first row because it contain header, so we use [1:] to capture only the rows with data.
    rows = table.find_all('tr')[1:]
    
    return rows

## League Data

In [214]:
source_url='https://sofifa.com/leagues'

Functions to Read Data from HTML node

In [198]:
def league_id(row):
    return row.find_all('td')[1].find_all('a')[0].get('href').replace('/league/','').encode('utf-8')
    
def league_name(row):
    return row.find_all('td')[1].get_text().encode('utf-8')   

def teams(row):
    return row.find_all('td')[2].get_text().encode('utf-8')   

def division(row):
    line= row.find_all('td')[1].get_text()  
    value = re.search(r'\([0-9]\)',  line,re.M|re.I)
    if value:
        return value.group().replace('(','').replace(')','').encode('utf-8')
    else:
        return '1'

def hits(row):
    return row.find_all('td')[3].get_text().encode('utf-8')   

Clear leagues table data before loading new data

In [200]:
cursor = db.cursor()
count = cursor.execute(''' DELETE FROM leagues''')
db.commit()
print('Total {0} records has been deleted from leagues table').format(count.rowcount)

Total 38 records has been deleted from leagues table


Next, loop through all the rows that are in the leagues table, and insert into league table

In [201]:
def insert_into_leagues_table(rows):
    cur = db.cursor()
    i=1
    for row in rows:         
        values = (i+1, league_id(row), league_name(row), teams(row), division(row), hits(row))        
        ##print("insert into leagues values"+str(values))        
        cur.execute("insert into leagues values"+str(values))
        i +=1
        
    db.commit()

Perform Operations

In [202]:
soup = get_beautifulsoup(source_url)
rows = get_data_rows(soup)
insert_into_leagues_table(rows)

Verify leagues table data

In [203]:
df = pd.io.sql.read_sql(
'''
    SELECT *
    FROM leagues    
    
;
''', con = db)

In [204]:
## Total Records
df.shape[0]

38

### Teams Data

In [205]:
df = pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
    
;
''', con = db)

In [206]:
df.columns

Index([u'record_id', u'team_id', u'league_id', u'name', u'overall', u'attack',
       u'midfield', u'defence', u'home_stadium', u'rival_team',
       u'international_prestige', u'domestic_prestige', u'transfer_budget',
       u'starting_11_average_age', u'whole_team_average_age', u'captain',
       u'short_free_kick', u'long_free_kick', u'penalties', u'left_corner',
       u'right_corner'],
      dtype='object')

In [207]:
def team_id(row):    
    return row.find_all('td')[0].find_all('a')[0].get('href').replace('/team/','').encode('utf-8')

def team_name(row):    
    return row.find_all('td')[0].find_all('a')[0].get_text().encode('utf-8') 

def team_overall(row):
    return row.find_all('td')[1].find_all('span')[0].get_text().encode('utf-8') 

In [208]:
cursor = db.cursor()
count = cursor.execute(''' DELETE FROM teams''')
db.commit()
print('Total {0} records has been deleted from teams table').format(count.rowcount)

Total 420 records has been deleted from teams table


In [209]:
def insert_into_teams_table(lg_id, items):
    cur = db.cursor()
    i=1
    for row in items: 
        values = ( team_id(row), lg_id, team_name(row), team_overall(row))
        ##print("insert into team( 'team_id', 'league_id', 'name', 'overall') values"+str(values))        
        cur.execute("insert into teams('team_id', 'league_id', 'name', 'overall') values"+str(values))
        i +=1
    db.commit()

<div class="alert alert-info">

**Note:** Executing below cell will scrap all leagues data. Please use Responsibly.

</div>

In [210]:
## Please convert cell mode to Code to execute below code

items = rows ## rows will have leagues records

for row in items: 
    lg_id = league_id(row)
    source_url='https://sofifa.com/league/{0}'.format(lg_id)
    soup = get_beautifulsoup(source_url)
    rows = get_data_rows(soup)
    insert_into_teams_table(lg_id, rows)

### English Premier League Data Only

In [150]:
league_id=13 ## English Premier League 
source_url='https://sofifa.com/league/{0}'.format(league_id)
soup = get_beautifulsoup(source_url)
rows = get_data_rows(soup)
insert_into_teams_table(league_id, rows)

Validate teams table records

In [7]:
df = pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
    
;
''', con = db)

In [8]:
## Records count
df.shape[0]

693

In [9]:
df.tail()

Unnamed: 0,record_id,team_id,league_id,name,overall,...,short_free_kick,long_free_kick,penalties,left_corner,right_corner
688,1110,113008,2136,Sweden Women,79,...,,,,,
689,1111,113009,2136,United States Women,82,...,,,,,
690,1112,113010,2136,Mexico Women,72,...,,,,,
691,1113,113011,2136,Netherlands Women,76,...,,,,,
692,1114,113012,2136,Spain Women,75,...,,,,,


### Load Team profile

In [40]:
df = pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
    
;
''', con = db)

In [41]:
df.head()

Unnamed: 0,record_id,team_id,league_id,name,overall,...,short_free_kick,long_free_kick,penalties,left_corner,right_corner
0,422,269,1,Br\xc3\xb8ndby IF,83,...,,,,,
1,423,270,1,Silkeborg IF,83,...,,,,,
2,424,271,1,Aarhus GF,83,...,,,,,
3,425,272,1,Odense Boldklub,83,...,,,,,
4,426,819,1,FC K\xc3\xb8benhavn,83,...,,,,,


In [6]:
df.columns

Index([u'record_id', u'team_id', u'league_id', u'name', u'overall', u'attack',
       u'midfield', u'defence', u'home_stadium', u'rival_team',
       u'international_prestige', u'domestic_prestige', u'transfer_budget',
       u'starting_11_average_age', u'whole_team_average_age', u'captain',
       u'short_free_kick', u'long_free_kick', u'penalties', u'left_corner',
       u'right_corner'],
      dtype='object')

In [42]:
def team_overall(row):    
    return row.find_all('td')[0].find_all('span')[0].get_text().encode('utf-8') 

def team_attack(row):    
    return row.find_all('td')[1].find_all('span')[0].get_text().encode('utf-8') 

def team_midfield(row):    
    return row.find_all('td')[2].find_all('span')[0].get_text().encode('utf-8') 

def team_defence(row):
    return row.find_all('td')[3].find_all('span')[0].get_text().encode('utf-8') 

In [222]:
def team_home_stadium(ul):    
    return ul.find_all('li')[0].contents[2].strip().encode('utf-8') 

def team_rival_team(ul):    
    return ul.find_all('li')[1].find('a').get_text().encode('utf-8') 

def team_international_prestige(ul):    
    return ul.find_all('li')[2].find('span').get_text().encode('utf-8') 

def team_domestic_prestige(ul):    
    return ul.find_all('li')[2].find('span').get_text().encode('utf-8') 

def team_transfer_budget(ul):    
    return ul.find_all('li')[4].contents[2].strip().encode('utf-8') 

def team_starting_xi_avg_age(ul):    
    return ul.find_all('li')[5].contents[2].strip().encode('utf-8') 

def team_avg_age(ul):    
    return ul.find_all('li')[6].contents[2].strip().encode('utf-8') 

def team_captain(ul):    
    return ul.find_all('li')[7].find('a').get_text().encode('utf-8') 

def team_short_free_kick(ul):    
    return ul.find_all('li')[8].find('a').get_text().encode('utf-8') 

def team_long_free_kick(ul):    
    return ul.find_all('li')[9].find('a').get_text().encode('utf-8') 

def team_penalties(ul):    
    return ul.find_all('li')[10].find('a').get_text().encode('utf-8') 

def team_left_corner(ul):    
    return ul.find_all('li')[11].find('a').get_text().encode('utf-8') 

def team_right_corner(ul):    
    return ul.find_all('li')[12].find('a').get_text().encode('utf-8') 

In [91]:
def update_team_profile_score(team_id, soup):
    tbl = soup.find_all('table')[0]
    rows = tbl.find_all('tr')
    row=rows[0]
    cur = db.cursor()
    
    cur.execute('update teams set overall = {0}, attack = {1}, midfield = {2}, defence = {3} where team_id = {4}'
                .format(team_overall(row), team_attack(row), team_midfield(row), team_defence(row), team_id))
        
    db.commit()

In [235]:
def update_team_profile(team_id, soup):    
    ul = soup.find_all('ul')[2]  
    
    cur = db.cursor()
    sql =(""" update teams set \
                home_stadium = "{0}", \
                rival_team = "{1}", \
                international_prestige = "{2}", \
                domestic_prestige = "{3}" ,\
                transfer_budget = "{4}",\
                starting_11_average_age ="{5}",\
                whole_team_average_age ="{6}",\
                captain="{7}",\
                short_free_kick ="{8}",\
                long_free_kick = "{9}",\
                penalties = "{10}",\
                left_corner = "{11}",\
                right_corner = "{12}"\
                where team_id = {13} """
                .format(
                    team_home_stadium(ul),
                    team_rival_team(ul), 
                    team_international_prestige(ul), 
                    team_domestic_prestige(ul), 
                    team_transfer_budget(ul), 
                    team_starting_xi_avg_age(ul), 
                    team_avg_age(ul), 
                    team_captain(ul), 
                    team_short_free_kick(ul), 
                    team_long_free_kick(ul), 
                    team_penalties(ul), 
                    team_left_corner(ul), 
                    team_right_corner(ul),
                    team_id))
    
    cur.execute(sql)
    ##print(sql)    
    db.commit()
    

update team score

In [236]:
## EPL League Only
df_epl = df[df.league_id==13]

In [237]:
for i,row in df_epl.iterrows(): 
    tm_id = row['team_id']    
    source_url='https://sofifa.com/team/{0}'.format(tm_id)
    soup = get_beautifulsoup(source_url)   
    update_team_profile_score(tm_id,soup)  
    update_team_profile(tm_id,soup)  

In [239]:
pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
where league_id=13
;
''', con = db)

Unnamed: 0,record_id,team_id,league_id,name,overall,...,short_free_kick,long_free_kick,penalties,left_corner,right_corner
0,488,1,13,Arsenal,83,...,A. Sánchez,M. Özil,A. Sánchez,M. Özil,M. Özil
1,489,5,13,Chelsea,83,...,David Luiz,Marcos Alonso,E. Hazard,Pedro,Pedro
2,490,7,13,Everton,80,...,R. Barkley,R. Barkley,L. Baines,R. Barkley,R. Barkley
3,491,9,13,Liverpool,81,...,Coutinho,J. Henderson,J. Milner,Coutinho,Coutinho
4,492,10,13,Manchester City,83,...,K. De Bruyne,K. De Bruyne,S. Agüero,David Silva,K. De Bruyne
...,...,...,...,...,...,...,...,...,...,...,...
15,503,1799,13,Crystal Palace,77,...,A. Townsend,Y. Cabaye,L. Milivojević,Y. Cabaye,A. Townsend
16,504,1806,13,Stoke City,78,...,M. Arnautović,X. Shaqiri,M. Arnautović,M. Arnautović,X. Shaqiri
17,505,1943,13,Bournemouth,74,...,C. Daniels,C. Daniels,C. Daniels,C. Daniels,C. Daniels
18,506,1952,13,Hull City,75,...,K. Grosicki,S. Clucas,O. Niasse,K. Grosicki,K. Grosicki


In [242]:
df=pd.io.sql.read_sql(
'''
SELECT *
    FROM squad

;
''', con = db)

In [243]:
df.columns

Index([u'id', u'team_id', u'player_id', u'name', u'age', u'ova', u'pot',
       u'team_contract', u'value', u'wage', u'total', u'pos'],
      dtype='object')