# Data Scraping from SOFIFA.COM

### Import the necessary libraries in Python

In [1]:
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import re
import sqlite3

### Connect to SQLite database

In [2]:
# Open the SQLite database file ../dataset/soccer2017.db (path is relative to the
# notebook's working directory). Every later cell uses this module-level `db` connection.
db = sqlite3.connect(os.path.join('..', 'dataset', 'soccer2017.db'))

Function for creating a BeautifulSoup object

In [3]:
def get_beautifulsoup(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the parsers.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

Function to get the table rows from a BeautifulSoup object

In [4]:
def get_data_rows(soup):
    """Return the data rows of the first ``<table>`` found in ``soup``.

    The first ``<tr>`` of that table is dropped because it holds the column
    headers rather than data.
    """
    # Only the first table on the page is of interest.
    first_table = soup.find_all('table')[0]
    header_and_data = first_table.find_all('tr')
    # Skip the header row, keep everything after it.
    return header_and_data[1:]

## League Data

In [442]:
source_url='https://sofifa.com/leagues'

Functions to Read Data from HTML node

In [443]:
def league_id(row):
    """Return the league id from the first link in the row's second cell, UTF-8 encoded.

    The id is the link's ``href`` with the ``/league/`` prefix removed.
    """
    name_cell = row.find_all('td')[1]
    first_link = name_cell.find_all('a')[0]
    href = first_link.get('href')
    return href.replace('/league/', '').encode('utf-8')
    
def league_name(row):
    """Return the full text of the row's second cell, UTF-8 encoded."""
    name_cell = row.find_all('td')[1]
    label = name_cell.get_text()
    return label.encode('utf-8')

def teams(row):
    """Return the text of the row's third cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[2].get_text().encode('utf-8')

def division(row):
    """Return the single digit written in parentheses in the row's second cell.

    When the cell text contains something like ``(2)`` the digit is returned
    UTF-8 encoded; when no such marker is present the string ``'1'`` is returned.
    """
    label = row.find_all('td')[1].get_text()
    marker = re.search(r'\([0-9]\)', label, re.M | re.I)
    if not marker:
        return '1'
    # The match is always "(d)", so dropping both parentheses leaves the digit.
    digit = marker.group().replace('(', '').replace(')', '')
    return digit.encode('utf-8')

def hits(row):
    """Return the text of the row's fourth cell, UTF-8 encoded."""
    cells = row.find_all('td')
    fourth_cell = cells[3]
    return fourth_cell.get_text().encode('utf-8')

Clear leagues table data before loading new data

In [361]:
# Remove every row from the leagues table before new data is loaded.
cursor = db.cursor()
count = cursor.execute(''' DELETE FROM leagues''')
db.commit()
# .format() must be applied to the message string, not to the result of print();
# the previous form only worked with the Python 2 print statement.
print('Total {0} records has been deleted from leagues table'.format(count.rowcount))

Total 38 records has been deleted from leagues table


Next, loop through all the rows scraped from the leagues page and insert them into the leagues table.

In [362]:
def insert_into_leagues_table(rows):
    """Insert one record into the ``leagues`` table for every scraped row.

    Parameters
    ----------
    rows : iterable
        Table rows of the leagues page, as accepted by ``league_id``,
        ``league_name``, ``teams``, ``division`` and ``hits``.

    Values are bound as SQL parameters. Building the statement from
    ``str(tuple)`` stored repr escapes (``\\xc3...``) for non-ASCII names and
    let quote characters in scraped text alter the statement.

    Uses the module-level ``db`` connection and commits once all rows are inserted.
    """
    def _as_text(value):
        # The accessors return UTF-8 encoded bytes; bind real text instead.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    # NOTE(review): the record number starts at 2, exactly as before
    # (counter initialised to 1, then i+1) - confirm whether 1 was intended.
    records = [
        (
            number,
            _as_text(league_id(row)),
            _as_text(league_name(row)),
            _as_text(teams(row)),
            _as_text(division(row)),
            _as_text(hits(row)),
        )
        for number, row in enumerate(rows, 2)
    ]

    cur = db.cursor()
    cur.executemany("insert into leagues values (?, ?, ?, ?, ?, ?)", records)
    db.commit()

Perform Operations

In [363]:
# Fetch the page at source_url, take the data rows of its first table,
# and insert them into the leagues table.
soup = get_beautifulsoup(source_url)
rows = get_data_rows(soup)
insert_into_leagues_table(rows)

Verify leagues table data

In [444]:
df = pd.io.sql.read_sql(
'''
    SELECT *
    FROM leagues    
    
;
''', con = db)

In [445]:
## Total Records
df.shape[0]

38

### Load Calendar Info

In [7]:
pd.io.sql.read_sql(
'''
SELECT *
    FROM calendar
    
;
''', con = db)

Unnamed: 0,id,season,season_v,year,month,day,value,date


In [8]:
source_url='https://sofifa.com/players/top'

In [9]:
# Fetch the page at source_url and locate the <div id="version-calendar"> element.
# soup.find returns None when no such element exists, which makes the next cell fail.
soup = get_beautifulsoup(source_url)
calender_div = soup.find("div", {"id": "version-calendar"})

In [10]:
# Collect the <div class="column col-4"> elements inside the calendar's "filter-body" div;
# these are the `cards` consumed by the season_* accessors and insert_into_calendar.
cards= calender_div.find('div',{'class':'filter-body'}).find_all('div',{'class':'column col-4'})

In [11]:
def season_name(card):
    """Return the card's ``data-tag`` as an upper-case season label, UTF-8 encoded.

    The ``tag-`` prefix is removed and remaining dashes become spaces,
    e.g. ``tag-fifa-17`` -> ``FIFA 17``.
    """
    tag = card.get('data-tag')
    label = tag.replace('tag-', '').replace('-', ' ')
    return label.upper().encode('utf-8')

def season_name_v(card):
    """Return the last two bytes of the card's encoded season label.

    The label is built from ``data-tag`` exactly as in ``season_name``;
    for ``tag-fifa-17`` the result is ``17``.
    """
    tag = card.get('data-tag')
    encoded_label = tag.replace('tag-', '').replace('-', ' ').upper().encode('utf-8')
    # Slice after encoding so the result is the trailing two bytes.
    return encoded_label[-2:]

def season_year(card):
    """Return the third space-separated token of the card header text, UTF-8 encoded."""
    header = card.find('div', {'class': 'card-header'})
    tokens = header.get_text().split(' ')
    return tokens[2].encode('utf-8')

def season_month(card):
    """Return the first space-separated token of the card header text, UTF-8 encoded."""
    header = card.find('div', {'class': 'card-header'})
    tokens = header.get_text().split(' ')
    return tokens[0].encode('utf-8')

def season_day(card_item):
    """Return the text of a calendar link, UTF-8 encoded."""
    day_text = card_item.get_text()
    return day_text.encode('utf-8')

def season_value(card_item):
    """Return the part after the first dash of the link's first CSS class, UTF-8 encoded."""
    first_class = card_item.get('class')[0]
    pieces = first_class.split('-')
    return pieces[1].encode('utf-8')

In [17]:
# Remove every row from the calendar table before new data is loaded.
cursor = db.cursor()
count = cursor.execute(''' DELETE FROM calendar''')
db.commit()
# .format() must be applied to the message string, not to the result of print();
# the previous form only worked with the Python 2 print statement.
print('Total {0} records has been deleted from calendar table'.format(count.rowcount))

Total 299 records has been deleted from calendar table


In [20]:
def insert_into_calendar(cards):
    """Insert one ``calendar`` row per link found in each season card.

    Parameters
    ----------
    cards : iterable
        Season card elements; each is expected to contain a
        ``<div class="card-body">`` holding one ``<a>`` per calendar entry.

    Processing stops after the card whose ``data-tag`` is ``tag-fifa-07``.
    Afterwards the ``date`` column of every calendar row is rebuilt as
    ``year-month-day`` (month as a number, not zero padded).

    Scraped values are bound as SQL parameters instead of being formatted
    into the statement. Fields that were previously written unquoted
    (season_v, year, day, value) are bound as integers, the others as text.

    Uses the module-level ``db`` connection and commits at the end.
    """
    def _as_text(value):
        # The accessors return UTF-8 encoded bytes; bind real text instead.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    insert_sql = ("insert into calendar(season, season_v, year, month, day, value) "
                  "values (?, ?, ?, ?, ?, ?)")

    cur = db.cursor()
    for card in cards:
        card_body = card.find('div',{'class':'card-body'})
        for card_item in card_body.find_all('a'):
            params = (
                _as_text(season_name(card)),
                int(season_name_v(card)),
                int(season_year(card)),
                _as_text(season_month(card)),
                int(season_day(card_item)),
                int(season_value(card_item)),
            )
            cur.execute(insert_sql, params)

        # Cards after FIFA 07 are not loaded.
        if card.get('data-tag')=='tag-fifa-07':
            break

    # Rebuild the date column for all rows from year, month name and day.
    sql_update_date ="\
    UPDATE calendar\
    SET date = year||'-'||CASE month \
         WHEN 'Jan' THEN 1 \
         WHEN 'Feb' THEN 2 \
         WHEN 'Mar' THEN 3 \
         WHEN 'Apr' THEN 4 \
         WHEN 'May' THEN 5 \
         WHEN 'Jun' THEN 6 \
         WHEN 'Jul' THEN 7 \
         WHEN 'Aug' THEN 8 \
         WHEN 'Sep' THEN 9 \
         WHEN 'Oct' THEN 10 \
         WHEN 'Nov' THEN 11 \
         WHEN 'Dec' THEN 12 \
      END ||'-'|| day "

    cur.execute(sql_update_date)

    db.commit()

In [21]:
# NOTE(review): the remark below says not to run this cell because the calendar table
# is already loaded, yet the call is live - running it inserts the scraped rows again.
# Do not execute the line below if the calendar table has already been loaded.

insert_into_calendar(cards)

In [22]:
# Remove duplicate records.
# NOTE(review): ids 1414 and 1461 are hard-coded; presumably they belong to one
# particular load of the calendar table - verify before re-running after a reload.
cursor = db.cursor()
count = cursor.execute(''' delete from calendar where id in (1414,1461)''')
db.commit()


In [14]:
# Load the calendar rows of the seasons FIFA 13 through FIFA 17; the team
# scraping loop further down iterates over this frame.
df_calendar_last_5_season = pd.io.sql.read_sql(
'''
SELECT *
    FROM calendar
WHERE
    season in ( 'FIFA 17','FIFA 16', 'FIFA 15', 'FIFA 14', 'FIFA 13')
    
;
''', con = db)

In [15]:
df_calendar_last_5_season

Unnamed: 0,id,season,season_v,year,month,day,value,date
0,906,FIFA 17,17,2017,Aug,10,158816,
1,907,FIFA 17,17,2017,Aug,3,158809,
2,908,FIFA 17,17,2017,Jul,31,158806,
3,909,FIFA 17,17,2017,Jul,27,158802,
4,910,FIFA 17,17,2017,Jul,24,158799,
...,...,...,...,...,...,...,...,...
285,1191,FIFA 13,13,2013,Mar,4,157196,
286,1192,FIFA 13,13,2013,Mar,1,157193,
287,1193,FIFA 13,13,2013,Feb,22,157186,
288,1194,FIFA 13,13,2013,Feb,15,157179,


### Teams Data

In [450]:
df = pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
    
;
''', con = db)

In [451]:
df.columns

Index([u'record_id', u'team_id', u'league_id', u'name', u'overall', u'attack',
       u'midfield', u'defence', u'home_stadium', u'rival_team',
       u'international_prestige', u'domestic_prestige', u'transfer_budget',
       u'starting_11_average_age', u'whole_team_average_age', u'captain',
       u'short_free_kick', u'long_free_kick', u'penalties', u'left_corner',
       u'right_corner', u'date_value'],
      dtype='object')

In [479]:
def team_id(row):
    """Return the team id from the first link in the row's first cell, UTF-8 encoded.

    The id is the link's ``href`` with the ``/team/`` prefix removed.
    """
    first_cell = row.find_all('td')[0]
    href = first_cell.find_all('a')[0].get('href')
    return href.replace('/team/', '').encode('utf-8')

def team_name(row):
    """Return the text of the first link in the row's first cell, UTF-8 encoded."""
    first_cell = row.find_all('td')[0]
    name_link = first_cell.find_all('a')[0]
    return name_link.get_text().encode('utf-8')

def team_overall(row):
    """Return the text of the first ``<span>`` in the row's second cell, UTF-8 encoded.

    Note: a later cell of this notebook defines ``team_overall`` again (reading
    the first cell instead); whichever definition ran last is the one in effect.
    """
    second_cell = row.find_all('td')[1]
    rating_span = second_cell.find_all('span')[0]
    return rating_span.get_text().encode('utf-8')

In [453]:
# Remove every row from the teams table before new data is loaded.
cursor = db.cursor()
count = cursor.execute(''' DELETE FROM teams''')
db.commit()
# .format() must be applied to the message string, not to the result of print();
# the previous form only worked with the Python 2 print statement.
print('Total {0} records has been deleted from teams table'.format(count.rowcount))

Total 0 records has been deleted from teams table


In [454]:
def insert_into_teams_table(lg_id, items):
    """Insert one ``teams`` record (id, league, name, overall) per scraped row.

    Parameters
    ----------
    lg_id : int or str
        League id stored in the ``league_id`` column of every inserted record.
    items : iterable
        Table rows of a league page, as accepted by ``team_id``, ``team_name``
        and ``team_overall``.

    Values are bound as SQL parameters. Building the statement from
    ``str(tuple)`` stored repr escapes for non-ASCII names and let quote
    characters in scraped text alter the statement.

    NOTE(review): ``team_overall`` is defined twice in this notebook; the
    definition that ran last is the one used here.

    Uses the module-level ``db`` connection and commits once all rows are inserted.
    """
    def _as_text(value):
        # The accessors return UTF-8 encoded bytes; bind real text instead.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    records = [
        (
            _as_text(team_id(row)),
            _as_text(lg_id),
            _as_text(team_name(row)),
            _as_text(team_overall(row)),
        )
        for row in items
    ]

    cur = db.cursor()
    cur.executemany(
        "insert into teams(team_id, league_id, name, overall) values (?, ?, ?, ?)",
        records)
    db.commit()

<div class="alert alert-info">

**Note:** Executing the cell below will scrape all leagues data. Please use it responsibly.

</div>

### English Premier League Data Only

In [455]:
lg_id=13 ## English Premier League 
# Fetch the league page and keep the data rows of its first table (one row per team).
source_url='https://sofifa.com/league/{0}'.format(lg_id)
soup = get_beautifulsoup(source_url)
epl_rows = get_data_rows(soup)
# Disabled basic load. The earlier form of this comment passed `league_id` (a function)
# and `rows` (the league rows) instead of the values defined in this cell:
# insert_into_teams_table(lg_id, epl_rows)

Validate teams table records

### Load Team profile

In [456]:
df = pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
    
;
''', con = db)

In [457]:
df.head()

Unnamed: 0,record_id,team_id,league_id,name,overall,...,long_free_kick,penalties,left_corner,right_corner,date_value


In [480]:
def team_overall(row):
    """Return the text of the first ``<span>`` in the row's first cell, UTF-8 encoded.

    Note: this redefines the ``team_overall`` declared earlier in the notebook
    (which reads the second cell) and replaces it once this cell has run.
    """
    first_cell = row.find_all('td')[0]
    rating_span = first_cell.find_all('span')[0]
    return rating_span.get_text().encode('utf-8')

def team_attack(row):
    """Return the text of the first ``<span>`` in the row's second cell, UTF-8 encoded."""
    cells = row.find_all('td')
    rating_span = cells[1].find_all('span')[0]
    return rating_span.get_text().encode('utf-8')

def team_midfield(row):
    """Return the text of the first ``<span>`` in the row's third cell, UTF-8 encoded."""
    cells = row.find_all('td')
    rating_span = cells[2].find_all('span')[0]
    return rating_span.get_text().encode('utf-8')

def team_defence(row):
    """Return the text of the first ``<span>`` in the row's fourth cell, UTF-8 encoded."""
    cells = row.find_all('td')
    rating_span = cells[3].find_all('span')[0]
    return rating_span.get_text().encode('utf-8')

In [481]:
def team_home_stadium(ul):
    """Return the third child node of the list's first ``<li>``, stripped and UTF-8 encoded."""
    entry = ul.find_all('li')[0]
    raw_value = entry.contents[2]
    return raw_value.strip().encode('utf-8')

def team_rival_team(ul):
    """Return the text of the first link in the list's second ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[1]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

def team_international_prestige(ul):
    """Return the text of the first ``<span>`` in the list's third ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[2]
    badge = entry.find('span')
    return badge.get_text().encode('utf-8')

def team_domestic_prestige(ul):
    """Return the text of the first ``<span>`` in the list's fourth ``<li>``, UTF-8 encoded.

    The sibling accessors read the profile list in order: home stadium (0),
    rival team (1), international prestige (2), transfer budget (4). This
    accessor previously read index 2 as well, so it returned the international
    prestige; index 3 is the entry in between.
    """
    entry = ul.find_all('li')[3]
    return entry.find('span').get_text().encode('utf-8')

def team_transfer_budget(ul):
    """Return the third child node of the list's fifth ``<li>``, stripped and UTF-8 encoded."""
    entry = ul.find_all('li')[4]
    raw_value = entry.contents[2]
    return raw_value.strip().encode('utf-8')

def team_starting_xi_avg_age(ul):
    """Return the third child node of the list's sixth ``<li>``, stripped and UTF-8 encoded."""
    entry = ul.find_all('li')[5]
    raw_value = entry.contents[2]
    return raw_value.strip().encode('utf-8')

def team_avg_age(ul):
    """Return the third child node of the list's seventh ``<li>``, stripped and UTF-8 encoded."""
    entry = ul.find_all('li')[6]
    raw_value = entry.contents[2]
    return raw_value.strip().encode('utf-8')

def team_captain(ul):
    """Return the text of the first link in the list's eighth ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[7]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

def team_short_free_kick(ul):
    """Return the text of the first link in the list's ninth ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[8]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

def team_long_free_kick(ul):
    """Return the text of the first link in the list's tenth ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[9]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

def team_penalties(ul):
    """Return the text of the first link in the list's eleventh ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[10]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

def team_left_corner(ul):
    """Return the text of the first link in the list's twelfth ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[11]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

def team_right_corner(ul):
    """Return the text of the first link in the list's thirteenth ``<li>``, UTF-8 encoded."""
    entry = ul.find_all('li')[12]
    link = entry.find('a')
    return link.get_text().encode('utf-8')

In [493]:
def insert_team_profile_score(lg_id, tm_id,tm_name, date_value, soup):
    """Insert one ``teams`` record holding a team's ratings for one calendar value.

    Parameters
    ----------
    lg_id : int or str
        League id (stored in ``league_id``).
    tm_id : int, str or bytes
        Team id (stored in ``team_id``).
    tm_name : str or bytes
        Team name; UTF-8 bytes are decoded before being stored.
    date_value : int-like
        Calendar value of the scraped snapshot (stored in ``date_value``).
    soup : BeautifulSoup
        Parsed team page; the ratings are read from the first row of its
        first table via ``team_overall``, ``team_attack``, ``team_midfield``
        and ``team_defence``.

    Values are bound as SQL parameters, so a quote character in the team name
    can no longer break the statement. Fields that were previously written
    unquoted are bound as integers; ``int()`` also turns NumPy integers coming
    from a DataFrame row into plain Python ints that ``sqlite3`` can bind.

    Uses the module-level ``db`` connection and commits after the insert.
    """
    def _as_text(value):
        # team_name() returns UTF-8 encoded bytes; bind real text instead.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    score_row = soup.find_all('table')[0].find_all('tr')[0]

    params = (
        int(tm_id),
        int(lg_id),
        _as_text(tm_name),
        int(team_overall(score_row)),
        int(team_attack(score_row)),
        int(team_midfield(score_row)),
        int(team_defence(score_row)),
        int(date_value),
    )

    cur = db.cursor()
    cur.execute(
        "insert into teams(team_id, league_id, name, overall, attack, midfield, defence, date_value) "
        "values (?, ?, ?, ?, ?, ?, ?, ?)",
        params)
    db.commit()

In [494]:
def update_team_profile(tm_id, date_value, soup):
    """Fill in the profile columns of the ``teams`` record for one team and calendar value.

    Parameters
    ----------
    tm_id : int, str or bytes
        Team id identifying the record (``team_id`` column).
    date_value : int-like
        Calendar value identifying the record (``date_value`` column).
    soup : BeautifulSoup
        Parsed team page; the profile is read from its third ``<ul>`` through
        the ``team_*`` profile accessors.

    The statement uses bound parameters. The previous string-formatted version
    contained a backslash followed by trailing spaces after the ``right_corner``
    assignment, which left a literal ``\\`` in the SQL text, and it interpolated
    scraped values inside double quotes.

    Uses the module-level ``db`` connection and commits after the update.
    """
    def _as_text(value):
        # The accessors return UTF-8 encoded bytes; bind real text instead.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    ul = soup.find_all('ul')[2]

    profile = (
        team_home_stadium(ul),
        team_rival_team(ul),
        team_international_prestige(ul),
        team_domestic_prestige(ul),
        team_transfer_budget(ul),
        team_starting_xi_avg_age(ul),
        team_avg_age(ul),
        team_captain(ul),
        team_short_free_kick(ul),
        team_long_free_kick(ul),
        team_penalties(ul),
        team_left_corner(ul),
        team_right_corner(ul),
    )
    # The key values were written unquoted before, so bind them as integers.
    params = tuple(_as_text(value) for value in profile) + (int(tm_id), int(date_value))

    sql = (
        "update teams set "
        "home_stadium = ?, "
        "rival_team = ?, "
        "international_prestige = ?, "
        "domestic_prestige = ?, "
        "transfer_budget = ?, "
        "starting_11_average_age = ?, "
        "whole_team_average_age = ?, "
        "captain = ?, "
        "short_free_kick = ?, "
        "long_free_kick = ?, "
        "penalties = ?, "
        "left_corner = ?, "
        "right_corner = ? "
        "where team_id = ? and date_value = ?"
    )

    cur = db.cursor()
    cur.execute(sql, params)
    db.commit()
    

Update team scores

In [495]:
import time


In [496]:
# For every team row of the league page, fetch the team page once per calendar
# entry of the selected seasons and store that snapshot's ratings.
# This issues len(epl_rows) * len(df_calendar_last_5_season) requests, one per second.
for row in epl_rows: 
    tm_name = team_name(row)
    tm_id = team_id(row) 
    # `date_value` is the whole calendar row (a Series), not a single value.
    for i,date_value in df_calendar_last_5_season.iterrows():
        source_url="https://sofifa.com/team/{0}?v={1}&e={2}&set=true".format(tm_id, date_value['season_v'], date_value['value'])
        soup = get_beautifulsoup(source_url)   
        insert_team_profile_score(lg_id,tm_id,tm_name,date_value['value'],soup)  
        # Disabled profile update. update_team_profile takes (tm_id, date_value, soup):
        # update_team_profile(tm_id, date_value['value'], soup)
        
        # Pause one second between requests.
        time.sleep(1)

In [None]:
df_calendar_last_5_season

In [497]:
# Read back every teams record stored for league 13 (the lg_id used for the
# English Premier League scrape above).
df_teams_epl=pd.io.sql.read_sql(
'''
SELECT *
    FROM teams
where league_id=13
;
''', con = db)

In [498]:
df_teams_epl

Unnamed: 0,record_id,team_id,league_id,name,overall,...,long_free_kick,penalties,left_corner,right_corner,date_value
0,1115,1,13,Arsenal,83,...,,,,,158816
1,1116,1,13,Arsenal,83,...,,,,,158809
2,1117,1,13,Arsenal,83,...,,,,,158806
3,1118,1,13,Arsenal,83,...,,,,,158802
4,1119,1,13,Arsenal,83,...,,,,,158799
...,...,...,...,...,...,...,...,...,...,...,...
17395,18510,1960,13,Swansea City,74,...,,,,,157196
17396,18511,1960,13,Swansea City,74,...,,,,,157193
17397,18512,1960,13,Swansea City,74,...,,,,,157186
17398,18513,1960,13,Swansea City,74,...,,,,,157179


### Load Squad Data

In [499]:
df=pd.io.sql.read_sql(
'''
SELECT *
    FROM squad

;
''', con = db)

In [500]:
df.columns

Index([u'id', u'team_id', u'player_id', u'name', u'age', u'ova', u'pot',
       u'team_contract', u'value', u'wage', u'total', u'pos', u'date_value'],
      dtype='object')

In [501]:
def player_id(row):
    """Return the player id from the second link in the row's second cell, UTF-8 encoded.

    The id is the link's ``href`` with the ``/player/`` prefix removed.
    """
    player_cell = row.find_all('td')[1]
    profile_link = player_cell.find_all('a')[1]
    href = profile_link.get('href')
    return href.replace('/player/', '').encode('utf-8')

def player_name(row):
    """Return the text of the second link in the row's second cell, UTF-8 encoded."""
    player_cell = row.find_all('td')[1]
    profile_link = player_cell.find_all('a')[1]
    return profile_link.get_text().encode('utf-8')

def player_age(row):
    """Return the text of the row's third cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[2].get_text().encode('utf-8')

def player_ova(row):
    """Return the text of the row's fourth cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[3].get_text().encode('utf-8')

def player_pot(row):
    """Return the text of the row's fifth cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[4].get_text().encode('utf-8')

def player_value(row):
    """Return the text of the row's seventh cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[6].get_text().encode('utf-8')

def player_wage(row):
    """Return the text of the row's eighth cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[7].get_text().encode('utf-8')

def player_total(row):
    """Return the text of the row's ninth cell, UTF-8 encoded."""
    cells = row.find_all('td')
    return cells[8].get_text().encode('utf-8')

In [502]:
df_teams_epl.head()

Unnamed: 0,record_id,team_id,league_id,name,overall,...,long_free_kick,penalties,left_corner,right_corner,date_value
0,1115,1,13,Arsenal,83,...,,,,,158816
1,1116,1,13,Arsenal,83,...,,,,,158809
2,1117,1,13,Arsenal,83,...,,,,,158806
3,1118,1,13,Arsenal,83,...,,,,,158802
4,1119,1,13,Arsenal,83,...,,,,,158799


In [35]:
def insert_into_squad_table(tm_id, items):
    """Print (but do not execute) an insert statement for each squad row.

    tm_id: team id placed first in every value tuple.
    items: iterable of table rows accepted by the ``player_*`` accessors.

    NOTE(review): this function looks unfinished:
      * nothing is written to the database - the statement is only printed
        and the execute call below is commented out (and targets ``teams``);
      * the printed column list names 12 columns while only 9 values are
        collected (no value for team_contract, pos or date);
      * the column is called ``date`` here but ``date_value`` in the squad
        table's column listing shown earlier in the notebook.
    """
    cur = db.cursor()
    
    for row in items: 
        # One value per scraped field, in accessor order.
        values = ( tm_id, player_id(row), player_name(row), player_age(row), player_ova(row), player_pot(row), player_value(row), player_wage(row), player_total(row))
        print("insert into squad\
              ( 'team_id', 'player_id', 'name', 'age', 'ova', 'pot', 'team_contract', 'value', 'wage', 'total', 'pos', 'date')\
              values"+str(values))        
        ##cur.execute("insert into teams('team_id', 'league_id', 'name', 'overall') values"+str(values))
        
    # No statement was executed above, so this commit currently has nothing to persist.
    db.commit()