# <img style="float: left; padding-right: 10px; width: 45px" src="fig/iacs.png"> Scope of Work for Data Science Final Project

<br/>
<div>
    <b>S-109A Introduction to Data Science</b><br/>
    Harvard University<br/>
    Summer 2018<br/>
    Instructors: Pavlos Protopapas, Kevin Rader
</div>
<div>
    <b>Prepared by Group #28</b>
    <ul style="margin:0;">
        <li>Mark Dinneen</li>
        <li>Mohammad Karim</li>
        <li>Ramandeep Harjai</li>
    </ul>
</div> 
<hr style="display:block; border: 2px solid #b71010;" />

## Supplemental Notebook for Web Scraping

This notebook documents all of the data that was collected using web scraping techniques.

In [8]:
# import the necessary libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
import random

# libraries for scraping data from web
import re
from bs4 import BeautifulSoup
from sys import argv
import requests
from urllib.request import urlopen
from urllib.error import HTTPError

<hr style="display:block; border: 1px solid #b71010;" />
<hr style="display:block; border: 1px solid #b71010;" />

### Historical FIFA Ranking Data
The latest men's FIFA ranking data is available at: https://www.fifa.com/fifa-world-ranking/ranking-table/men/index.html, which was published on `07-June-2018`. The oldest men's FIFA ranking data is available at: https://www.fifa.com/fifa-world-ranking/ranking-table/men/rank=2/index.html, which was published on `08-August-1993`. We plan to parse this data from 288+ pages, which will provide us with the FIFA rankings of the various men's teams for the period between `08-August-1993` and `07-June-2018`.<br/><br/>
<div align="center">
    <img src="fig/ranking.png" width="600" /><br/>
    <small><i>FIFA ranking page for 08-August-1993</i></small>
</div>

In [6]:
# Two-digit month numbers keyed by lower-case English month name.
_MONTH_NUMBERS = {
    "january": "01",
    "february": "02",
    "march": "03",
    "april": "04",
    "may": "05",
    "june": "06",
    "july": "07",
    "august": "08",
    "september": "09",
    "october": "10",
    "november": "11",
    "december": "12",
}


def get_month_number(month_name: str) -> str:
    """Convert a full English month name into its zero-padded month number.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        month_name: full month name, e.g. ``"August"``.

    Returns:
        The two-character month number as a string, ``"01"`` to ``"12"``.

    Raises:
        ValueError: if ``month_name`` is not a full English month name.
            Failing here keeps the declared ``str`` return type honest and
            reports the bad value directly, instead of handing a non-string
            placeholder to callers that concatenate the result into a date.
    """
    key = month_name.strip().lower()
    try:
        return _MONTH_NUMBERS[key]
    except KeyError:
        raise ValueError("Unknown month name: {!r}".format(month_name)) from None

def get_year(rank_date: str) -> str:
    """Return the year component of a ranking date string.

    The date is split on single spaces and the third token is returned
    unchanged, e.g. ``"14 March 2007"`` -> ``"2007"``.

    Raises:
        IndexError: if the string has fewer than three space-separated tokens.
    """
    return rank_date.split(" ")[2]

def format_date(rank_date: str) -> str:
    """Re-order a ``"<day> <month name> <year>"`` string as ``"<year>-<month number>-<day>"``.

    The input is split on single spaces. The day and year tokens are used
    as-is; the month name is converted with ``get_month_number``.

    Raises:
        IndexError: if the string has fewer than three space-separated tokens.
    """
    tokens = rank_date.split(" ")
    day, month_name, year = tokens[0], tokens[1], tokens[2]
    return "-".join([year, get_month_number(month_name), day])

def parse_rank_data() -> list:
    """Scrape the historical men's FIFA ranking tables from fifa.com.

    One page is requested per ``rank=<id>`` value in ``range(2, 288)``. For
    every page the publication date is read from the date slider element and
    one record is produced per ``<tr class="anchor">`` table row.

    Returns:
        list: one ``[date, year, team, rank]`` list per team per page, where
        ``date`` comes from ``format_date``, ``year`` from ``get_year``, and
        ``team`` / ``rank`` are the raw text of the ``tbl-teamname`` /
        ``tbl-rank`` cells.

    Raises:
        requests.exceptions.RequestException: if a request times out, fails,
            or returns an HTTP error status.
        IndexError: if a page contains no date slider element.
    """
    ranking = []
    page_url = "https://www.fifa.com/fifa-world-ranking/ranking-table/men/rank={}/index.html"
    # NOTE(review): range(2, 288) stops at page 287 while the notebook text
    # mentions "288+ pages" — confirm whether page 288 should be included.
    page_ids = range(2, 288)
    last_page_id = max(page_ids)
    request_timeout = 30  # seconds; without a timeout a stalled request blocks forever

    for page_id in page_ids:
        progress = round((page_id / last_page_id) * 100)
        # "\r" with end="" rewrites the same console line on every iteration.
        print("Parsing FIFA ranking data: {}%\r".format(progress), end="", flush=True)

        rank_page = requests.get(page_url.format(page_id), timeout=request_timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        rank_page.raise_for_status()

        page_soup = BeautifulSoup(rank_page.text, 'html.parser')
        rank_date = page_soup.findAll("ul", {"class":["slider-list","items-1"]})[0].text

        # The date is the same for every row on a page, so format it once.
        date = format_date(rank_date)
        year = get_year(rank_date)

        for rank_tag in page_soup.findAll("tr", {"class":"anchor"}):
            rank = rank_tag.find("td", {"class":"tbl-rank"}).text
            team = rank_tag.find("td", {"class":"tbl-teamname"}).text
            ranking.append([date, year, team, rank])

    return ranking

# Un-comment the lines below to re-run the data scraping.
# They are left disabled so that running this cell only reads the cached
# pickle instead of calling parse_rank_data(), which issues one HTTP request
# per ranking page.
# fifa_ranking = parse_rank_data()  # get FIFA ranking data from fifa.com website
# df_ranking = pd.DataFrame(fifa_ranking, columns=["date", "year", "team", "rank"]) # create dataframe
# df_ranking.to_pickle("datasets/ranking_data.pkl") # store dataframe to local disk

# Read back the previously scraped dataset from the local pickle file.
# NOTE(review): pickle files should only be loaded from a trusted source.
df_ranking = pd.read_pickle("datasets/ranking_data.pkl")
# Print the (rows, columns) shape of the loaded dataframe.
print("\ndf_ranking = ", df_ranking.shape, "\n")
# Show 10 randomly chosen rows; no random_state is passed, so the rows
# displayed differ from run to run.
df_ranking.sample(10)



df_ranking =  (57793, 4) 



Unnamed: 0,date,year,team,rank
29624,2007-03-14,2007,Romania,14
2466,1994-12-20,1994,Malawi,82
13262,2000-04-12,2000,Samoa,183
14642,2000-11-15,2000,St Kitts and Nevis,146
53579,2016-11-24,2016,France,7
2533,1994-12-20,1994,Guatemala,149
22041,2004-01-14,2004,Croatia,20
32770,2008-06-04,2008,Tunisia,50
41035,2011-11-23,2011,Hungary,37
15411,2001-03-14,2001,Syria,103


<hr style="display:block; border: 1px solid #b71010;" />
<hr style="display:block; border: 1px solid #b71010;" />

### Historical FIFA World Cup Winners
The historical FIFA World Cup finals data is available at: https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_finals, which was last updated on `25-July-2018`. We'll scrape this data to build a dataset of historical FIFA World Cup winners.<br/><br/>
<div align="center">
    <img src="fig/fifa_results.png" width="600" /><br/>
    <small><i>Historical FIFA World Cup Final Match Results</i></small>
</div>

In [103]:
def parse_past_winners() -> list:
    """Scrape the list of FIFA World Cup finals from Wikipedia.

    The third ``<tbody>`` on the page is used as the finals table. Its first
    row and its last four rows are skipped.

    Returns:
        list: one list per final, in the order
        ``[year, team1, team2, team1_score, team2_score, winner, venue,
        city, country, attendance]``. All values are strings. ``team1`` and
        ``winner`` are both read from the first data cell of the row, and
        ``attendance`` has its thousands separators removed.

    Raises:
        requests.exceptions.RequestException: if the request times out, fails,
            or returns an HTTP error status.
        IndexError / AttributeError: if the page layout does not match the
            cell positions this parser expects.
    """
    url = "https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_finals"
    # A timeout prevents a stalled connection from hanging the notebook.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    page_soup = BeautifulSoup(page.text, 'html.parser')
    table_soup = page_soup.findAll("tbody")[2]
    row_soup = table_soup.findAll("tr")

    results = []
    # Index 0 and the last four rows are excluded from the results.
    for row_index in range(1, len(row_soup) - 4):
        row = row_soup[row_index]
        # Look the cells up once per row instead of once per extracted field.
        cells = row.findAll("td")

        year = re.findall(r"[0-9]+", row.find("th").text)[0]
        winner = cells[0].find("a").text
        runner_up = cells[2].find("a").text
        # The score link text holds both teams' goals; take the first two numbers.
        goals = re.findall(r"[0-9]+", cells[1].find("a").text)
        venue = cells[3].findAll("a")[0].text
        location_links = cells[4].findAll("a")
        city = location_links[0].text
        country = location_links[1].text
        attendance = re.findall(r"[0-9,]+", cells[5].text)[0].replace(',', '')

        results.append([year, winner, runner_up, goals[0], goals[1],
                        winner, venue, city, country, attendance])

    return results

# Un-comment the lines below to re-run the data scraping.
# They are left disabled so that running this cell only reads the cached
# pickle instead of requesting the Wikipedia page again. When enabled, they
# build the dataframe from parse_past_winners(), cast each column to its
# intended dtype, rename "West Germany" to "Germany" and save the result.
# colnames = ['year','team1','team2','team1_score','team2_score',
#             'winner','venue','city','country','attendance']
# df_fifa_finals = pd.DataFrame.from_records(parse_past_winners(), columns=colnames)
# df_fifa_finals["year"] = df_fifa_finals["year"].astype(int)
# df_fifa_finals["team1"] = df_fifa_finals["team1"].astype(str)
# df_fifa_finals["team2"] = df_fifa_finals["team2"].astype(str)
# df_fifa_finals["team1_score"] = df_fifa_finals["team1_score"].astype(int)
# df_fifa_finals["team2_score"] = df_fifa_finals["team2_score"].astype(int)
# df_fifa_finals["winner"] = df_fifa_finals["winner"].astype(str)
# df_fifa_finals["venue"] = df_fifa_finals["venue"].astype(str)
# df_fifa_finals["city"] = df_fifa_finals["city"].astype(str)
# df_fifa_finals["country"] = df_fifa_finals["country"].astype(str)
# df_fifa_finals["attendance"] = df_fifa_finals["attendance"].astype(int)
# df_fifa_finals = df_fifa_finals.replace("West Germany","Germany")
# df_fifa_finals.to_pickle("datasets/fifa_finals_data.pkl") # store dataframe to local disk

# Read back the previously scraped dataset from the local pickle file.
# NOTE(review): pickle files should only be loaded from a trusted source.
df_fifa_finals = pd.read_pickle("datasets/fifa_finals_data.pkl")
# Print the (rows, columns) shape of the loaded dataframe.
print("\ndf_fifa_finals = ", df_fifa_finals.shape, "\n")
# Display the dataframe as the cell output.
df_fifa_finals


df_fifa_finals =  (21, 10) 



Unnamed: 0,year,team1,team2,team1_score,team2_score,winner,venue,city,country,attendance
0,1930,Uruguay,Argentina,4,2,Uruguay,Estadio Centenario,Montevideo,Uruguay,80000
1,1934,Italy,Czechoslovakia,2,1,Italy,Stadio Nazionale PNF,Rome,Italy,50000
2,1938,Italy,Hungary,4,2,Italy,Stade Olympique de Colombes,Paris,France,45000
3,1950,Uruguay,Brazil,2,1,Uruguay,Estádio do Maracanã,Rio de Janeiro,Brazil,199854
4,1954,Germany,Hungary,3,2,Germany,Wankdorf Stadium,Bern,Switzerland,60000
5,1958,Brazil,Sweden,5,2,Brazil,Råsunda Stadium,Solna,Sweden,51800
6,1962,Brazil,Czechoslovakia,3,1,Brazil,Estadio Nacional,Santiago,Chile,69000
7,1966,England,Germany,4,2,England,Wembley Stadium,London,England,93000
8,1970,Brazil,Italy,4,1,Brazil,Estadio Azteca,Mexico City,Mexico,107412
9,1974,Germany,Netherlands,2,1,Germany,Olympiastadion,Munich,Germany,75200


In [119]:
# Count the finals won by each team (number of 'year' entries per winner)
# and list the teams with the most titles first.
(
    df_fifa_finals
    .groupby("winner")
    .agg({'year': 'count'})
    .sort_values(by='year', ascending=False)
)

Unnamed: 0_level_0,year
winner,Unnamed: 1_level_1
Brazil,5
Germany,4
Italy,4
Argentina,2
France,2
Uruguay,2
England,1
Spain,1


<hr style="display:block; border: 1px solid #b71010;" />
<hr style="display:block; border: 1px solid #b71010;" />

### FIFA World Cup &mdash; All Time Team Rankings
The FIFA all-time team rankings and associated team statistics are available at https://www.fifa.com/fifa-tournaments/statistics-and-records/worldcup/teams/index.html. We'll scrape this data to build a dataset of FIFA team rankings.<br/><br/>
<div align="center">
    <img src="fig/all_time_ranking.png" width="600" /><br/>
    <small><i>FIFA World Cup &mdash; All Time Team Rankings</i></small>
</div>

In [141]:
def parse_team_rankings() -> list:
    """Scrape the FIFA World Cup all-time team ranking table from fifa.com.

    The second table returned by the ``table`` / ``tbl-alltimeranking`` class
    filter is used, and one record is produced per row of its ``<tbody>``.

    Returns:
        list: one list of 11 strings per table row, taken from cells
        0, 1, 3, 4, 5, 6, 7, 8, 9, 10 and 11 in that order (cell 2 is
        skipped). The notebook labels them rank, team, points, matches, win,
        draw, lost, goal_for, goal_against, points_avg and appearances.

    Raises:
        requests.exceptions.RequestException: if the request times out, fails,
            or returns an HTTP error status.
        IndexError / AttributeError: if the page layout does not match the
            cell positions this parser expects.
    """
    url = "https://www.fifa.com/fifa-tournaments/statistics-and-records/worldcup/teams/index.html"
    # A timeout prevents a stalled connection from hanging the notebook.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    page_soup = BeautifulSoup(page.text, 'html.parser')
    table_soup = page_soup.findAll("table", {"class":["table","tbl-alltimeranking"]})[1].find("tbody")
    row_soup = table_soup.findAll("tr")

    results = []
    for row in row_soup:
        # Look the cells up once per row instead of once per extracted field.
        cells = row.findAll("td")

        result = [
            cells[0].text,
            cells[1].find("span", {"class":"t-nText"}).text,
        ]
        # Cells 3 through 9 all keep their value in a <span class="text">.
        for cell_index in range(3, 10):
            result.append(cells[cell_index].find("span", {"class":"text"}).text)
        # Cell 10 is read directly from the cell text, without a span lookup.
        result.append(cells[10].text)
        result.append(cells[11].find("span", {"class":"text"}).text)
        results.append(result)

    return results

# Un-comment the lines below to re-run the data scraping.
# They are left disabled so that running this cell only reads the cached
# pickle instead of requesting the fifa.com page again. When enabled, they
# build the dataframe from parse_team_rankings(), cast the numeric columns
# (points_avg to float, the rest to int) and save the result.
# colnames = ['rank','team','points','matches','win','draw','lost',
#             'goal_for','goal_against','points_avg', 'appearances']
# df_fifa_wc_ranking = pd.DataFrame.from_records(parse_team_rankings(), columns=colnames)
# df_fifa_wc_ranking["rank"] = df_fifa_wc_ranking["rank"].astype(int)
# df_fifa_wc_ranking["points"] = df_fifa_wc_ranking["points"].astype(int)
# df_fifa_wc_ranking["matches"] = df_fifa_wc_ranking["matches"].astype(int)
# df_fifa_wc_ranking["win"] = df_fifa_wc_ranking["win"].astype(int)
# df_fifa_wc_ranking["draw"] = df_fifa_wc_ranking["draw"].astype(int)
# df_fifa_wc_ranking["lost"] = df_fifa_wc_ranking["lost"].astype(int)
# df_fifa_wc_ranking["goal_for"] = df_fifa_wc_ranking["goal_for"].astype(int)
# df_fifa_wc_ranking["goal_against"] = df_fifa_wc_ranking["goal_against"].astype(int)
# df_fifa_wc_ranking["points_avg"] = df_fifa_wc_ranking["points_avg"].astype(float)
# df_fifa_wc_ranking["appearances"] = df_fifa_wc_ranking["appearances"].astype(int)
# df_fifa_wc_ranking.to_pickle("datasets/fifa_wc_ranking.pkl") # store dataframe to local disk

# Read back the previously scraped dataset from the local pickle file.
# NOTE(review): pickle files should only be loaded from a trusted source.
df_fifa_wc_ranking = pd.read_pickle("datasets/fifa_wc_ranking.pkl")
# Print the (rows, columns) shape of the loaded dataframe.
print("\ndf_fifa_wc_ranking = ", df_fifa_wc_ranking.shape, "\n")
# Display the first 10 rows as the cell output.
df_fifa_wc_ranking.head(10)



df_fifa_wc_ranking =  (77, 11) 



Unnamed: 0,rank,team,points,matches,win,draw,lost,goal_for,goal_against,points_avg,appearances
0,1,Brazil,227,104,70,17,17,221,102,2.2,20
1,2,Germany,218,106,66,20,20,224,121,2.1,18
2,3,Italy,156,83,45,21,17,128,77,1.9,18
3,4,Argentina,140,77,42,14,21,131,84,1.8,16
4,5,Spain,99,59,29,12,18,92,66,1.7,14
5,6,England,98,62,26,20,16,79,56,1.6,14
6,7,France,96,59,28,12,19,106,71,1.6,14
7,8,Netherlands,93,50,27,12,11,86,48,1.9,10
8,9,Uruguay,72,51,20,12,19,80,71,1.4,12
9,10,Sweden,61,46,16,13,17,74,69,1.3,11


<hr style="display:block; border: 1px solid #b71010;" />
<hr style="display:block; border: 1px solid #b71010;" />

### FIFA World Cup &mdash; Participations
The FIFA World Cup team participations and associated team statistics are available at https://www.fifa.com/fifa-tournaments/statistics-and-records/worldcup/teams/index.html. We'll scrape this data to build a dataset of team participations in the FIFA World Cup tournament.<br/><br/>
<div align="center">
    <img src="fig/wc_participation.png" width="600" /><br/>
    <small><i>FIFA World Cup &mdash; Team Participations</i></small>
</div>

In [147]:
def parse_team_participations() -> list:
    """Scrape the FIFA World Cup team participation table from fifa.com.

    The third table returned by the ``table`` / ``tbl-alltimeranking`` class
    filter is used, and one record is produced per row of its ``<tbody>``.

    Returns:
        list: one list of 3 strings per table row, taken from cells 0, 2 and
        3 in that order (cell 1 is skipped). The notebook labels them team,
        participations and years.

    Raises:
        requests.exceptions.RequestException: if the request times out, fails,
            or returns an HTTP error status.
        IndexError / AttributeError: if the page layout does not match the
            cell positions this parser expects.
    """
    url = "https://www.fifa.com/fifa-tournaments/statistics-and-records/worldcup/teams/index.html"
    # A timeout prevents a stalled connection from hanging the notebook.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    page_soup = BeautifulSoup(page.text, 'html.parser')
    table_soup = page_soup.findAll("table", {"class":["table","tbl-alltimeranking"]})[2].find("tbody")
    row_soup = table_soup.findAll("tr")

    results = []
    for row in row_soup:
        # Look the cells up once per row instead of once per extracted field.
        cells = row.findAll("td")
        results.append([
            cells[0].find("span", {"class":"t-nText"}).text,
            cells[2].find("span", {"class":"text"}).text,
            cells[3].find("span", {"class":"text"}).text,
        ])

    return results

# Un-comment the lines below to re-run the data scraping.
# They are left disabled so that running this cell only reads the cached
# pickle instead of requesting the fifa.com page again. When enabled, they
# build the dataframe from parse_team_participations(), cast the
# participations column to int and save the result.
# colnames = ['team','participations','years']
# df_fifa_participations = pd.DataFrame.from_records(parse_team_participations(), columns=colnames)
# df_fifa_participations["participations"] = df_fifa_participations["participations"].astype(int)
# df_fifa_participations.to_pickle("datasets/fifa_participations.pkl") # store dataframe to local disk

# Read back the previously scraped dataset from the local pickle file.
# NOTE(review): pickle files should only be loaded from a trusted source.
df_fifa_participations = pd.read_pickle("datasets/fifa_participations.pkl")
# Print the (rows, columns) shape of the loaded dataframe.
print("\ndf_fifa_participations = ", df_fifa_participations.shape, "\n")
# Display the first 10 rows as the cell output.
df_fifa_participations.head(10)



df_fifa_participations =  (77, 3) 



Unnamed: 0,team,participations,years
0,Brazil,20,"1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966..."
1,Germany,18,"1934, 1938, 1954, 1958, 1962, 1966, 1970, 1974..."
2,Italy,18,"1934, 1938, 1950, 1954, 1962, 1966, 1970, 1974..."
3,Argentina,16,"1930, 1934, 1958, 1962, 1966, 1974, 1978, 1982..."
4,Mexico,15,"1930, 1950, 1954, 1958, 1962, 1966, 1970, 1978..."
5,England,14,"1950, 1954, 1958, 1962, 1966, 1970, 1982, 1986..."
6,France,14,"1930, 1934, 1938, 1954, 1958, 1966, 1978, 1982..."
7,Spain,14,"1934, 1950, 1962, 1966, 1978, 1982, 1986, 1990..."
8,Belgium,12,"1930, 1934, 1938, 1954, 1970, 1982, 1986, 1990..."
9,Uruguay,12,"1930, 1950, 1954, 1962, 1966, 1970, 1974, 1986..."


In [120]:
from IPython.core.display import HTML
def css_styling():
    """Load the notebook stylesheet and wrap it for rich display.

    Reads ``cs109.css`` from the current working directory and returns its
    contents as an ``IPython`` ``HTML`` object.

    Raises:
        OSError: if ``cs109.css`` cannot be opened.
    """
    # The context manager closes the file handle even if reading fails.
    with open("cs109.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


# Last expression of the cell, so the returned HTML is rendered as its output.
css_styling()