<h1>FIFA Project</h1>
<h3>Goal</h3>
<p>To scrape world cup data from wikipedia and plot them as many various plots</p>

In [1]:
# Wikipedia article is List of FIFA World Cup finals
import pandas as pd
import numpy as np

# Scraped all the tables in the webpage and stored them in a list
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_FIFA_World_Cup_finals')

# This slice contains 3 tables: List of finals of the FIFA World Cup, Results by Nation, Results by confederation
fifa_tables = tables[3:6]
print(type(fifa_tables))
print(fifa_tables[0])


<class 'list'>
               Year          Winners                 Score[2]  \
0              1930          Uruguay                      4–2   
1              1934            Italy             2–1 (a.e.t.)   
2              1938            Italy                      4–2   
3              1950          Uruguay                2–1 [n 3]   
4              1954     West Germany                      3–2   
5              1958           Brazil                      5–2   
6              1962           Brazil                      3–1   
7              1966          England             4–2 (a.e.t.)   
8              1970           Brazil                      4–1   
9              1974     West Germany                      2–1   
10             1978        Argentina             3–1 (a.e.t.)   
11             1982            Italy                      3–1   
12             1986        Argentina                      3–2   
13             1990     West Germany                      1–0   
14        

In [2]:
# Getting a copy of the list of finals table to work with
finals_tabel = fifa_tables[0].copy()

# Changing the index of finals_table to year
finals_tabel.set_index('Year', inplace=True)

# Reindexing to get necessary tables: Winners and Runners-up
finals_tabel_useful = finals_tabel.reindex(columns=['Winners', 'Runners-up'])

# Removing unnecessary rows: Upcoming finals, Year, 2022, 2026
finals_tabel_useful = finals_tabel_useful.drop(index=['Upcoming finals', 'Year', '2022', '2026'])
finals_tabel_useful.replace('West Germany', 'Germany', inplace=True)
print(finals_tabel_useful)

        Winners      Runners-up
Year                           
1930    Uruguay       Argentina
1934      Italy  Czechoslovakia
1938      Italy         Hungary
1950    Uruguay          Brazil
1954    Germany         Hungary
1958     Brazil          Sweden
1962     Brazil  Czechoslovakia
1966    England         Germany
1970     Brazil           Italy
1974    Germany     Netherlands
1978  Argentina     Netherlands
1982      Italy         Germany
1986  Argentina         Germany
1990    Germany       Argentina
1994     Brazil           Italy
1998     France          Brazil
2002     Brazil         Germany
2006      Italy          France
2010      Spain     Netherlands
2014    Germany       Argentina
2018     France         Croatia


In [3]:
# Importing association information
uefa_list = pd.read_html('https://en.wikipedia.org/wiki/UEFA')

# Isolating UEFA member country table
uefa = uefa_list[3].copy()
print(uefa)

   Code             Association                  National teams    Founded  \
0   ALB                 Albania  Men'sU21U19U17FBSWomen'sU19U17       1930   
1   AND                 Andorra  Men'sU21U19U17FBSWomen'sU19U17       1994   
2   ARM                 Armenia  Men'sU21U19U17FBSWomen'sU19U17       1992   
3   AUT                 Austria  Men'sU21U19U17FBSWomen'sU19U17       1904   
4   AZE              Azerbaijan  Men'sU21U19U17FBSWomen'sU19U17       1992   
5   BLR                 Belarus  Men'sU21U19U17FBSWomen'sU19U17       1989   
6   BEL                 Belgium  Men'sU21U19U17FBSWomen'sU19U17       1895   
7   BIH  Bosnia and Herzegovina  Men'sU21U19U17FBSWomen'sU19U17       1946   
8   BUL                Bulgaria  Men'sU21U19U17FBSWomen'sU19U17       1923   
9   CRO                 Croatia  Men'sU21U19U17FBSWomen'sU19U17       1912   
10  CYP                  Cyprus  Men'sU21U19U17FBSWomen'sU19U17       1934   
11  CZE          Czech Republic  Men'sU21U19U17FBSWomen'sU19U17 

In [4]:
# Get useful columns from uefa
uefa_useful = uefa.reindex(columns=['Association', 'Code'])
print(uefa_useful.head())

  Association Code
0     Albania  ALB
1     Andorra  AND
2     Armenia  ARM
3     Austria  AUT
4  Azerbaijan  AZE


In [5]:
# Getting CONMEBOL information
conmebol_tables = pd.read_html('https://en.wikipedia.org/wiki/CONMEBOL')

# Isolating table with country and country code information
conmebol = conmebol_tables[4]

In [6]:
# Reindexing to only have Association and Code columns
conmebol_useful = conmebol.reindex(columns=['Association', 'Code'])
print(conmebol_useful)

  Association Code
0   Argentina  ARG
1     Bolivia  BOL
2      Brazil  BRA
3       Chile  CHI
4    Colombia  COL
5     Ecuador  ECU
6    Paraguay  PAR
7        Peru  PER
8     Uruguay  URU
9   Venezuela  VEN


In [7]:
# Concatenate the 2 association dataframes
associations = pd.concat([conmebol_useful, uefa_useful], keys=['UEFA', 'CONMEBOL'])

# This gives as a dataframe that we merge with fifa tables to get association and country code for each table
associations.reset_index(inplace=True)

# Remove unnecesary columns
associations = associations.drop(columns=['level_1'])

# Renaming columns to more sensible names
associations.rename(columns={
    'level_0':'Association',
    'Association': 'Country'
}, inplace=True)
print(associations)

   Association      Country Code
0         UEFA    Argentina  ARG
1         UEFA      Bolivia  BOL
2         UEFA       Brazil  BRA
3         UEFA        Chile  CHI
4         UEFA     Colombia  COL
..         ...          ...  ...
60    CONMEBOL       Sweden  SWE
61    CONMEBOL  Switzerland  SUI
62    CONMEBOL       Turkey  TUR
63    CONMEBOL      Ukraine  UKR
64    CONMEBOL        Wales  WAL

[65 rows x 3 columns]


In [8]:
# Merge finals_table_useful and associations to form winners and runners up tables
winners_merged = pd.merge(finals_tabel_useful, associations, left_on='Winners', right_on='Country', how='left')
winners_merged = winners_merged.drop(columns=['Runners-up', 'Winners'])

# Repeat to get runners up table
runners_up_merged = pd.merge(finals_tabel_useful, associations, how='left', left_on='Runners-up', right_on='Country')
runners_up_merged = runners_up_merged.drop(columns=['Winners', 'Country'])
# Renaming runners-up column to Country
runners_up_merged.rename(columns={
    'Runners-up': 'Country'
}, inplace=True)

# Concatenate the winners and runners-up dataframes
finals_concatenated = pd.concat([winners_merged, runners_up_merged], axis=1, keys=['Winners', 'Runners Up'])

# Setting years as index of finals_concatenated
finals_concatenated.index = finals_tabel_useful.index
print(finals_concatenated)

         Winners                      Runners Up                 
     Association    Country Code         Country Association Code
Year                                                             
1930        UEFA    Uruguay  URU       Argentina        UEFA  ARG
1934    CONMEBOL      Italy  ITA  Czechoslovakia         NaN  NaN
1938    CONMEBOL      Italy  ITA         Hungary    CONMEBOL  HUN
1950        UEFA    Uruguay  URU          Brazil        UEFA  BRA
1954    CONMEBOL    Germany  GER         Hungary    CONMEBOL  HUN
1958        UEFA     Brazil  BRA          Sweden    CONMEBOL  SWE
1962        UEFA     Brazil  BRA  Czechoslovakia         NaN  NaN
1966    CONMEBOL    England  ENG         Germany    CONMEBOL  GER
1970        UEFA     Brazil  BRA           Italy    CONMEBOL  ITA
1974    CONMEBOL    Germany  GER     Netherlands    CONMEBOL  NED
1978        UEFA  Argentina  ARG     Netherlands    CONMEBOL  NED
1982    CONMEBOL      Italy  ITA         Germany    CONMEBOL  GER
1986      

In [9]:
# Grouping winners_merged and runners_up_merged by County
winners_group = winners_merged.groupby('Country')
runners_group = runners_up_merged.groupby('Country')

# Getting size of the groups and storing in df
number_won = winners_group.size()
number_runner_up = runners_group.size()
number_won.name = 'Won'
number_runner_up.name = 'Runnner Up'

# Concatenating the 2 dataframes
runners_winners_merged = pd.concat([number_won, number_runner_up], join='outer', axis=1)
runners_winners_merged['finals'] = runners_winners_merged.sum(axis=1)
print(runners_winners_merged)
# print(number_won, number_runner_up, sep='\n')

                Won  Runnner Up  finals
Country                                
Argentina       2.0         3.0     5.0
Brazil          5.0         2.0     7.0
England         1.0         NaN     1.0
France          2.0         1.0     3.0
Germany         4.0         4.0     8.0
Italy           4.0         2.0     6.0
Spain           1.0         NaN     1.0
Uruguay         2.0         NaN     2.0
Croatia         NaN         1.0     1.0
Czechoslovakia  NaN         2.0     2.0
Hungary         NaN         2.0     2.0
Netherlands     NaN         3.0     3.0
Sweden          NaN         1.0     1.0


In [10]:
print(number_runner_up)

Country
Argentina         3
Brazil            2
Croatia           1
Czechoslovakia    2
France            1
Germany           4
Hungary           2
Italy             2
Netherlands       3
Sweden            1
Name: Runnner Up, dtype: int64


In [20]:
# Have final_tables_useful - winning country, runners up and year. I want to get years won for every country and years runner up for every county
print(finals_tabel_useful)
# Group df by winning column to get info per country
winning_group = finals_tabel_useful.groupby('Winners')

# Extract index from group and store in a dictionary, key is country name and data is list of years
years_won = {country: group.index.to_list() for country, group in winning_group}

# Convert years_won to a series that I will later concat with runnners_winners_merged
years_won = pd.Series(years_won)
years_won.name = 'Years won'

# Repeat for runners up
runners_up_group = finals_tabel_useful.groupby('Runners-up')
years_runners_up = {country: group.index.to_list() for country, group in runners_up_group}
years_runners_up = pd.Series(years_runners_up, name='Years runners-up')
print(years_runners_up)

Argentina               [1930, 1990, 2014]
Brazil                        [1950, 1998]
Croatia                             [2018]
Czechoslovakia                [1934, 1962]
France                              [2006]
Germany           [1966, 1982, 1986, 2002]
Hungary                       [1938, 1954]
Italy                         [1970, 1994]
Netherlands             [1974, 1978, 2010]
Sweden                              [1958]
Name: Years runners-up, dtype: object


In [21]:
# Concatenate runners_winners_merged with years_won and years_runner_up
results_by_nation = pd.concat([runners_winners_merged, years_won, years_runners_up], axis=1, join='outer')
print(results_by_nation)

                Won  Runnner Up  finals                       Years won  \
Argentina       2.0         3.0     5.0                    [1978, 1986]   
Brazil          5.0         2.0     7.0  [1958, 1962, 1970, 1994, 2002]   
England         1.0         NaN     1.0                          [1966]   
France          2.0         1.0     3.0                    [1998, 2018]   
Germany         4.0         4.0     8.0        [1954, 1974, 1990, 2014]   
Italy           4.0         2.0     6.0        [1934, 1938, 1982, 2006]   
Spain           1.0         NaN     1.0                          [2010]   
Uruguay         2.0         NaN     2.0                    [1930, 1950]   
Croatia         NaN         1.0     1.0                             NaN   
Czechoslovakia  NaN         2.0     2.0                             NaN   
Hungary         NaN         2.0     2.0                             NaN   
Netherlands     NaN         3.0     3.0                             NaN   
Sweden          NaN      