In [1]:
# import dependencies
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [2]:
# relative locations of the three raw CSV inputs
world_cups_csv, wc_matches_csv, wc_players_csv = (
    "Data/WorldCups.csv",
    "Data/WorldCupMatches.csv",
    "Data/WorldCupPlayers.csv",
)

In [3]:
# load each CSV into its own DataFrame, in the same order as the paths
world_cups_df, wc_matches_df, wc_players_df = (
    pd.read_csv(csv_path)
    for csv_path in (world_cups_csv, wc_matches_csv, wc_players_csv)
)

### World Cups Data

In [4]:
# 'Attendance' is text containing '.' characters (apparently thousands
# separators); remove them literally before casting to int.
# regex=False is explicit on purpose: interpreted as a regular expression,
# '.' matches every character and would blank out the whole value, and the
# default for `regex` differs between pandas versions.
world_cups_df['Attendance'] = (
    world_cups_df['Attendance']
    .str.replace('.', '', regex=False)
    .astype(int)
)

In [5]:
# label the runner-up column 'Second'
world_cups_df = world_cups_df.rename(columns={'Runners-Up': 'Second'})

In [6]:
# show the cleaned World Cups table
world_cups_df

Unnamed: 0,Year,Country,Winner,Second,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1045246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1563135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1603975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1865753


### World Cup Matches

In [7]:
# discard every row that has a missing value in any column, then renumber
wc_matches_df = (
    wc_matches_df
    .dropna()
    .reset_index(drop=True)
)

In [8]:
# keep only the first row seen for each MatchID, then renumber
wc_matches_df = (
    wc_matches_df
    .drop_duplicates(subset='MatchID')
    .reset_index(drop=True)
)

In [9]:
def _to_numeric_or_keep(column):
    """Return `column` downcast to the smallest integer dtype, or unchanged.

    Mirrors the behaviour of ``pd.to_numeric(..., errors='ignore')`` -- a
    column that cannot be parsed as numbers is returned as-is -- without
    relying on the deprecated ``errors='ignore'`` option.
    """
    try:
        return pd.to_numeric(column, downcast='integer')
    except (ValueError, TypeError):
        return column


# convert every column that parses as numeric; leave the others untouched
wc_matches_df = wc_matches_df.apply(_to_numeric_or_keep)

In [10]:
# parse the 'Datetime' column into pandas timestamps (column position unchanged)
wc_matches_df = wc_matches_df.assign(
    Datetime=pd.to_datetime(wc_matches_df['Datetime'])
)

In [11]:
# split the timestamp into separate 'Date' and 'Time' columns placed at
# positions 2 and 3, then remove the combined 'Datetime' column
wc_matches_df.insert(2, 'Date', wc_matches_df['Datetime'].dt.date)
wc_matches_df.insert(3, 'Time', wc_matches_df['Datetime'].dt.time)
wc_matches_df = wc_matches_df.drop(columns='Datetime')

In [12]:
# expose the team initials under 'Home Team' / 'Away Team' (positions 6 and 10),
# then drop the original name and initials columns in one step
wc_matches_df.insert(6, 'Home Team', wc_matches_df['Home Team Initials'])
wc_matches_df.insert(10, 'Away Team', wc_matches_df['Away Team Initials'])
wc_matches_df = wc_matches_df.drop(columns=[
    'Home Team Name',
    'Away Team Name',
    'Away Team Initials',
    'Home Team Initials',
])

In [13]:
# remove the half-time scores and the round identifier
wc_matches_df = wc_matches_df.drop(columns=[
    'Half-time Home Goals',
    'Half-time Away Goals',
    'RoundID',
])

In [14]:
# move 'MatchID' to be the first column: pop() removes it from its current
# position and insert() places the same values at index 0
wc_matches_df.insert(0, 'MatchID', wc_matches_df.pop('MatchID'))

In [15]:
# replace spaces in column names with underscores (and capitalise 'Conditions')
wc_matches_df = wc_matches_df.rename(
    columns={
        'Home Team': 'Home_Team',
        'Away Team': 'Away_Team',
        'Home Team Goals': 'Home_Team_Goals',
        'Away Team Goals': 'Away_Team_Goals',
        'Win conditions': 'Win_Conditions',
        'Assistant 1': 'Assistant_1',
        'Assistant 2': 'Assistant_2',
    }
)

In [16]:
# preview the first rows of the cleaned matches table
wc_matches_df.head()

Unnamed: 0,MatchID,Year,Date,Time,Stage,Stadium,City,Home_Team,Home_Team_Goals,Away_Team_Goals,Away_Team,Win_Conditions,Attendance,Referee,Assistant_1,Assistant_2
0,1096,1930,1930-07-13,15:00:00,Group 1,Pocitos,Montevideo,FRA,4,1,MEX,,4444,LOMBARDI Domingo (URU),CRISTOPHE Henry (BEL),REGO Gilberto (BRA)
1,1090,1930,1930-07-13,15:00:00,Group 4,Parque Central,Montevideo,USA,3,0,BEL,,18346,MACIAS Jose (ARG),MATEUCCI Francisco (URU),WARNKEN Alberto (CHI)
2,1093,1930,1930-07-14,12:45:00,Group 2,Parque Central,Montevideo,YUG,2,1,BRA,,24059,TEJADA Anibal (URU),VALLARINO Ricardo (URU),BALWAY Thomas (FRA)
3,1098,1930,1930-07-14,14:50:00,Group 3,Pocitos,Montevideo,ROU,3,1,PER,,2549,WARNKEN Alberto (CHI),LANGENUS Jean (BEL),MATEUCCI Francisco (URU)
4,1085,1930,1930-07-15,16:00:00,Group 1,Parque Central,Montevideo,ARG,1,0,FRA,,23409,REGO Gilberto (BRA),SAUCEDO Ulises (BOL),RADULESCU Constantin (ROU)


### World Cup Players

In [17]:
# remove the columns that are not carried into the players table
wc_players_df = wc_players_df.drop(columns=[
    'Coach Name',
    'Line-up',
    'Shirt Number',
    'Position',
])

In [18]:
# exclude all player rows for MatchID 300186460, then renumber
# NOTE(review): the reason this specific match is excluded is not recorded here - confirm
wc_players_df = (
    wc_players_df
    .loc[wc_players_df['MatchID'].ne(300186460)]
    .reset_index(drop=True)
)

In [19]:
# treat a missing 'Event' as an empty string so it can be parsed as text
wc_players_df = wc_players_df.fillna({'Event': ''})

In [20]:
# append one zero-initialised counter column per event type
wc_players_df = wc_players_df.assign(
    Goals=0,
    Own_Goals=0,
    Yellow_Card=0,
    Red_Card=0,
    Second_Yellow_Card=0,
    Penalty=0,
    Missed_Penalty=0,
)

In [21]:
# Tally each player's match events from the space-separated 'Event' text.
# A token is attributed to a column according to the code it starts with.
# No code below is a prefix of another, so every token is counted at most
# once, exactly as in an if/elif chain over the same prefixes.
# NOTE(review): if the raw data encodes second-yellow dismissals as 'RSY...',
# those tokens start with 'R' and land in Red_Card, not Second_Yellow_Card -
# confirm against the CSV.
event_prefix_by_column = {
    'Goals': 'G',
    'Own_Goals': 'OG',
    'Yellow_Card': 'Y',
    'Red_Card': 'R',
    'Second_Yellow_Card': 'SY',
    'Penalty': 'P',
    'Missed_Penalty': 'MP',
}
for stat_column, event_prefix in event_prefix_by_column.items():
    # (?<![^ ]) matches only at the start of the string or right after a
    # space, i.e. at the beginning of a token. Assigning (rather than
    # incrementing row by row) is vectorised and keeps the cell idempotent
    # when it is re-run.
    wc_players_df[stat_column] = (
        wc_players_df['Event'].str.count('(?<![^ ])' + event_prefix)
    )

In [22]:
# the raw event text and the round identifier are no longer needed
wc_players_df = wc_players_df.drop(columns=['Event', 'RoundID'])

In [23]:
# build a "<player name>_<match id>" key and place it as the first column
player_match_key = wc_players_df['Player Name'].str.cat(
    wc_players_df['MatchID'].astype(str), sep='_'
)
wc_players_df.insert(0, 'Player_MatchID', player_match_key)

In [24]:
# keep the first row for each player/match key, then renumber
wc_players_df = (
    wc_players_df
    .drop_duplicates(subset='Player_MatchID')
    .reset_index(drop=True)
)

In [25]:
# replace spaces in the remaining column names with underscores
wc_players_df = wc_players_df.rename(
    columns={
        'Player Name': 'Player_Name',
        'Team Initials': 'Team_Initials',
    }
)

In [26]:
# preview the first rows of the cleaned per-match players table
wc_players_df.head()

Unnamed: 0,Player_MatchID,MatchID,Team_Initials,Player_Name,Goals,Own_Goals,Yellow_Card,Red_Card,Second_Yellow_Card,Penalty,Missed_Penalty
0,Alex THEPOT_1096,1096,FRA,Alex THEPOT,0,0,0,0,0,0,0
1,Oscar BONFIGLIO_1096,1096,MEX,Oscar BONFIGLIO,0,0,0,0,0,0,0
2,Marcel LANGILLER_1096,1096,FRA,Marcel LANGILLER,1,0,0,0,0,0,0
3,Juan CARRENO_1096,1096,MEX,Juan CARRENO,1,0,0,0,0,0,0
4,Ernest LIBERATI_1096,1096,FRA,Ernest LIBERATI,0,0,0,0,0,0,0


### Unique Players

In [27]:
# Alternative kept for reference: grouping on both player name and team initials
# (to keep the country in the table) produces duplicate rows for a player whose
# nation appears under more than one team entry, e.g. Germany vs Germany FR.
# unique_players_df = wc_players_df.groupby(['Player_Name','Team_Initials'], as_index=False).sum()
# del unique_players_df['MatchID']

In [28]:
# Sum the per-match event counters into one row per player name.
# The counter columns are selected explicitly: summing the whole frame relies
# on pandas silently dropping the text columns, which newer pandas versions no
# longer do (string columns get concatenated instead), and it also needlessly
# sums MatchID only to delete it afterwards.
# NOTE(review): grouping on the name alone merges different players who share
# the same 'Player_Name' - confirm this is acceptable.
stat_columns = [
    'Goals',
    'Own_Goals',
    'Yellow_Card',
    'Red_Card',
    'Second_Yellow_Card',
    'Penalty',
    'Missed_Penalty',
]
unique_players_df = (
    wc_players_df
    .groupby('Player_Name', as_index=False)[stat_columns]
    .sum()
)

In [29]:
# order players from most to fewest goals, then renumber
unique_players_df = (
    unique_players_df
    .sort_values(by='Goals', ascending=False)
    .reset_index(drop=True)
)

In [30]:
# preview the top scorers
unique_players_df.head()

Unnamed: 0,Player_Name,Goals,Own_Goals,Yellow_Card,Red_Card,Second_Yellow_Card,Penalty,Missed_Penalty
0,KLOSE,16,0,3,1,0,0,0
1,RONALDO,16,0,3,0,0,1,0
2,Just FONTAINE,13,0,0,0,0,0,0
3,Gerd MUELLER,13,0,2,0,0,1,0
4,PEL� (Edson Arantes do Nascimento),12,0,0,0,0,0,0


### Push DataFrames to SQL Database and Display Them

In [31]:
# Build the PostgreSQL engine. The "user:password@host:port/database" part is
# read from the FIFA_DB_CONNECTION environment variable so real credentials do
# not have to be committed with the notebook; when the variable is unset the
# original local-development value is used.
rds_connection_string = os.environ.get(
    "FIFA_DB_CONNECTION",
    "postgres:postgres@localhost:5432/FIFA_World_Cup",
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [32]:
# Confirm tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of table names.
inspect(engine).get_table_names()

['world_cups', 'wc_matches', 'wc_players', 'unique_players']

In [33]:
# append the World Cups rows (DataFrame index included) to the world_cups table
world_cups_df.to_sql(
    name="world_cups",
    con=engine,
    if_exists='append',
    index=True,
)

In [34]:
# append the match rows (DataFrame index included) to the wc_matches table
wc_matches_df.to_sql(
    name="wc_matches",
    con=engine,
    if_exists='append',
    index=True,
)

In [35]:
# append the per-match player rows (DataFrame index included) to the wc_players table
wc_players_df.to_sql(
    name="wc_players",
    con=engine,
    if_exists='append',
    index=True,
)

In [36]:
# append the per-player totals (DataFrame index included) to the unique_players table
unique_players_df.to_sql(
    name="unique_players",
    con=engine,
    if_exists='append',
    index=True,
)

In [37]:
# read the world_cups table back from the database and preview it
pd.read_sql_query(
    sql='select * from world_cups',
    con=engine,
).head()

Unnamed: 0,index,Year,Country,Winner,Second,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance,data_date
0,0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590549,2019-08-18 15:47:10.045940
1,1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363000,2019-08-18 15:47:10.045940
2,2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375700,2019-08-18 15:47:10.045940
3,3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1045246,2019-08-18 15:47:10.045940
4,4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768607,2019-08-18 15:47:10.045940


In [38]:
# read the wc_matches table back from the database and preview it
pd.read_sql_query(
    sql='select * from wc_matches',
    con=engine,
).head()

Unnamed: 0,index,MatchID,Year,Date,Time,Stage,Stadium,City,Home_Team,Home_Team_Goals,Away_Team_Goals,Away_Team,Win_Conditions,Attendance,Referee,Assistant_1,Assistant_2,data_date
0,0,1096,1930,1930-07-13,15:00:00,Group 1,Pocitos,Montevideo,FRA,4,1,MEX,,4444,LOMBARDI Domingo (URU),CRISTOPHE Henry (BEL),REGO Gilberto (BRA),2019-08-18 15:47:10.271444
1,1,1090,1930,1930-07-13,15:00:00,Group 4,Parque Central,Montevideo,USA,3,0,BEL,,18346,MACIAS Jose (ARG),MATEUCCI Francisco (URU),WARNKEN Alberto (CHI),2019-08-18 15:47:10.271444
2,2,1093,1930,1930-07-14,12:45:00,Group 2,Parque Central,Montevideo,YUG,2,1,BRA,,24059,TEJADA Anibal (URU),VALLARINO Ricardo (URU),BALWAY Thomas (FRA),2019-08-18 15:47:10.271444
3,3,1098,1930,1930-07-14,14:50:00,Group 3,Pocitos,Montevideo,ROU,3,1,PER,,2549,WARNKEN Alberto (CHI),LANGENUS Jean (BEL),MATEUCCI Francisco (URU),2019-08-18 15:47:10.271444
4,4,1085,1930,1930-07-15,16:00:00,Group 1,Parque Central,Montevideo,ARG,1,0,FRA,,23409,REGO Gilberto (BRA),SAUCEDO Ulises (BOL),RADULESCU Constantin (ROU),2019-08-18 15:47:10.271444


In [39]:
# read the wc_players table back from the database and preview it
pd.read_sql_query(
    sql='select * from wc_players',
    con=engine,
).head()

Unnamed: 0,index,Player_MatchID,MatchID,Team_Initials,Player_Name,Goals,Own_Goals,Yellow_Card,Red_Card,Second_Yellow_Card,Penalty,Missed_Penalty,data_date
0,0,Alex THEPOT_1096,1096,FRA,Alex THEPOT,0,0,0,0,0,0,0,2019-08-18 15:47:10.966976
1,1,Oscar BONFIGLIO_1096,1096,MEX,Oscar BONFIGLIO,0,0,0,0,0,0,0,2019-08-18 15:47:10.966976
2,2,Marcel LANGILLER_1096,1096,FRA,Marcel LANGILLER,1,0,0,0,0,0,0,2019-08-18 15:47:10.966976
3,3,Juan CARRENO_1096,1096,MEX,Juan CARRENO,1,0,0,0,0,0,0,2019-08-18 15:47:10.966976
4,4,Ernest LIBERATI_1096,1096,FRA,Ernest LIBERATI,0,0,0,0,0,0,0,2019-08-18 15:47:10.966976


In [40]:
# read the unique_players table back from the database and preview it
pd.read_sql_query(
    sql='select * from unique_players',
    con=engine,
).head()

Unnamed: 0,index,Player_Name,Goals,Own_Goals,Yellow_Card,Red_Card,Second_Yellow_Card,Penalty,Missed_Penalty,data_date
0,0,KLOSE,16,0,3,1,0,0,0,2019-08-18 15:47:19.343763
1,1,RONALDO,16,0,3,0,0,1,0,2019-08-18 15:47:19.343763
2,2,Just FONTAINE,13,0,0,0,0,0,0,2019-08-18 15:47:19.343763
3,3,Gerd MUELLER,13,0,2,0,0,1,0,2019-08-18 15:47:19.343763
4,4,PEL� (Edson Arantes do Nascimento),12,0,0,0,0,0,0,2019-08-18 15:47:19.343763
