In [1]:
# Import dependencies
import json
import pandas as pd
import sqlite3
import pprint as pp

# Create the connection string to the sqllite database
con = sqlite3.connect("db/olympic_data.db")

# Using SQL, select all records from the athlete_events table
sql = f"""
    SELECT * FROM athlete_events
    """
# Read the table into a PANDAS dataframe
ath_df = pd.read_sql(sql, con)
ath_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [2]:
# Grabbing only the columns needed for the sunburst viz
clean_ath_df = ath_df[['Season','Sport','Team','Medal','Year']]
clean_ath_df.head()

Unnamed: 0,Season,Sport,Team,Medal,Year
0,Summer,Basketball,China,,1992
1,Summer,Judo,China,,2012
2,Summer,Football,Denmark,,1920
3,Summer,Tug-Of-War,Denmark/Sweden,Gold,1900
4,Winter,Speed Skating,Netherlands,,1988


In [3]:
# Filtering data for 2016 only to test the "to_json" function
filt_clean_df = clean_ath_df.loc[clean_ath_df['Year'] >= 2014]
fcdf = filt_clean_df.reset_index(drop=True)

In [4]:
# Convert the dataframe into a json object
# j = filt_clean_df.to_json()

In [5]:
# Print the json object
# print(json.dumps(json.loads(j), indent=2, sort_keys=False)).head()

In [6]:
# Extracting unique values from each column of the dataframe and putting them in a list
unique_seasons = fcdf.Season.unique()
unique_sports = fcdf.Sport.unique()
unique_teams = fcdf.Team.unique()
unique_medals = fcdf.Medal.unique()
unique_years = fcdf.Year.unique()

seasons_list = unique_seasons.tolist()
sports_list = unique_sports.tolist()
teams_list = unique_teams.tolist()
medals_list = unique_medals.tolist()
years_list = unique_years.tolist()

In [7]:
# Testing creating format of JSON object we need manually...
newDict = {}
newDict.update({"name":"Olympics"})
newDict.update({"children":[
    {"name":unique_seasons[0]},
    {"name":unique_seasons[1]}
]})


In [8]:
print(json.dumps(newDict, indent=4))

{
    "name": "Olympics",
    "children": [
        {
            "name": "Winter"
        },
        {
            "name": "Summer"
        }
    ]
}


In [9]:
# Sort columns to be extracted into CSV
sort_clean_ath_df = clean_ath_df.sort_values(['Season', 'Sport', 'Team', 'Medal', 'Year'], ascending=[True, True, True, True, True])
sort_clean_ath_df

Unnamed: 0,Season,Sport,Team,Medal,Year
214105,Summer,Aeronautics,Switzerland,Gold,1936
213142,Summer,Alpinism,Germany,Gold,1932
213208,Summer,Alpinism,Germany,Gold,1932
60639,Summer,Alpinism,Switzerland,Gold,1936
60641,Summer,Alpinism,Switzerland,Gold,1936
244966,Summer,Archery,American Samoa,,2000
18304,Summer,Archery,Argentina,,1988
180139,Summer,Archery,Argentina,,1988
46941,Summer,Archery,Australia,Bronze,2004
192202,Summer,Archery,Australia,Bronze,2016


In [10]:
# Convert to csv
sort_clean_ath_df.to_csv("clean_athlete_data.csv",encoding="utf-8", index=False)

In [11]:
##############################################
## New Tables for Medals by Country by Year ##
##############################################

In [12]:
removeNone = ath_df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
removeNone.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
40,16,Juhamatti Tapio Aaltonen,M,28.0,184.0,85.0,Finland,FIN,2014 Winter,2014,Winter,Sochi,Ice Hockey,Ice Hockey Men's Ice Hockey,Bronze
41,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
42,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold
44,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Horse Vault,Gold
48,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Pommelled Horse,Gold


In [13]:
removeNone["Medal"].value_counts()

Gold      10167
Bronze    10148
Silver     9866
Name: Medal, dtype: int64

In [14]:
removeNone["NOC"].value_counts()

USA    4383
URS    2246
GER    1612
AUS    1206
RUS    1134
ITA    1060
CAN    1060
GBR    1031
GDR     995
FRA     987
CHN     985
JPN     843
HUN     791
SWE     765
FIN     724
NED     708
ROU     597
FRG     582
KOR     561
POL     548
NOR     514
BRA     447
ESP     431
CUB     394
TCH     375
SUI     345
BUL     328
YUG     321
AUT     273
DEN     255
       ... 
ECU       2
UAE       2
LIB       2
TAN       2
MOZ       2
AFG       2
SYR       2
KUW       2
GUA       1
MRI       1
CYP       1
NIG       1
TOG       1
HAI       1
SRI       1
GAB       1
ERI       1
GUY       1
SEN       1
ZAM       1
KOS       1
ISV       1
BOT       1
BAR       1
MKD       1
JOR       1
TGA       1
DJI       1
BER       1
SUD       1
Name: NOC, Length: 143, dtype: int64

In [15]:
# New Dataframe to count medals by Country
countryMedals = removeNone[["NOC","Medal"]]
gr_countryMedals = countryMedals.groupby(["NOC"])
newDf = pd.DataFrame(gr_countryMedals["Medal"].count())
ri_newDf = newDf.reset_index()
ri_newDf

Unnamed: 0,NOC,Medal
0,AFG,2
1,ALG,15
2,ANZ,4
3,ARG,220
4,ARM,15
5,AUS,1206
6,AUT,273
7,AZE,44
8,BAH,40
9,BAR,1


In [16]:
# New DataFrame to count medals by Country by Year
countryMedalsByYear = removeNone[["NOC","Year","Medal"]]
gr2_countryMedals = countryMedalsByYear.groupby(["NOC","Year"])
newDf2 = pd.DataFrame(gr2_countryMedals["Medal"].count())
newDf2
sri_newDf2 = newDf2.sort_values(['Year','NOC']).reset_index()
cleanDf2 = sri_newDf2.groupby(['Year','NOC']).sum().groupby(level=[1]).cumsum()
reset_cleanDf2 = cleanDf2.reset_index()
reset_cleanDf2





Unnamed: 0,Year,NOC,Medal
0,1896,GBR,2
1,1896,GER,6
2,1896,USA,12
3,1900,AUS,4
4,1900,CAN,2
5,1900,FRA,4
6,1900,GBR,6
7,1900,HUN,3
8,1900,USA,33
9,1904,CAN,5


In [17]:
con = sqlite3.connect("db/olympic_data.db")
newDf.to_sql("medals_by_country", con, if_exists="replace", index=False)
reset_cleanDf2.to_sql("medals_by_country_by_year", con, if_exists="replace",index=False)

con.commit()
con.close()

In [18]:
# Create CSVs - this was for testing
# ri_newDf.to_csv("medals_by_country.csv",encoding="utf-8", index=False)
# reset_cleanDf2.to_csv("medals_by_country_by_year.csv",encoding="utf-8", index=False)