In [17]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from scipy.stats import linregress

# Columns kept from the cleaned CSV; everything else is discarded.
keep_cols = [
    'Unnamed: 0', 'ID', 'Name', "Team", 'Games', 'Sport', 'Event', 'Medal',
    'region', 'Population', 'Area (sq km)', 'GDP',
]
df = pd.read_csv("cleandata/clean_df.csv")[keep_cols]

# A missing medal is recorded as the explicit category "no".
df["Medal"] = df["Medal"].fillna(value="no")

# Show how many rows fall into each medal category.
df["Medal"].value_counts()

no        179226
Bronze      9665
Gold        9411
Silver      9261
Name: Medal, dtype: int64

In [18]:
# One-hot encode the medal outcome into Medals_Bronze / Medals_Silver /
# Medals_Gold / Medals_no indicator columns.
medals = pd.DataFrame({
    "Medals" : df["Medal"]
})

# dtype=int forces 0/1 integer indicators. Newer pandas releases return bool
# columns by default, and adding bool Series is a logical OR, which would turn
# the "Attempts"/"Wins" sums below into True/False instead of counts.
medals = pd.get_dummies(medals, dtype=int)
df["Bronze"] = medals["Medals_Bronze"]
df["Silver"] = medals["Medals_Silver"]
df["Gold"] = medals["Medals_Gold"]
df["No Win"] = medals["Medals_no"]

# Attempts counts every row (exactly one indicator is set per row);
# Wins counts only rows that carry a medal.
df["Attempts"] = df["Bronze"] + df["Silver"] + df["Gold"] + df["No Win"]
df["Wins"] = df["Bronze"] + df["Silver"] + df["Gold"]

# Drop every character that is not a digit or '.' before converting GDP to float.
df["GDP"] = df["GDP"].replace('[^.0-9]', '', regex=True).astype(float)
df.head(1)

Unnamed: 0.1,Unnamed: 0,ID,Name,Team,Games,Sport,Event,Medal,region,Population,Area (sq km),GDP,Bronze,Silver,Gold,No Win,Attempts,Wins
0,0,1,A Dijiang,China,1992 Summer,Basketball,Basketball Men's Basketball,no,China,1182230625,9326410,427000000000.0,0,0,0,1,1,0


In [19]:
# Minimum medal counts for a row to be kept in the final tables.
MIN_TOTAL_WINS = 20      # per region, across all Games (country_list)
MIN_WINS_PER_GAMES = 1   # per region and Games (country_group)


def _agg_spec(medal_func):
    """Build the aggregation mapping shared by every grouping in this cell.

    Population, area and GDP are always averaged; the attempt/win/medal
    columns are combined with ``medal_func`` ("max" or "sum"). Key order is
    kept identical to the original cell so the output column order is stable.
    """
    return {
        "Population" : "mean",
        "Attempts" : medal_func,
        "Wins" : medal_func,
        "Area (sq km)" : "mean",
        "GDP" : "mean",
        "Bronze" : medal_func,
        "Silver" : medal_func,
        "Gold" : medal_func,
    }


# Collapse athlete rows: taking the max of the 0/1 indicators means each
# (Team, Games, Sport, Event, region) group contributes at most one attempt
# and one medal of each colour.
team_group = df.groupby(
    by = ["Team", "Games", "Sport", "Event", "region"], as_index= False
).agg(_agg_spec("max"))

# Totals per region across all Games.
country_list = team_group.groupby(["region"], as_index= False).agg(_agg_spec("sum"))

# Totals per region for each individual Games, ordered by medal count.
country_group = (
    team_group.groupby(["region", "Games"], as_index= False)
    .agg(_agg_spec("sum"))
    .sort_values(by = "Wins")
)

# Element-wise pandas division does not raise on a zero denominator (it yields
# inf/NaN), so no try/except is needed; a missing column should fail loudly
# here rather than be silently ignored.
country_group["Win Rate"] = country_group["Wins"] / country_group["Attempts"]
country_group["GDP Per Capita"] = country_group["GDP"] / country_group["Population"]

country_list = country_list.loc[country_list["Wins"] >= MIN_TOTAL_WINS]
country_group = country_group.loc[country_group["Wins"] >= MIN_WINS_PER_GAMES]


In [20]:
# Regions to iterate over, plus one empty accumulator per statistic that the
# per-country regression loop fills in.
country = country_list["region"].unique()

# r**2 with medal count ("Wins") as the dependent variable
r2_pop, r2_gdp, r2_gdp_per_capita = [], [], []

# r**2 with "Win Rate" as the dependent variable
r2_pop_MR, r2_gdp_MR, r2_gdp_per_capita_MR = [], [], []

# number of rows available for each country
count_r2_gdp_pc_MR = []


In [21]:
def _fit_r2(frame, x_col, y_col, title=None, plot_path=None):
    """Return r**2 of a linear regression of ``frame[y_col]`` on ``frame[x_col]``.

    Parameters
    ----------
    frame : DataFrame holding both columns.
    x_col : name of the independent-variable column (cast to float).
    y_col : name of the dependent-variable column.
    title : plot title, used only when ``plot_path`` is given.
    plot_path : if not None, a scatter plot with the fitted line is saved here.

    Exceptions raised by ``linregress`` or by saving the figure propagate to
    the caller.
    """
    x_val = frame[x_col].astype(float)
    y_val = frame[y_col]
    slope, intercept, rvalue, _pvalue, _stderr = linregress(x_val, y_val)

    if plot_path is not None:
        fig, ax = plt.subplots()
        try:
            ax.scatter(x_val, y_val)
            ax.plot(x_val, x_val * slope + intercept, "r-")
            ax.set_xlabel(x_col)
            ax.set_ylabel('Medals')
            ax.set_title(title)
            fig.savefig(plot_path)
        finally:
            # Always release the figure; this loop creates two per country.
            plt.close(fig)

    return rvalue**2


# Accumulators in the order the per-country result tuple is built below.
_result_lists = (
    r2_pop,
    r2_gdp,
    r2_gdp_per_capita,
    r2_pop_MR,
    r2_gdp_MR,
    r2_gdp_per_capita_MR,
    count_r2_gdp_pc_MR,
)

# Start from empty lists so re-running this cell does not append a second
# copy of every country.
for _results in _result_lists:
    _results.clear()

for x in country:
    try:
        c_loop = country_group.loc[country_group["region"] == f'{x}']
        row = (
            # medal count as the dependent variable
            _fit_r2(
                c_loop, "Population", "Wins",
                title=f'{x} - Population vs Medals',
                plot_path=f'breakdowns/countryplots/Pop/{x}xPopulation.png',
            ),
            _fit_r2(c_loop, "GDP", "Wins"),
            _fit_r2(
                c_loop, "GDP Per Capita", "Wins",
                title=f'{x} - GDP Per Capita vs Medals',
                plot_path=f'breakdowns/countryplots/GDP_PC/{x}xGDP Per Capita.png',
            ),
            # win rate as the dependent variable
            _fit_r2(c_loop, "Population", "Win Rate"),
            _fit_r2(c_loop, "GDP", "Win Rate"),
            _fit_r2(c_loop, "GDP Per Capita", "Win Rate"),
            len(c_loop),
        )
    except Exception as exc:
        # Best effort: a country that cannot be fitted or plotted is recorded
        # as all zeros, but the reason is reported instead of being hidden.
        print(f'Skipping {x}: {exc!r}')
        row = (0, 0, 0, 0, 0, 0, 0)

    # Append only after every value is known, so each list gains exactly one
    # entry per country even when a later regression fails.
    for _results, _value in zip(_result_lists, row):
        _results.append(_value)

<Figure size 432x288 with 0 Axes>

In [22]:
# Every accumulator must hold exactly one entry per country, otherwise the
# summary DataFrame cannot be built from them.
r2_result_lists = [
    r2_pop,
    r2_gdp,
    r2_gdp_per_capita,
    r2_pop_MR,
    r2_gdp_MR,
    r2_gdp_per_capita_MR,
    count_r2_gdp_pc_MR,
]
for results in r2_result_lists:
    print(len(results))
print(len(country))

# Fail here with a clear message instead of relying on a visual comparison.
assert all(len(results) == len(country) for results in r2_result_lists), \
    "result lists are out of step with the country list"

57
57
57
57
57
57
57
57


In [26]:
# One row per country: the six r**2 values plus the number of rows they were
# fitted on.
summary_columns = {
    "Country": country,
    "R2 Population": r2_pop,
    "R2 GDP": r2_gdp,
    "R2 GDP PC": r2_gdp_per_capita,
    "R2 WR Population": r2_pop_MR,
    "R2 WR GDP": r2_gdp_MR,
    "R2 WR GDP PC": r2_gdp_per_capita_MR,
    "# of Data Points" : count_r2_gdp_pc_MR,
}
Country_Data = pd.DataFrame(summary_columns)
Country_Data = Country_Data.sort_values(by = "R2 GDP PC", ascending = True)

# Keep only countries with at least 20 rows behind their regressions.
has_enough_points = Country_Data["# of Data Points"] >= 20
Country_Data = Country_Data.loc[has_enough_points]
Country_Data.head(20)

Unnamed: 0,Country,R2 Population,R2 GDP,R2 GDP PC,R2 WR Population,R2 WR GDP,R2 WR GDP PC,# of Data Points
27,Japan,0.002932,0.00848,0.007585,0.062677,0.006264,0.007122,25
1,Australia,0.026869,0.02174,0.02637,1e-06,0.001119,0.001464,21
49,Sweden,0.056738,0.046909,0.044386,0.000731,0.0078,0.004565,30
54,USA,0.065312,0.068698,0.06844,0.000169,0.003051,0.00147,29
25,Italy,0.025928,0.068938,0.070576,0.027835,0.00348,0.002372,30
47,South Korea,0.201236,0.098296,0.107307,0.37435,0.192706,0.20832,20
53,UK,0.152206,0.120517,0.110536,0.125776,0.112851,0.101586,25
16,Finland,0.103276,0.146123,0.139279,0.109474,0.122445,0.119452,30
39,Norway,0.15413,0.137646,0.148078,2.5e-05,0.000387,0.000696,28
2,Austria,0.170539,0.18054,0.183152,0.016274,0.021858,0.022509,28


In [24]:
# Pairwise correlations between the numeric columns. The string columns
# ("region", "Games") are excluded explicitly because newer pandas releases no
# longer drop non-numeric columns inside corr() and raise instead.
country_group.select_dtypes(include="number").corr()

Unnamed: 0,Population,Attempts,Wins,Area (sq km),GDP,Bronze,Silver,Gold,Win Rate,GDP Per Capita
Population,1.0,0.18493,0.245531,0.401656,0.371238,0.221747,0.236968,0.26581,0.128409,-0.094434
Attempts,0.18493,1.0,0.787962,0.32998,0.398832,0.786657,0.736134,0.67624,0.189945,0.220239
Wins,0.245531,0.787962,1.0,0.464474,0.485387,0.95873,0.967006,0.957377,0.579503,0.184902
Area (sq km),0.401656,0.32998,0.464474,1.0,0.350591,0.437378,0.459175,0.472684,0.325822,-0.019076
GDP,0.371238,0.398832,0.485387,0.350591,1.0,0.456453,0.474158,0.467896,0.246707,0.349198
Bronze,0.221747,0.786657,0.95873,0.437378,0.456453,1.0,0.89705,0.87441,0.552763,0.174672
Silver,0.236968,0.736134,0.967006,0.459175,0.474158,0.89705,1.0,0.918895,0.582203,0.181615
Gold,0.26581,0.67624,0.957377,0.472684,0.467896,0.87441,0.918895,1.0,0.587114,0.148489
Win Rate,0.128409,0.189945,0.579503,0.325822,0.246707,0.552763,0.582203,0.587114,1.0,0.120225
GDP Per Capita,-0.094434,0.220239,0.184902,-0.019076,0.349198,0.174672,0.181615,0.148489,0.120225,1.0
