In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('olympic_analysis_data.csv')


In [2]:
#1.Number of medals vs. time (trajectory of top few countries over the years)
top_countries = df.groupby('country_name')['total_medal_count'].sum().nlargest(20).index
plt.figure(figsize=(12, 6))
for country in top_countries:
    country_data = df[df['country_name'] == country].sort_values('year')
    plt.plot(country_data['year'], country_data['total_medal_count'], label=country)
plt.title('Number of Medals Over Time for Top 5 Countries')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend()
plt.savefig('medals_over_time.png')
plt.close()

In [4]:
# 2. Correlation heatmap of the predictor variables
predictor_vars = ['population', 'gdp_per_capita', 'life_expectancy', 'pop_20_39_percent', 'urban_population_percent','bmi_mean','area_sq_km','democracy_score', 'hosting_status','total_medal_count']
corr_matrix = df[predictor_vars].corr()
plt.figure(figsize=(18, 18))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)

plt.savefig('correlation_heatmap.pdf')
plt.close()

In [4]:
# 4. Number of medals vs. GDP per capita (scatter plot)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='gdp_per_capita', y='total_medal_count', alpha=0.6)
plt.title('Number of Medals vs GDP per Capita')
plt.xlabel('GDP per Capita')
plt.ylabel('Number of Medals')
plt.xscale('log')  # Using log scale for GDP per capita due to its wide range
plt.savefig('medals_vs_gdp.png')
plt.close()

In [17]:
# 5. Bonus: Box plot of medals for host vs non-host countries
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='hosting_status', y='total_medal_count')

plt.xlabel('Hosting Status (0: Non-Host, 1: Host)')
plt.ylabel('Number of Medals')
plt.savefig('medals_host_vs_nonhost.pdf')
plt.close()


In [12]:
# 6
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='democracy_score', y='total_medal_count')
plt.savefig('medals_democracy.pdf')
plt.close()

In [7]:

features = ['population', 'life_expectancy', 'urban_population_percent', 'bmi_mean', 'area_sq_km', 'democracy_score']

plt.figure(figsize=(14, 10))
for i, feature in enumerate(features, 1):
    plt.subplot(2, 3, i)
    sns.scatterplot(data=df, x=feature, y='total_medal_count', alpha=0.7)
    plt.title(f'Total Medals vs {feature}')
    plt.xlabel(feature)
    plt.ylabel('Total Medals')
plt.tight_layout()
plt.savefig('scatter_plots_features_medals.png')
plt.close()


In [8]:
# Joint plot for GDP per Capita vs Total Medals
sns.jointplot(data=df, x='gdp_per_capita', y='total_medal_count', kind='reg', height=8)
plt.title('GDP per Capita vs Total Medals', loc='left')
plt.savefig('jointplot_gdp_vs_medals.png')
plt.close()

# Joint plot for Population vs Total Medals
sns.jointplot(data=df, x='population', y='total_medal_count', kind='reg', height=8)
plt.title('Population vs Total Medals', loc='left')
plt.savefig('jointplot_population_vs_medals.png')
plt.close()

In [9]:
# Pair plot to explore pairwise relationships among key features and total medals
sns.pairplot(df[['population', 'gdp_per_capita', 'life_expectancy', 'total_medal_count']])
plt.savefig('pairplot_features_medals.png')
plt.close()
print("Graphs have been generated and saved.")

Graphs have been generated and saved.


In [10]:
# Calculate total GDP and add it as a new column
df['total_gdp'] = df['gdp_per_capita'] * df['population']

# Scatter plot of total GDP vs total medal count
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='total_gdp', y='total_medal_count', alpha=0.6)
plt.title('Total GDP vs Total Medal Count')
plt.xlabel('Total GDP')
plt.ylabel('Total Medal Count')
plt.xscale('log')  # Optional: Use log scale for better visualization if GDP values vary widely
plt.savefig('total gdp.png')
plt.close()



In [9]:
df['total_gdp'] = df['gdp_per_capita'] * df['population']
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='total_gdp', y='total_medal_count', alpha=0.6)
plt.xscale('log')
plt.yscale('log')
plt.savefig('log_log_gdp_vs_medals.pdf')
plt.close()


In [12]:
# Create GDP quartiles
df['gdp_quartile'] = pd.qcut(df['total_gdp'], 4, labels=['Low', 'Medium', 'High', 'Very High'])

# Box plot of total medal count by GDP quartile
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='gdp_quartile', y='total_medal_count')
plt.title('Total Medal Count by GDP Quartile')
plt.xlabel('GDP Quartile')
plt.ylabel('Total Medal Count')
plt.savefig('medal_count_by_gdp_quartile.png')
plt.close()


In [13]:
# Create population quartiles for better categorization
df['population_quartile'] = pd.qcut(df['population'], 4, labels=['Low', 'Medium', 'High', 'Very High'])

# Violin plot for population quartiles vs. total medal count
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='population_quartile', y='total_medal_count')
plt.title('Total Medal Count by Population Quartile')
plt.xlabel('Population Quartile')
plt.ylabel('Total Medal Count')
plt.savefig('medals_by_population_quartile.png')
plt.close()


In [14]:
plt.figure(figsize=(10, 6))
sns.regplot(data=df, x='total_gdp', y='total_medal_count', scatter_kws={'alpha':0.5})
plt.xscale('log')  # Log scale for better visualization
plt.title('Regression Plot: Total GDP vs Total Medal Count')
plt.xlabel('Total GDP (Log Scale)')
plt.ylabel('Total Medal Count')
plt.savefig('regression_gdp_vs_medals.png')
plt.close()


In [15]:
# Create bins for population and GDP for heatmap
df['gdp_bin'] = pd.cut(df['total_gdp'], bins=5)
df['population_bin'] = pd.cut(df['population'], bins=5)

# Pivot table for heatmap
heatmap_data = df.pivot_table(index='gdp_bin', columns='population_bin', values='total_medal_count', aggfunc='mean')

plt.figure(figsize=(20, 20))
sns.heatmap(heatmap_data, annot=True, cmap='Blues', fmt='.1f')
plt.title('Heatmap of Medal Count by GDP and Population')
plt.xlabel('Population Bins')
plt.ylabel('GDP Bins')
plt.savefig('heatmap_gdp_population_medals.png')
plt.close()


  heatmap_data = df.pivot_table(index='gdp_bin', columns='population_bin', values='total_medal_count', aggfunc='mean')


In [16]:
plt.figure(figsize=(10, 6))
sns.swarmplot(data=df, x='gdp_quartile', y='total_medal_count', hue='hosting_status', palette='Set2')
plt.title('Total Medals by GDP Quartile and Hosting Status')
plt.xlabel('GDP Quartile')
plt.ylabel('Total Medal Count')
plt.savefig('swarmplot_gdp_vs_medals.png')
plt.close()




In [17]:
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df[df['population'] > df['population'].median()], x='total_medal_count', label='High Population', shade=True)
sns.kdeplot(data=df[df['population'] <= df['population'].median()], x='total_medal_count', label='Low Population', shade=True)
plt.title('Density Plot: Total Medals for High vs Low Population Countries')
plt.xlabel('Total Medal Count')
plt.legend()
plt.savefig('density_medals_by_population.png')
plt.close()



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=df[df['population'] > df['population'].median()], x='total_medal_count', label='High Population', shade=True)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data=df[df['population'] <= df['population'].median()], x='total_medal_count', label='Low Population', shade=True)


In [18]:
# Calculate total GDP if not already done
if 'total_gdp' not in df.columns:
    df['total_gdp'] = df['gdp_per_capita'] * df['population']

# Create scatter plot with total GDP and democracy score, and size markers by total medal count
plt.figure(figsize=(10, 6))
scatter = plt.scatter(data=df, x='total_gdp', y='democracy_score', 
                      s=df['total_medal_count'] * 10,  # Scale medal count for marker size
                      c=df['total_medal_count'], cmap='viridis', alpha=0.6)

plt.xscale('log')  # Log scale for GDP to account for its wide range
plt.colorbar(scatter, label='Total Medal Count')
plt.title('Total GDP and Democracy Score vs Total Medal Count')
plt.xlabel('Total GDP (Log Scale)')
plt.ylabel('Democracy Score')
plt.savefig('total_gdp_vs_democracy_vs_medals.png')
plt.close()


In [19]:
# Calculate total GDP if not already done
if 'total_gdp' not in df.columns:
    df['total_gdp'] = df['gdp_per_capita'] * df['population']

# Create scatter plot with total GDP, democracy score, and color by life expectancy
plt.figure(figsize=(10, 6))
scatter = plt.scatter(data=df, x='total_gdp', y='democracy_score', 
                      s=df['total_medal_count'] * 10,  # Scale medal count for marker size
                      c=df['life_expectancy'], cmap='coolwarm', alpha=0.6)  # Color by life expectancy

plt.xscale('log')  # Log scale for GDP
plt.colorbar(scatter, label='Life Expectancy')
plt.title('Total GDP and Democracy Score vs Total Medal Count (Colored by Life Expectancy)')
plt.xlabel('Total GDP (Log Scale)')
plt.ylabel('Democracy Score')
plt.savefig('total_gdp_democracy_life_expectancy_medals.png')
plt.close()


In [20]:
df['medals_per_capita'] = df['total_medal_count'] / df['population']
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='gdp_per_capita', y='medals_per_capita', alpha=0.6)
plt.xscale('log')  # Log scale for GDP to account for its wide range
plt.colorbar(scatter, label='Total Medal Count')
plt.title('Total GDP per capita vs medals per capita')
plt.xlabel('Gdp per capita')
plt.ylabel('medals per capita')
plt.savefig('gdppercapitavsmedalspercapita.png')
plt.close()

In [21]:
df['population_density'] = df['population'] / df['area_sq_km']
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='population_density', y='total_medal_count', alpha=0.6)
plt.xscale('log')  # Log scale for GDP to account for its wide range
plt.colorbar(scatter, label='Total Medal Count')
plt.title('Medals per pop density')
plt.xlabel('pop density')
plt.ylabel('medals ')
plt.savefig('densitypop vs medals.png')
plt.close()

In [22]:
df['urban_population'] = df['urban_population_percent'] * df['population'] / 100
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='urban_population', y='total_medal_count', alpha=0.6)
plt.xscale('log')  # Log scale for GDP to account for its wide range
plt.colorbar(scatter, label='Total Medal Count')
plt.title('Urban population vs medals')
plt.xlabel('Urban population')
plt.ylabel('medals ')
plt.savefig('urbanpopulation vs medals.png')
plt.close()


In [23]:
df['pop_20_39'] = df['pop_20_39_percent'] * df['population'] / 100
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='pop_20_39', y='total_medal_count', alpha=0.6)
plt.xscale('log')  # Log scale for GDP to account for its wide range
plt.colorbar(scatter, label='Total Medal Count')
plt.title('pop 20-39 vs medals')
plt.xlabel('pop 20-39')
plt.ylabel('medals ')
plt.savefig('pop20-39 vs medals.png')
plt.close()


In [15]:
df['bmi'] = df['bmi_mean']*df['population']
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='bmi', y='total_medal_count', alpha=0.6)
plt.xscale('log')  # Log scale for GDP to account for its wide range
plt.savefig('bmivs_medals.pdf')
plt.close()