In [83]:
import pandas as pd 
import plotly.express as px
import numpy as np


co2_label = "CO2 emissions (metric tons per capita)"
gdp_label = "gdpPercap"

gapminder_df = pd.read_csv('./gapminder_clean.csv')

gapminder_df.head()

Unnamed: 0.1,Unnamed: 0,Country Name,Year,"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),Domestic credit provided by financial sector (% of GDP),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",GDP growth (annual %),Imports of goods and services (% of GDP),"Industry, value added (% of GDP)","Inflation, GDP deflator (annual %)","Life expectancy at birth, total (years)",Population density (people per sq. km of land area),"Services, etc., value added (% of GDP)",pop,continent,gdpPercap
0,0,Afghanistan,1962,,0.073781,21.276422,,,4.878051,7.45,,9.349593,,,33.219902,14.312061,,10267083.0,Asia,853.10071
1,1,Afghanistan,1967,,0.123782,9.917662,,,6.772908,7.45,,14.209827,,,35.389415,15.881812,,11537966.0,Asia,836.197138
2,2,Afghanistan,1972,,0.13082,18.880833,,,14.763231,7.45,,18.10585,,,37.610146,17.947027,,13079460.0,Asia,739.981106
3,3,Afghanistan,1977,,0.183118,13.836822,,,11.662904,7.449,,14.823175,,,40.110146,19.998926,,14880372.0,Asia,786.11336
4,4,Afghanistan,1982,,0.165879,,,,,7.45,,,,,43.230732,19.402324,,12881816.0,Asia,978.011439


In [84]:
gapminder_df_1962 = gapminder_df[gapminder_df['Year'] == 1962]
gapminder_df_1962 = gapminder_df_1962[[co2_label, gdp_label, 'Country Name', 'pop', 'continent']].dropna(subset=[co2_label, gdp_label])

px.scatter(gapminder_df_1962, x=co2_label, y=gdp_label, color="continent",
                 title="CO2 Emission Per Capita v/s GDP Per Capita")

In [85]:
from scipy.stats import pearsonr

dropped_df = gapminder_df[[co2_label, gdp_label]].dropna()
pearson_corr, p_value = pearsonr(dropped_df[co2_label].values, dropped_df[gdp_label].values)

print('The Correlation between GDP and CO2 is: \n')
print('Pearsons Correlation: ' + str(pearson_corr))
print('p value: ' + str(p_value))

The Correlation between GDP and CO2 is: 

Pearsons Correlation: 0.813291861531547
p value: 2.9309176165291103e-280


In [86]:
# Find the year with the highest correlation 

def group_results(group):
    if len(group[co2_label].values) < 2: 
        return pd.Series({'pvalue': None, 'pearson_corr': None})
    
    pearson_corr, p_value = pearsonr(group[co2_label], group[gdp_label])
    return pd.Series({'pvalue': p_value, 'pearson_corr': pearson_corr })

dropped_df = gapminder_df[['Year', gdp_label, co2_label]].dropna()

grouped_df = dropped_df.groupby('Year').apply(group_results).reset_index().sort_values(by='pearson_corr', ascending=False)

grouped_df


Unnamed: 0,Year,pvalue,pearson_corr
1,1967,3.397143e-53,0.938792
0,1962,1.128679e-46,0.926082
2,1972,1.824292e-32,0.842899
4,1982,5.565916e-29,0.816638
5,1987,3.899627e-28,0.809553
6,1992,1.610614e-29,0.809432
7,1997,7.976156e-30,0.80814
8,2002,3.8635640000000003e-29,0.800642
3,1977,2.838892e-26,0.792834
9,2007,9.232747e-22,0.720417


The year with the highest correlation between GDP and CO2 Emissions is 1967.

In [87]:
import plotly.express as px

highest_corr_year = 1967
highest_corr_df = gapminder_df[gapminder_df['Year']== highest_corr_year][['Year', 'continent', co2_label, gdp_label, 'pop']].dropna(subset=[co2_label, gdp_label])

px.scatter(highest_corr_df, x=co2_label, y=gdp_label, color="continent",
                 title="CO2 Emission Per Capita v/s GDP Per Capita", size='pop')

### Time for some questions!

### Q1 

What is the relationship between `continent` and `'Energy use (kg of oil equivalent per capita)'`? 

Well, the first step is to actually see the distribution of energy used per continent.

In [88]:
energy_by_continent = gapminder_df.dropna(subset=['Energy use (kg of oil equivalent per capita)']).groupby(['continent', 'Year']).apply(lambda x: pd.Series({'energy':sum(x['Energy use (kg of oil equivalent per capita)'])})).reset_index()



px.box(energy_by_continent, x='continent', y='energy')

In [89]:
px.line(energy_by_continent, x='Year', y='energy', color='continent')

In [90]:
from scipy.stats import linregress

slopes = energy_by_continent.groupby('continent').apply(lambda v: linregress(v['Year'], v['energy'])[0]).reset_index().sort_values(by=0, ascending=False)
print(slopes)


value = energy_by_continent['continent'].unique()
encoding = { value[i] : i for i in range(len(value)) if str(value[i]) != 'nan'}
energy_by_continent['encoded_continent'] = energy_by_continent['continent'].map(encoding)

pearson_corr, p_value = pearsonr(energy_by_continent['encoded_continent'].values, energy_by_continent['energy'].values)

{
    'Pearson Correlation': pearson_corr, 
    'P-Val': p_value,
}

  continent            0
3    Europe  1587.095732
2      Asia  1320.565606
1  Americas   725.251428
0    Africa   202.972684
4   Oceania   111.155586


{'Pearson Correlation': 0.11547268568812524, 'P-Val': 0.43447929711644373}

Energy usage for Europe is the highest across all years, and also the highest slope, meanwhile Oceania has the lowest overall energy usage and lowest rate of increase over the years. The p-value for pearson's correlation between `continent` and `Energy` usage is very high, and there is no significant relation.

### Q2

Is there a significant difference between Europe and North America with respect to 'Imports of goods and services (% of GDP)' in the years after 1990?

For simplicity, I will only be considering US + Canada + Mexico as North American countries. 


In [91]:
q2_df = gapminder_df[gapminder_df['Year'] > 1990]

goods_label = 'Imports of goods and services (% of GDP)'

sum_df = q2_df[q2_df['continent'].isin(['Europe', 'Asia'])][[goods_label, 'continent', 'Year']].groupby(['continent','Year']).apply(lambda x: pd.Series({'imports': float(np.sum(x[goods_label]))})).reset_index()


px.bar(sum_df, x='Year', y='imports', color='continent', barmode='group')

In [92]:
from scipy.stats import ttest_rel

res = ttest_rel(sum_df[sum_df['continent'] == 'Europe']['imports'].values, sum_df[sum_df['continent'] == 'Asia']['imports'].values)

print("T: " + str(res[0]))
print("p-value: " + str(res[1]))

T: 0.753365900334969
p-value: 0.5059624445564106


The p-value for the paired t-test on the imports of both continents per year is very high, so there is not a significant difference between their imports.

### Q3

What is the country that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

In [93]:
from collections import defaultdict
import operator

pd_label = 'Population density (people per sq. km of land area)'

pd_df = gapminder_df[['Country Name', pd_label, 'Year']]

rankings = defaultdict(list)

for year in pd_df['Year'].unique():
    year_df = pd_df[pd_df['Year'] == year].sort_values(by=pd_label, ascending=False)
    for index,row in year_df.iterrows():
        rankings[row['Country Name']].append(index)


rankings = { key:np.mean(rankings[key]) for key in rankings }

max(rankings.items(), key=operator.itemgetter(1))[0]


'Zimbabwe'

Zimbabwe had the highest average ranking across all years in Population Density.

### Q4

What country has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?


In [94]:
le_label = 'Life expectancy at birth, total (years)'

le_df = gapminder_df[['Year', 'Country Name', le_label]]

max_year = max(le_df['Year'].unique())

# Get first and last available values for each country 
le_df = le_df.sort_values(by="Year").groupby(['Country Name']).agg(['first', 'last'])

# Filter to only include results after 1962
le_df = le_df[le_df['Year']['first'] >= 1962]

le_df['le_diff'] = le_df[le_label]['last'] - le_df[le_label]['first']

le_df = le_df.reset_index()
px.scatter(le_df, y='le_diff', color='Country Name')

Maldives's life expectancy has increased by 36 since first recorded after 1962!