In [96]:
import numpy as np
import pandas as pd
import plotly.express as px

# Column names used throughout the notebook.
co2_label = "CO2 emissions (metric tons per capita)"
gdp_label = "gdpPercap"

# Load the data and clean it in a single idempotent step: infinities are
# turned into NaN, then every row that has a NaN in *any* column is dropped.
# numpy is imported in this cell (it was previously only imported further
# down), so the notebook survives Restart Kernel -> Run All.
gapminder_df = (
    pd.read_csv('./gapminder_clean.csv')
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Preview: the first five remaining rows, ordered by year.
gapminder_df.head().sort_values(by='Year')

Unnamed: 0.1,Unnamed: 0,Country Name,Year,"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),Domestic credit provided by financial sector (% of GDP),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",GDP growth (annual %),Imports of goods and services (% of GDP),"Industry, value added (% of GDP)","Inflation, GDP deflator (annual %)","Life expectancy at birth, total (years)",Population density (people per sq. km of land area),"Services, etc., value added (% of GDP)",pop,continent,gdpPercap
22,22,Algeria,1972,8.420846,1.843624,55.748145,142.029428,261.465251,20.449562,7.597,27.423969,25.726868,48.127965,-4.606461,51.171805,6.456244,43.451191,14760787.0,Africa,4182.663766
23,23,Algeria,1977,8.470141,2.368082,58.312256,231.710422,398.656113,30.586567,7.234,5.258586,41.740317,55.230902,11.927099,54.578268,7.42742,36.298962,17152804.0,Africa,4910.416756
17,17,Albania,1997,32.70016,0.490365,54.045926,680.688922,384.594958,9.745654,2.423,-10.837856,34.189995,15.377651,12.088706,72.898098,114.900766,51.922189,3428038.0,Europe,3193.054604
18,18,Albania,2002,24.235933,1.229541,47.193912,1561.122382,645.080154,19.598874,1.975,4.231371,44.335366,19.596072,2.41059,75.15461,111.35073,56.167995,3508512.0,Europe,4604.211737
19,19,Albania,2007,19.874798,1.322335,62.076755,1213.124369,679.861765,28.084222,1.635,5.9,54.788201,25.334355,3.576195,76.470293,108.394781,54.790848,3600523.0,Europe,5937.029526


In [None]:
# Keep only the 1962 observations.
gapminder_df_1962 = gapminder_df.loc[gapminder_df['Year'].eq(1962)]

# CO2 emissions per capita (x) against GDP per capita (y) for that year.
px.scatter(data_frame=gapminder_df_1962, x=co2_label, y=gdp_label)

In [2]:
from scipy.stats import pearsonr
import numpy as np

# Pearson correlation between CO2 emissions and GDP per capita over every
# row of the cleaned frame (all years and countries pooled together).
pearson_corr, p_value = pearsonr(
    gapminder_df[co2_label].to_numpy(),
    gapminder_df[gdp_label].to_numpy(),
)

print(f'Pearsons Correlation: {pearson_corr!s}')
print(f'p value: {p_value!s}')

Pearsons Correlation: 0.813291861531547
p value: 2.9309176165291103e-280


In [3]:
# Find the year with the highest correlation 

def group_results(name, group):
    """Summarise the CO2-vs-GDP Pearson correlation of one group of rows.

    Parameters
    ----------
    name
        Label of the group; stored unchanged under the ``'Year'`` key.
    group : pandas.DataFrame
        Rows of the group; must contain the ``co2_label`` and ``gdp_label``
        columns.

    Returns
    -------
    pandas.Series
        Entries ``'Year'``, ``'pvalue'`` and ``'pearson_corr'``.
    """
    co2_values = group[co2_label].values
    gdp_values = group[gdp_label].values
    corr, pval = pearsonr(co2_values, gdp_values)
    summary = {'Year': name, 'pvalue': pval, 'pearson_corr': corr}
    return pd.Series(summary)

# One summary row per year, ordered from the strongest to the weakest
# correlation coefficient.
grouped_results = (
    gapminder_df
    .groupby('Year')
    .apply(lambda year_rows: group_results(year_rows.name, year_rows))
    .sort_values(by='pearson_corr', ascending=False)
)

# After the descending sort the first row belongs to the top-ranked year.
highest_corr_year = int(grouped_results.iloc[0]['Year'])

highest_corr_year

1967

In [26]:
import plotly.express as px
# Rows belonging to the year that ranked first in the correlation table.
highest_corr_df = gapminder_df.loc[gapminder_df['Year'].eq(highest_corr_year)]

# Bubble chart: one marker per row, coloured by continent, sized by population.
fig = px.scatter(
    highest_corr_df,
    x=co2_label,
    y=gdp_label,
    size="pop",
    color="continent",
    title="CO2 Emission Per Capita v/s GDP Per Capita",
)

fig.show()

### Time for some questions!

### Q1 

What is the relationship between `continent` and `'Energy use (kg of oil equivalent per capita)'`? 

Well, the first step is to actually see the distribution of energy used per continent.

In [167]:
# Sum the per-capita energy-use column over the rows of every
# (continent, Year) pair. Selecting the column before aggregating replaces
# the previous groupby.apply + lambda + builtin sum(): it is vectorised and
# avoids the pandas deprecation warning raised when apply() operates on the
# grouping columns. The result keeps the columns continent / Year / energy.
energy_by_continent = (
    gapminder_df
    .groupby(['continent', 'Year'])['Energy use (kg of oil equivalent per capita)']
    .sum()
    .reset_index(name='energy')
)

energy_by_continent

# One box per continent; each box summarises that continent's yearly sums.
fig = px.box(energy_by_continent, x="continent", y="energy")

fig.show()

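Looks like energy usage is the highest for Europe, and drastically falls off for Asia and the Americas. I have a hunch this is because Europe is huge + has higher GDP per capita. Note that the plotted value is a sum of per-capita figures over the countries of each continent, so it also depends on how many countries have data. Furthermore, since Europe technically had the Industrial Revolution first, their total CO2 emissions have to be much higher.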

In [162]:
from scipy.stats import pearsonr

def process(group):
    """Sum the GDP-per-capita and energy-use columns of one group of rows.

    Returns a Series with the entries 'Total GDP' and 'Total Energy Usage'.
    """
    return pd.Series({
        'Total GDP': np.sum(group[gdp_label]),
        'Total Energy Usage': np.sum(group['Energy use (kg of oil equivalent per capita)']),
    })


# Aggregate per (continent, Year) and list the largest 'Total GDP' first.
total_gdp = (
    gapminder_df
    .groupby(['continent', 'Year'])
    .apply(process)
    .sort_values(by=['Total GDP'], ascending=False)
    .reset_index()
)

total_gdp


Unnamed: 0,continent,Year,Total GDP,Total Energy Usage
0,Europe,2007,751634.449078,111639.415191
1,Europe,2002,637558.70314,101562.156772
2,Europe,1997,530362.323765,91132.271221
3,Europe,1992,305111.299742,50693.007296
4,Americas,2007,234181.535857,47969.009501
5,Asia,2007,185086.99084,34370.69995
6,Americas,2002,163791.296987,32740.548969
7,Americas,1997,154654.636041,28255.393751
8,Asia,2002,152785.712328,29496.719411
9,Europe,1987,150763.551806,28943.306007


Looks like my hunch was right: if the total GDP of a continent is higher, then the energy used by the continent is also higher. I think it's skewed because I batched all years together, and the correlation would fluctuate if time-stepped.

### Q2

Is there a significant difference between Europe and North America with respect to 'Imports of goods and services (% of GDP)' in the years after 1990?

For simplicity, I will only be considering US + Canada + Mexico as North American countries. 


In [155]:
# Only observations from years after 1990.
q2_df = gapminder_df[gapminder_df['Year'] > 1990]

north_american_countries = ['United States', 'Canada', 'Mexico']
goods_label = 'Imports of goods and services (% of GDP)'

na_df = q2_df[q2_df['Country Name'].isin(north_american_countries)]
eu_df = q2_df[q2_df['continent'] == 'Europe']

# Average the import share per year within each region. The values are
# percentages of GDP and the two regions contain very different numbers of
# countries, so a plain sum would mostly reflect the country count rather
# than the import level; the mean keeps the two series comparable.
eu_goods = eu_df.groupby('Year')[goods_label].mean().reset_index(name='EU Import')
na_goods = na_df.groupby('Year')[goods_label].mean().reset_index(name='NA Import')

# Inner join: keep only years present for both regions.
merged_goods = eu_goods.merge(na_goods, on='Year')

px.bar(merged_goods, x='Year', y=['EU Import', 'NA Import'], barmode='group')

array(['Albania', 'Algeria', 'Angola', 'Argentina', 'Australia',
       'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria',
       'Cambodia', 'Cameroon', 'Canada', 'Chile', 'China', 'Colombia',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Czech Republic',
       'Denmark', 'Dominican Republic', 'Ecuador', 'El Salvador',
       'Eritrea', 'Finland', 'France', 'Gabon', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Honduras', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Jordan',
       'Kenya', 'Lebanon', 'Libya', 'Malaysia', 'Mauritius', 'Mexico',
       'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar',
       'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua',
       'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Panama',
       'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania',
       'Saudi Arabia', 'Sen