In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib import style
import plotly.express as px
import plotly.graph_objects as go

In [2]:
df = pd.read_csv("/kaggle/input/country-regional-and-world-gdp/gdp_csv.csv")

df.head(10)

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1968,25760680000.0
1,Arab World,ARB,1969,28434200000.0
2,Arab World,ARB,1970,31385500000.0
3,Arab World,ARB,1971,36426910000.0
4,Arab World,ARB,1972,43316060000.0
5,Arab World,ARB,1973,55018390000.0
6,Arab World,ARB,1974,105145800000.0
7,Arab World,ARB,1975,116337000000.0
8,Arab World,ARB,1976,144846200000.0
9,Arab World,ARB,1977,167308300000.0


**Finding out the names of the countries in the dataset**

In [3]:
df["Country Name"].unique()

array(['Arab World', 'Caribbean small states',
       'Central Europe and the Baltics', 'Early-demographic dividend',
       'East Asia & Pacific',
       'East Asia & Pacific (excluding high income)',
       'East Asia & Pacific (IDA & IBRD countries)', 'Euro area',
       'Europe & Central Asia',
       'Europe & Central Asia (excluding high income)',
       'Europe & Central Asia (IDA & IBRD countries)', 'European Union',
       'Fragile and conflict affected situations',
       'Heavily indebted poor countries (HIPC)', 'High income',
       'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only',
       'IDA total', 'Late-demographic dividend',
       'Latin America & Caribbean',
       'Latin America & Caribbean (excluding high income)',
       'Latin America & the Caribbean (IDA & IBRD countries)',
       'Least developed countries: UN classification',
       'Low & middle income', 'Low income', 'Lower middle income',
       'Middle East & North Africa',
       'Middle East & No

Since the dataset is a mixture of regions and countries, it would make sense to seperate these into two sepeate datasets

**Separating the dataset into "regions" and "countries" sub-datasets**

In [4]:
region = ['Arab World', 'Caribbean small states',
       'Central Europe and the Baltics', 'Early-demographic dividend',
       'East Asia & Pacific',
       'East Asia & Pacific (excluding high income)',
       'East Asia & Pacific (IDA & IBRD countries)', 'Euro area',
       'Europe & Central Asia',
       'Europe & Central Asia (excluding high income)',
       'Europe & Central Asia (IDA & IBRD countries)', 'European Union',
       'Fragile and conflict affected situations',
       'Heavily indebted poor countries (HIPC)', 'High income',
       'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only',
       'IDA total', 'Late-demographic dividend',
       'Latin America & Caribbean',
       'Latin America & Caribbean (excluding high income)',
       'Latin America & the Caribbean (IDA & IBRD countries)',
       'Least developed countries: UN classification',
       'Low & middle income', 'Low income', 'Lower middle income',
       'Middle East & North Africa',
       'Middle East & North Africa (excluding high income)',
       'Middle East & North Africa (IDA & IBRD countries)',
       'Middle income', 'North America', 'OECD members',
       'Other small states', 'Pacific island small states',
       'Post-demographic dividend', 'Pre-demographic dividend',
       'Small states', 'South Asia', 'South Asia (IDA & IBRD)',
       'Sub-Saharan Africa', 'Sub-Saharan Africa (excluding high income)',
       'Sub-Saharan Africa (IDA & IBRD countries)', 'Upper middle income',
       'World']

In [5]:
regions = df[df['Country Name'].isin(region)]
regions.reset_index(inplace=True, drop=True)

countries = df[~df['Country Name'].isin(region)]
countries.reset_index(inplace=True, drop=True)

In [6]:
regions.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Arab World,ARB,1968,25760680000.0
1,Arab World,ARB,1969,28434200000.0
2,Arab World,ARB,1970,31385500000.0
3,Arab World,ARB,1971,36426910000.0
4,Arab World,ARB,1972,43316060000.0


In [7]:
countries.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Afghanistan,AFG,1960,537777800.0
1,Afghanistan,AFG,1961,548888900.0
2,Afghanistan,AFG,1962,546666700.0
3,Afghanistan,AFG,1963,751111200.0
4,Afghanistan,AFG,1964,800000000.0


The focus of this notebook will be on the analysis of individual countries as opposed to regions, therefore only the dataset with individual countries will be cleaned and analysed from this point forward.

**Finding out if each country has the same number of yearly data**

In [8]:
countries.groupby("Country Name")["Year"].count()

Country Name
Afghanistan              38
Albania                  33
Algeria                  57
American Samoa           15
Andorra                  47
                         ..
Virgin Islands (U.S.)    48
West Bank and Gaza       23
Yemen, Rep.              27
Zambia                   57
Zimbabwe                 57
Name: Year, Length: 210, dtype: int64

Each country and region does not have the same number of yearly data since some countries have data going back 57 years while other countries have data going back only less than 27 years, subsequent analysis would not yield accurate results due to this mismatch. Therefore, it would make sense to create a sub-dataset with countries that have the same number of yearly data.

**Creating a sub-dataset which only includes countries who have the latest GDP values from 2016**

In [9]:
countries2 = countries.groupby("Country Name", as_index=False)["Year"].max()

countries3 = countries2[countries2["Year"]==2016]

countries4 = countries3["Country Name"]

countries5 = countries[countries["Country Name"].isin(countries4)]

countries5

Unnamed: 0,Country Name,Country Code,Year,Value
0,Afghanistan,AFG,1960,5.377778e+08
1,Afghanistan,AFG,1961,5.488889e+08
2,Afghanistan,AFG,1962,5.466667e+08
3,Afghanistan,AFG,1963,7.511112e+08
4,Afghanistan,AFG,1964,8.000000e+08
...,...,...,...,...
9196,Zimbabwe,ZWE,2012,1.424249e+10
9197,Zimbabwe,ZWE,2013,1.545177e+10
9198,Zimbabwe,ZWE,2014,1.589105e+10
9199,Zimbabwe,ZWE,2015,1.630467e+10


**Finding out which countries in the sub-dataset has the lowest number of years' worth of data and subsequently removing them so that we remain with an equal and sufficient amount of data for each country**

In [10]:
min_year = countries5.groupby("Country Name", as_index=False)["Year"].min()

min_year["Year"].max()

2013

2013 would not be a good starting year for our analysis since it would only give us 4 years worth of data for each country, therefore we would need to remove this country and repeat this step till we are left with a starting year that would leave us with sufficient amount of data for each country.

In [11]:
min_year[min_year["Year"]==2013]

Unnamed: 0,Country Name,Year
154,Somalia,2013


In [12]:
countries6 = countries5[countries5["Country Name"]!="Somalia"]

min_year = countries6.groupby("Country Name", as_index=False)["Year"].min()

min_year["Year"].max()

2007

2007 is still not a good starting year, therefore we will repeat the process

In [13]:
min_year[min_year["Year"]==2007]

Unnamed: 0,Country Name,Year
119,Nauru,2007


In [14]:
countries7 = countries6[countries5["Country Name"]!="Nauru"]

min_year = countries7.groupby("Country Name", as_index=False)["Year"].min()

min_year["Year"].max()

  """Entry point for launching an IPython kernel.


2002

In [15]:
min_year[min_year["Year"]==2002]

Unnamed: 0,Country Name,Year
3,American Samoa,2002
65,Guam,2002
125,Northern Mariana Islands,2002


In [16]:
countries8 = countries7[~countries7["Country Name"].isin(["American Samoa","Guam","Northern Mariana Islands"])]

min_year = countries8.groupby("Country Name", as_index=False)["Year"].min()

min_year["Year"].max()

2001

In [17]:
min_year[min_year["Year"]==2001]

Unnamed: 0,Country Name,Year
140,Sao Tome and Principe,2001


In [18]:
countries9 = countries8[countries8["Country Name"]!="Sao Tome and Principe"]

min_year = countries9.groupby("Country Name", as_index=False)["Year"].min()

min_year["Year"].max()

2000

I believe 2000 is a great starting year for our analysis since it gives us 17 years worth of data for each country

In [19]:
country = countries9[countries9["Year"]>=2000]

country

Unnamed: 0,Country Name,Country Code,Year,Value
22,Afghanistan,AFG,2001,2.461666e+09
23,Afghanistan,AFG,2002,4.128821e+09
24,Afghanistan,AFG,2003,4.583644e+09
25,Afghanistan,AFG,2004,5.285466e+09
26,Afghanistan,AFG,2005,6.275074e+09
...,...,...,...,...
9196,Zimbabwe,ZWE,2012,1.424249e+10
9197,Zimbabwe,ZWE,2013,1.545177e+10
9198,Zimbabwe,ZWE,2014,1.589105e+10
9199,Zimbabwe,ZWE,2015,1.630467e+10


**Making sure that the new dataset does not have any missing years for each of the countries**

In [20]:
missing = country.groupby("Country Name", as_index=False)["Year"].count()

missing[missing["Year"]!=17]

Unnamed: 0,Country Name,Year
0,Afghanistan,16
38,"Congo, Dem. Rep.",16
76,Iraq,13


Since the three countries above are missing some years as part of their data, they would need to be removed

**Removing Afghanistan, Democratic Republic of Congo and Iraq from the dataset and creating our final dataset that will be used for the analysis**

In [21]:
country_new = pd.DataFrame(country[~country["Country Name"].isin(["Afghanistan","Congo, Dem. Rep.","Iraq"])])

country_new.reset_index(drop=True, inplace=True)

country_new["Country Name"].nunique()

181

The new dataset has 181 countries and their corresponding 17 years worth of data from the year 2000 to 2016

In [22]:
country_new.head()

Unnamed: 0,Country Name,Country Code,Year,Value
0,Albania,ALB,2000,3632044000.0
1,Albania,ALB,2001,4060759000.0
2,Albania,ALB,2002,4435079000.0
3,Albania,ALB,2003,5746946000.0
4,Albania,ALB,2004,7314865000.0


In [23]:
import warnings
warnings.filterwarnings("ignore")

**Finding out the top 15 countries with the highest GDP in 2016**

In [24]:
countries_2016 = country_new[country_new["Year"]==2016]

countries_2016.sort_values("Value", axis=0, ascending=False, inplace=True)

countries_2016.reset_index(drop=True, inplace=True)

countries_2016_highest = countries_2016.head(15)

countries_2016_highest

fig = px.bar(countries_2016_highest, x="Country Name", y="Value", color="Country Name",
             color_discrete_sequence=px.colors.qualitative.Vivid)

fig.update_layout(title_text="Top 15 Countries with the highest GDP (2016)", title_font_size=22,
                  height=800, width=980, yaxis_title="GDP($)", xaxis_title="Country",
                  title_y=0.97, title_x=0.45)

fig.show()

countries_2016_highest

Unnamed: 0,Country Name,Country Code,Year,Value
0,United States,USA,2016,18624480000000.0
1,China,CHN,2016,11199150000000.0
2,Japan,JPN,2016,4940159000000.0
3,Germany,DEU,2016,3477796000000.0
4,United Kingdom,GBR,2016,2647899000000.0
5,France,FRA,2016,2465454000000.0
6,India,IND,2016,2263792000000.0
7,Italy,ITA,2016,1858913000000.0
8,Brazil,BRA,2016,1796187000000.0
9,Canada,CAN,2016,1529760000000.0


**Creating a Heatmap to represent countries GDPs in 2016**

In [25]:
fig = px.choropleth(countries_2016, locations="Country Name", locationmode='country names', color="Value",
                    color_continuous_scale=px.colors.sequential.Redor)

fig.update_layout(title_text="Heatmap of GDPs for Countries in 2016", title_font_size=24,
                  height=800, width=1000, yaxis_title="GDP($)", xaxis_title="Country",
                  title_y=0.85, title_x=0.45)

fig.show()

The reason there are some countries that are unshaded is because we had removed some of them earlier to create a dataset with countries that have the same number of years worth of data

**Finding out the trend of GDP from 2000 to 2016 for the top 15 countries with the highest GDP in 2016**

In [26]:
countries_highest_trend = country_new[country_new["Country Name"].isin(["United States","China","Japan","Germany","United Kingdom",
                                                                       "France","India","Italy","Brazil","Canada","Korea, Rep.",
                                                                       "Russian Federation","Spain","Australia","Mexico"])]

fig = px.bar(countries_highest_trend, x="Country Name", y="Value", color="Country Name",
             color_discrete_sequence=px.colors.qualitative.Dark24,
             animation_frame="Year", animation_group="Country Name")

fig.update_layout(title_text="Trends of Top 15 Countries with the highest GDP in 2016", title_font_size=22,
                  height=700, width=980, yaxis_title="GDP($)", xaxis_title="Country",
                  title_y=0.97, title_x=0.45)

fig.show()


The most important take-away from the graph above is the fact that in 2000, Japan had a higher GDP than that of China but because of China's high GDP growth rate, it overtook Japan after 2009/2010. Furthermore, another important takeaway is that the United States has had the highest GDP in the world since 2000.

**Creating a timeline graph that shows the trends of GDPs for the 181 countries in the dataset**

In [27]:
fig = px.scatter(country_new, x="Country Name", y="Value", color="Country Name",
                 animation_frame="Year", animation_group="Country Name")

fig.update_layout(title_text="GDP Trends of Countries (2000-2016)", title_font_size=22,
                  height=800, width=2000, yaxis_title="GDP($)", xaxis_title="Country",
                  title_y=0.97, title_x=0.45)

fig.show()

This timeline graph, though has countries squeezed together on the x-axis, reveals some important information. It shows how most countries are on the baseline of GDPs that are less than $100 billion, and most importantly, it reveals the countries that have moved away from that baseline since 2000 and have improved their GDPs drastically.This includes, apart from the top 15 countries mentioned in previous graphs, countries such as Argentina, Austria, Belgium, Chile, Colombia, Indonesia, Iran, Hong Kong, Malayisa and many more.

What would help this analysis even further would be to see which countries have improved their GDPs the most during this time-period.

**Finding out the countries with the highest GDP growth rates from 2000 to 2016**

In [28]:
countries_2000 = country_new[country_new["Year"]==2000]
countries_2000.reset_index(drop=True, inplace=True)

countries_2016 = country_new[country_new["Year"]==2016]
countries_2016.reset_index(drop=True, inplace=True)

countries_2000_value = pd.DataFrame(countries_2000["Value"])
countries_2000_value.reset_index(drop=True, inplace=True)

header=["Value 2000"]
countries_2000_value.columns = header

countries_2000_2016 = pd.concat([countries_2016, countries_2000_value], axis=1, ignore_index=False)

countries_2000_2016["GDP Growth Rate(%)"] = ((countries_2000_2016["Value"]-countries_2000_2016["Value 2000"])/countries_2000_2016["Value 2000"])

countries_2000_2016.drop(labels=["Year"], axis=1, inplace=True)

countries_2000_2016.sort_values("GDP Growth Rate(%)", axis=0, ascending=False, kind='quicksort', inplace=True)

top_GDP = countries_2000_2016.head(15)
top_GDP.reset_index(drop=True, inplace=True)

rounded = np.round(top_GDP["GDP Growth Rate(%)"], decimals=2)
rounded_GDP_growth = pd.DataFrame(rounded)
top_GDP["GDP Growth Rate(%)"] = rounded_GDP_growth


fig = px.bar(top_GDP, x="Country Name", y="GDP Growth Rate(%)", color="Country Name",
             color_discrete_sequence=px.colors.qualitative.Dark24, text="GDP Growth Rate(%)")

fig.update_layout(title_text="Top 15 Countries with the highest GDP Growth Rate % (2000-2016)", title_font_size=22,
                  height=700, width=980, yaxis_title="GDP Growth Rate(%)", xaxis_title="Country",
                  title_y=0.97, title_x=0.45)

fig2 = px.choropleth(countries_2000_2016, locations="Country Name", locationmode='country names', color="GDP Growth Rate(%)",
                    color_continuous_scale=px.colors.sequential.Darkmint)

fig2.update_layout(title_text="Heatmap of GDP Growth Rate % (2000-2016)", title_font_size=24,
                  height=800, width=1000, yaxis_title="GDP($)", xaxis_title="Country",
                  title_y=0.85, title_x=0.45)


fig.show()
fig2.show()

top_GDP

Unnamed: 0,Country Name,Country Code,Value,Value 2000,GDP Growth Rate(%)
0,Turkmenistan,TKM,36179890000.0,2904663000.0,11.46
1,Angola,AGO,95335110000.0,9129595000.0,9.44
2,Equatorial Guinea,GNQ,10684800000.0,1045998000.0,9.21
3,Mongolia,MNG,11183460000.0,1136896000.0,8.84
4,China,CHN,11199150000000.0,1211347000000.0,8.25
5,Lao PDR,LAO,15903330000.0,1731198000.0,8.19
6,Ethiopia,ETH,72374220000.0,8242392000.0,7.78
7,Nigeria,NGA,404652700000.0,46386010000.0,7.72
8,Qatar,QAT,152451900000.0,17759890000.0,7.58
9,Ghana,GHA,42689780000.0,4983024000.0,7.57


As illustrated in the table and graphs above, the highest GDP growth rates from 2000 to 2016 were experienced by countries in Central & South East Asia and some countries in Sub-Saharan Africa.

***I will be continuing this analysis further by creating a forecasting model to predict GDPs for the top 15 countries with the highest GDPs in 2016 for 2017, 2018 and 2019. Furthermore, I will be comparing these forecasts to the actual GDP values recorded for these countries, to gauge how accurate the model is.***