## Regression Analysis SDG Index vs GDP Per Capita

In [139]:
library(readr)
library(dplyr)
library(tidyverse)

### Cleaning the Data

In [140]:
# Classification of countries into high-income countries and the rest of the
# world. Classification based on
# https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups

classification <- read.csv("high_income_classification.csv")

# Rows tagged with the "HIC" group code; CountryCode becomes the join key ID.
high_income_countries <- classification %>%
  filter(GroupCode == "HIC") %>%
  rename(ID = CountryCode)

# Rows tagged with any of the other group codes. distinct() keeps the first
# row per CountryName, which removes duplicated country entries.
rest_of_world_countries <- classification %>%
  filter(GroupCode %in% c("LIC", "LMY", "LMC", "MIC", "UMC")) %>%
  rename(ID = CountryCode) %>%
  distinct(CountryName, .keep_all = TRUE)

# Optional sanity checks:
# nrow(high_income_countries)
# head(rest_of_world_countries)
# nrow(rest_of_world_countries)

In [141]:
# Raw SDG Index results for the years 2016 - 2019 (sourced from the SDG
# website), loaded into one data frame per year.
SDG_Index_2016 <- read.csv(file = "2016GlobalIndexResults(1).csv")
SDG_Index_2017 <- read.csv(file = "2017GlobalIndexResults(1).csv")
SDG_Index_2018 <- read.csv(file = "2018GlobalIndexResults(1).csv")
SDG_Index_2019 <- read.csv(file = "2019GlobalIndexResults-1.csv")

In [142]:
# Reduce every yearly table to three columns with a common naming scheme:
# ID (country code), SDGI_<year> (index score) and SDGI_Rank_<year> (rank).
# select() renames while selecting, so no separate rename() step is needed.
SDGI_2016 <- select(
  SDG_Index_2016,
  ID,
  SDGI_2016 = SDGI_Score,
  SDGI_Rank_2016 = SDGI_Rank
)
# head(SDGI_2016)

SDGI_2017 <- select(
  SDG_Index_2017,
  ID = ISO3,
  SDGI_2017 = Global.Index.Score..0.100.,
  SDGI_Rank_2017 = Global.Index.Rank
)
# head(SDGI_2017)

# The 2018 and 2019 files are read with the placeholder column names
# X, X.1 and X.2.
SDGI_2018 <- select(
  SDG_Index_2018,
  ID = X,
  SDGI_2018 = X.1,
  SDGI_Rank_2018 = X.2
)
# head(SDGI_2018)

SDGI_2019 <- select(
  SDG_Index_2019,
  ID = X,
  SDGI_2019 = X.1,
  SDGI_Rank_2019 = X.2
)
# head(SDGI_2019)

### Final Data Frame 

In [143]:
# Combined data frame with country classification and SDG Index.
# The yearly SDG tables share only the country code column `ID`, so the join
# key is spelled out instead of relying on merge()'s implicit "all common
# columns" default.
test_merge <- merge(SDGI_2016, SDGI_2017, by = "ID")
merge_2 <- merge(test_merge, SDGI_2018, by = "ID")
merge_3 <- merge(merge_2, SDGI_2019, by = "ID")

# Union of both country groups: with all = TRUE and every shared column used
# as key, rows from either table are kept.
country_classification <- merge(
  x = high_income_countries,
  y = rest_of_world_countries,
  all = TRUE
)
merge_4 <- merge(merge_3, country_classification, by = "ID")
df <- merge_4 %>%
  arrange(desc(SDGI_2016)) %>%
  relocate(CountryName, GroupName)


# GDP data - sourced from the World Bank.
# The file is read with placeholder column names; X.58 to X.61 are renamed to
# gdp_2016 ... gdp_2019 below.
GDP_per_capita <- read.csv("GDP_per_capita_worldbank.csv")
GDP_per_capita <- GDP_per_capita %>%
  select(World.Development.Indicators, X.58, X.59, X.60, X.61) %>%
  rename(
    ID = World.Development.Indicators,
    gdp_2016 = X.58,
    gdp_2017 = X.59,
    gdp_2018 = X.60,
    gdp_2019 = X.61
  )


df <- merge(df, GDP_per_capita, by = "ID")
df <- na.omit(df) # keep only countries with complete data
nrow(df)
df <- df %>%
  arrange(desc(gdp_2016)) %>%
  relocate(CountryName, GroupName, gdp_2016, gdp_2017, gdp_2018, gdp_2019)

# Dummy coding of GroupName: 0 = "High income", 1 = every other group
# (rest of the world).
df$GroupName <- ifelse(df$GroupName == "High income", 0, 1)
head(df)

# The 2019 model (model_2019 / regression_model) is built and summarised in
# the "Regression Model with Dummy Variable" section; it is not fitted a
# second time here.


Unnamed: 0_level_0,CountryName,GroupName,gdp_2016,gdp_2017,gdp_2018,gdp_2019,ID,SDGI_2016,SDGI_Rank_2016,SDGI_2017,SDGI_Rank_2017,SDGI_2018,SDGI_Rank_2018,SDGI_2019,SDGI_Rank_2019,GroupCode
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<int>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
1,Luxembourg,0,104278.39,107627.15,116654.26,114704.59,LUX,76.66,16,75.0,33,76.1,22,74.8,34.0,HIC
2,Switzerland,0,80172.23,80449.99,82818.11,81993.73,CHE,80.87,5,81.2,8,80.1,7,78.8,17.0,HIC
3,Norway,0,70459.18,75496.75,81734.47,75419.63,NOR,82.31,3,83.9,4,81.2,6,80.7,8.0,HIC
4,Ireland,0,63197.08,69822.35,78621.23,78660.96,IRL,76.75,14,77.9,19,77.5,18,78.2,19.0,HIC
5,Iceland,0,61466.8,71310.94,72968.7,66944.83,ISL,78.41,9,79.3,14,79.7,10,79.2,14.0,HIC
6,United States,0,57927.52,59957.73,62840.02,65118.36,USA,72.71,25,72.4,42,73.0,35,74.5,35.0,HIC


### Regression Model with Dummy Variable 

In [144]:
# Regression model with one independent variable (GDP per capita) and one
# dummy variable.
# GroupName is 0 for high-income countries and 1 otherwise
# (see df$GroupName <- ifelse(df$GroupName == "High income", 0, 1)).
# The head(df) preview lists SDGI_2019 as <chr>, so the score is converted to
# numeric explicitly instead of relying on lm() coercing a character response.
model_2019 <- df %>%
  select(CountryName, ID, gdp_2019, SDGI_2019, SDGI_Rank_2019, GroupName) %>%
  mutate(SDGI_2019 = as.numeric(SDGI_2019))
regression_model <- lm(SDGI_2019 ~ gdp_2019 + GroupName, data = model_2019)
summary(regression_model)


Call:
lm(formula = SDGI_2019 ~ gdp_2019 + GroupName, data = model_2019)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.089  -5.806   2.049   6.204  15.305 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  6.996e+01  2.206e+00  31.718  < 2e-16 ***
gdp_2019     1.463e-04  4.932e-05   2.965  0.00355 ** 
GroupName   -8.839e+00  2.190e+00  -4.036 8.89e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7.746 on 141 degrees of freedom
Multiple R-squared:  0.4426,	Adjusted R-squared:  0.4347 
F-statistic: 55.99 on 2 and 141 DF,  p-value: < 2.2e-16


### Regression Model for High Income Countries 

In [145]:
# The sample is split by income group and a separate simple regression of the
# 2019 SDG Index score on 2019 GDP per capita is fitted for each group.
# This cell: high-income countries, i.e. the rows with GroupName equal to 0.
model_2019_high_income <- filter(model_2019, GroupName == 0)
high_income_regression <- lm(
  formula = SDGI_2019 ~ gdp_2019,
  data = model_2019_high_income
)
summary(high_income_regression)


Call:
lm(formula = SDGI_2019 ~ gdp_2019, data = model_2019_high_income)

Residuals:
    Min      1Q  Median      3Q     Max 
-11.640  -3.337   1.715   3.568   8.485 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.289e+01  1.531e+00  47.599   <2e-16 ***
gdp_2019    7.026e-05  3.441e-05   2.042   0.0469 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.299 on 46 degrees of freedom
Multiple R-squared:  0.08309,	Adjusted R-squared:  0.06316 
F-statistic: 4.169 on 1 and 46 DF,  p-value: 0.04694


### Regression Model for Rest of The World 

In [146]:
# Simple regression of the 2019 SDG Index score on 2019 GDP per capita for
# the rest of the world, i.e. the rows with GroupName equal to 1.
model_2019_rest_of_the_world <- filter(model_2019, GroupName == 1)
rest_of_world_regression <- lm(
  formula = SDGI_2019 ~ gdp_2019,
  data = model_2019_rest_of_the_world
)
summary(rest_of_world_regression)


Call:
lm(formula = SDGI_2019 ~ gdp_2019, data = model_2019_rest_of_the_world)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.6405  -4.0709  -0.8012   4.2361  15.1512 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.379e+01  1.008e+00   53.38   <2e-16 ***
gdp_2019    2.030e-03  2.011e-04   10.09   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.22 on 94 degrees of freedom
Multiple R-squared:  0.5201,	Adjusted R-squared:  0.515 
F-statistic: 101.9 on 1 and 94 DF,  p-value: < 2.2e-16
