-
Notifications
You must be signed in to change notification settings - Fork 0
/
Code.rmd
168 lines (158 loc) · 9.73 KB
/
Code.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
---
title: "STA 230 Project 1"
author: "Stella Lee, Tanvi Jindal"
date: "2/15/2019"
output:
  html_document: default
  pdf_document: default
  word_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#install.packages("plotly")
#install.packages("scatterplot3d")
library(plotly)
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(maps)
library(scatterplot3d)
```
## Upload data
```{r}
# Raw CSV locations, keyed by the name of the object each file is read into.
csv_urls <- c(
  happy09 = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/2009HPI.csv",
  happy12 = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/2012HPI.csv",
  happy16 = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/2016HPI.csv",
  variables = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/OtherVariables.csv"
)

# One data frame per Happy Planet Index release, plus the table of additional
# per-country, per-year variables.
happy09 <- read.csv(file = csv_urls[["happy09"]])
happy12 <- read.csv(file = csv_urls[["happy12"]])
happy16 <- read.csv(file = csv_urls[["happy16"]])
variables <- read.csv(file = csv_urls[["variables"]])
```
## Data Cleaning
```{r}
# Keep only the columns used later, give them short year-suffixed names, and
# harmonise country spellings so every table can be joined on `country`.

# Replace every entry of `countries` that appears in names(lookup) with the
# corresponding lookup value; all other entries are returned unchanged.
standardise_countries <- function(countries, lookup) {
  hits <- countries %in% names(lookup)
  countries[hits] <- unname(lookup[countries[hits]])
  countries
}

# 2009
happy09 <- happy09 %>%
  select(country, HPI09 = HPI, GDP09 = GDP.per.capita....PPP.) %>%
  mutate(country = as.character(country))
happy09$country <- standardise_countries(
  happy09$country,
  c(
    "Burma" = "Myanmar",
    "Korea" = "South Korea",
    "Congo" = "Republic of Congo",
    "Congo, Dem. Rep. of the" = "Democratic Republic of the Congo"
  )
)

# 2012
happy12 <- happy12 %>%
  select(
    country = Country,
    HPI12 = Happy.Planet.Index,
    GDP12 = GDP.capita...PPP.
  ) %>%
  mutate(country = as.character(country))
happy12$country <- standardise_countries(
  happy12$country,
  c(
    "Korea" = "South Korea",
    "Congo" = "Republic of Congo",
    "Congo, Dem. Rep. of the" = "Democratic Republic of the Congo"
  )
)

# 2016 (the only release from which the region column is kept)
happy16 <- happy16 %>%
  select(
    country = Country,
    region = Region,
    HPI16 = Happy.Planet.Index,
    GDP16 = X.GDP.capita...PPP..
  ) %>%
  mutate(country = as.character(country))

# Additional variables: restrict to the three HPI years, then keep life
# expectancy and perceived corruption.
hpi_years <- c("2009", "2012", "2016")
variables <- variables %>%
  filter(year %in% hpi_years)
var <- variables %>%
  select(
    country,
    year,
    lifeExpect = Healthy.life.expectancy.at.birth,
    corruption = Perceptions.of.corruption
  ) %>%
  mutate(
    year = as.character(year),
    country = as.character(country)
  )
var$country <- standardise_countries(
  var$country,
  c(
    "Congo (Brazzaville)" = "Republic of Congo",
    "Congo (Kinshasa)" = "Democratic Republic of the Congo",
    "Hong Kong S.A.R. of China" = "Hong Kong",
    "United States" = "United States of America",
    "Palestinian Territories" = "Palestine"
  )
)
```
## Data Merging
```{r}
# Combine the three yearly HPI tables into one wide table with one row per
# country and one HPI/GDP column pair per year.

# Return `df` with the country name "Burma" replaced by "Myanmar".
# The rename has to happen BEFORE joining: renaming afterwards would leave two
# separate rows for the same country. The previous post-join statement indexed
# `happy` with a mask computed on `happy09` (a different table, already
# renamed), so it never changed anything.
rename_burma <- function(df) {
  df$country <- as.character(df$country)
  df$country[df$country %in% "Burma"] <- "Myanmar"
  df
}

happy <- full_join(
  x = rename_burma(happy16),
  y = rename_burma(happy12),
  by = "country"
)
happy <- full_join(x = happy, y = rename_burma(happy09), by = "country")

# Convert the GDP columns to plain numbers. parse_number() only accepts
# character input, so coerce first; this keeps the step working whether
# read.csv() produced factors, strings or numerics for these columns.
gdp_cols <- c("GDP09", "GDP12", "GDP16")
happy[gdp_cols] <- lapply(
  happy[gdp_cols],
  function(x) parse_number(as.character(x))
)
```
## Tidy Data
```{r}
# Reshape `happy` from one column per measure-and-year (HPI16, GDP16, ...) to
# one row per country and year with separate HPI and GDP columns, then attach
# the additional variables.

# Stack every HPI/GDP column into key/value pairs. Selecting by name pattern
# instead of by position keeps this correct if the column order ever changes.
happy <- gather(
  data = happy, key = year, value = Index,
  matches("^(HPI|GDP)[0-9]{2}$")
)

# Split e.g. "HPI16" into variable = "HPI" and year = "16". A regex is used
# rather than separate(sep = -3) because the meaning of a negative position
# changed between tidyr releases: on current versions sep = -3 yields
# "HP" / "I16", which silently breaks every later reference to HPI, GDP and
# the year filters.
happy <- tidyr::extract(
  happy, year,
  into = c("variable", "year"),
  regex = "^([A-Za-z]+)([0-9]{2})$"
)

# Expand the two-digit year to four digits so it matches `var$year`.
happy$year <- paste0("20", happy$year)

# One column per measure (GDP, HPI).
happy <- spread(data = happy, key = variable, value = Index)

# Add life expectancy and corruption for the matching country and year.
happy <- left_join(x = happy, y = var, by = c("country", "year"))
```
## Visualization
```{r}
# Prepare the polygon data for the choropleth maps.

# Base world map; the fifth column (the map's own region name) is relabelled
# `country` so it can serve as the join key with the happiness data.
World <- setNames(
  map_data("world"),
  c("long", "lat", "group", "order", "country", "subregion")
)

# Align the map's country labels with the names used in the happiness data.
World$country[World$country %in% "USA"] <- "United States of America"
World$country[World$country %in% "UK"] <- "United Kingdom"
# Hong Kong is identified through the subregion column.
World$country[World$subregion %in% "Hong Kong"] <- "Hong Kong"
World$country[World$country %in% c("Trinidad", "Tobago")] <- "Trinidad and Tobago"

# Map polygons joined with every year of happiness data.
final <- World %>% full_join(happy, by = "country")

# Per-year slices of the happiness data ...
final09 <- happy %>% filter(year == "2009")
final12 <- happy %>% filter(year == "2012")
final16 <- happy %>% filter(year == "2016")

# ... and the same slices joined onto the map polygons.
map09 <- World %>% full_join(final09, by = "country")
map12 <- World %>% full_join(final12, by = "country")
map16 <- World %>% full_join(final16, by = "country")
```
## General Maps
```{r}
# Four world maps for 2009 (GDP, HPI, life expectancy, corruption), combined
# into a single 2 x 2 interactive figure.

# Colour scale and legend settings shared by all four maps. The legend is
# hidden with "none"; guides(fill = FALSE) is deprecated in ggplot2.
map_style <- list(
  scale_fill_gradientn(colours = rainbow(7)),
  guides(fill = "none")
)

GDPMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = log(GDP)),
    color = "white", size = 0.2
  ) +
  map_style
lifeMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = lifeExpect),
    color = "white", size = 0.2
  ) +
  map_style
corrupMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = corruption),
    color = "white", size = 0.2
  ) +
  map_style
happyMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = HPI),
    color = "white", size = 0.2
  ) +
  map_style

# Build one panel caption, positioned in paper (whole-figure) coordinates.
panel_label <- function(text, x, y, xanchor) {
  list(
    yanchor = "middle", xanchor = xanchor, align = "center",
    x = x, y = y, text = text, showarrow = FALSE,
    xref = "paper", yref = "paper"
  )
}

# NOTE: this reassigns `map09`, replacing the joined map data frame used above
# with the plotly figure. To re-run this chunk on its own, rebuild the `map09`
# data frame first.
map09 <- subplot(
  ggplotly(GDPMap09),
  ggplotly(happyMap09),
  ggplotly(lifeMap09),
  ggplotly(corrupMap09),
  nrows = 2, shareX = TRUE, shareY = TRUE, titleY = FALSE, titleX = FALSE
) %>%
  layout(
    title = "Happy Planet Index 2009 ",
    annotations = list(
      panel_label("GDP per capita", x = 0.15, y = 0.54, xanchor = "left"),
      panel_label("HPI", x = 0.75, y = 0.54, xanchor = "right"),
      panel_label("Life Expectancy", x = 0.15, y = 0.0, xanchor = "left"),
      panel_label("Corruption", x = 0.8, y = 0.0, xanchor = "right")
    )
  )
map09
```
# Scatterplots
```{r}
# Scatterplots of Happiness (HPI) against life expectancy (lifeExpect),
# log(GDP) and corruption, one set per year, coloured by region.
year_data <- list("2009" = final09, "2012" = final12, "2016" = final16)
for (yr in names(year_data)) {
  yearly <- year_data[[yr]]
  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(
    ggplot(data = yearly) +
      geom_point(aes(x = lifeExpect, y = HPI, colour = region)) +
      labs(
        title = paste("Happiness vs Life Expectancy", yr),
        x = "Life Expectancy", y = "Happiness Index"
      )
  )
  print(
    ggplot(data = yearly) +
      geom_point(aes(x = log(GDP), y = HPI, colour = region)) +
      labs(
        title = paste("Happiness vs log(GDP per capita)", yr),
        x = "log(GDP per capita)", y = "Happiness Index"
      )
  )
  print(
    ggplot(data = yearly) +
      geom_point(aes(x = corruption, y = HPI, colour = region)) +
      labs(
        title = paste("Happiness vs Corruption", yr),
        x = "Corruption", y = "Happiness Index"
      )
  )
}

# Happiness by year
ggplot(data = happy) +
  geom_point(aes(x = year, y = HPI, colour = region)) +
  labs(title = "Happiness by year", x = "Year", y = "Happiness Index")

# Integer colour code per region for the 3D plot. Going through as.factor()
# is required: as.numeric() applied directly to a character column returns
# all NA (with a warning), which is what read.csv() produces by default in
# R >= 4.0.
region <- as.numeric(as.factor(final09$region))

# Scatterplot of life expectancy (lifeExpect) and GDP. Inside aes(), `region`
# resolves to the data column, not to the numeric vector defined above.
ggplot(data = final16) +
  geom_point(aes(x = log(GDP), y = lifeExpect, colour = region))

# 3D scatterplot with Happiness predicted by GDP and life expectancy
# (lifeExpect); vertical lines (type = "h") drop each point to the base plane.
scatterplot3d(
  log(final09$GDP), final09$lifeExpect, final09$HPI,
  color = region, type = "h", main = "3D Scatter Plot",
  xlab = "GDP (logged)", ylab = "Life Expectancy", zlab = "Happiness"
)
```
We used log GDP here because most of the data lies under \$25,000 per capita, but there are outliers extending to more than \$75,000 per capita. The relationship between log(GDP) and HPI seems weakly linear, with high variance.
LifeExpect and HPI seem to have a linear relationship as well for all three years. While this relationship is stronger than that of HPI and log(GDP), there is still a lot of variance.
Corruption and HPI seem to be uncorrelated, as most countries have very high levels of corruption but varying levels of happiness.
This leads us to believe that, since log(GDP) and lifeExpect are strongly related (as can be observed from the graph), each variable confounds HPI's relationship with the other. So, we created a 3D scatterplot to visualize their relationship together. The graph shows a strong relationship between the three variables, with countries from Europe (region marked by green) being the exception: they have high GDP and life expectancy but medium levels of happiness.
Finally, the dotplot of happiness (HPI) by year shows that, while the variance of HPI was higher in 2009 and 2012, countries in 2016 were in general less happy. Moreover, countries from the same region had similar levels of happiness.
Segregating the data by region allowed us to observe the relationship between each variable and region. Region was closely associated with GDP, happiness and life expectancy. While corruption was not as strongly associated with region, countries from some regions depicted similar levels of corruption.