Code.rmd

---
title: "STA 230 Project 1"
author: "Stella Lee, Tanvi Jindal"
date: "2/15/2019"
output:
  html_document: default
  pdf_document: default
  word_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#install.packages("plotly")
#install.packages("scatterplot3d")
library(plotly)
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(maps) 
library(scatterplot3d)
```

## Upload data
```{r}
# Locations of the raw CSV inputs, keyed by the object each one is loaded into.
data_urls <- c(
  happy09 = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/2009HPI.csv",
  happy12 = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/2012HPI.csv",
  happy16 = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/2016HPI.csv",
  variables = "https://raw.githubusercontent.com/stellasylee/WorldHappinessData/master/Data/OtherVariables.csv"
)

# Happy Planet Index tables for 2009, 2012 and 2016
happy09 <- read.csv(file = data_urls[["happy09"]])
happy12 <- read.csv(file = data_urls[["happy12"]])
happy16 <- read.csv(file = data_urls[["happy16"]])
# Additional per-country, per-year indicators
variables <- read.csv(file = data_urls[["variables"]])
```

## Data Cleaning
```{r}
# Keep only the columns used later, give them short year-suffixed names, and
# harmonise country spellings so every table can be joined on `country`.

# 2009
happy09 <- happy09 %>%
  select(country, HPI09 = HPI, GDP09 = GDP.per.capita....PPP.) %>%
  mutate(country = recode(
    as.character(country),
    "Burma" = "Myanmar",
    "Korea" = "South Korea",
    "Congo" = "Republic of Congo",
    "Congo, Dem. Rep. of the" = "Democratic Republic of the Congo"
  ))

# 2012
happy12 <- happy12 %>%
  select(
    country = Country,
    HPI12 = Happy.Planet.Index,
    GDP12 = GDP.capita...PPP.
  ) %>%
  mutate(country = recode(
    as.character(country),
    "Korea" = "South Korea",
    "Congo" = "Republic of Congo",
    "Congo, Dem. Rep. of the" = "Democratic Republic of the Congo"
  ))

# 2016 (the only release from which the region column is kept)
happy16 <- happy16 %>%
  select(
    country = Country,
    region = Region,
    HPI16 = Happy.Planet.Index,
    GDP16 = X.GDP.capita...PPP..
  ) %>%
  mutate(country = as.character(country))

# Other variables: restrict to the three HPI years, then trim and rename
variables <- variables %>%
  filter(year %in% c("2009", "2012", "2016"))
var <- variables %>%
  select(
    country,
    year,
    lifeExpect = Healthy.life.expectancy.at.birth,
    corruption = Perceptions.of.corruption
  ) %>%
  mutate(
    # year is stored as text so it can be matched against the reshaped HPI data
    year = as.character(year),
    country = recode(
      as.character(country),
      "Congo (Brazzaville)" = "Republic of Congo",
      "Congo (Kinshasa)" = "Democratic Republic of the Congo",
      "Hong Kong S.A.R. of China" = "Hong Kong",
      "United States" = "United States of America",
      "Palestinian Territories" = "Palestine"
    )
  )
```

## Data Merging  
```{r}
# Combine the three yearly tables into one wide table keyed by country.
#
# Country spellings have to agree *before* joining: full_join() keeps
# unmatched keys as separate rows, so renaming afterwards would leave two
# half-empty rows for one country instead of a single complete row.
# The earlier post-join patch indexed `happy` with a mask built from
# `happy09` (a different table, whose "Burma" entries had already been
# renamed), so it could never match anything.
to_myanmar <- function(df) {
  # %in% never yields NA, so missing country names are simply left alone
  df$country[df$country %in% "Burma"] <- "Myanmar"
  df
}
happy <- full_join(
  x = to_myanmar(happy16), y = to_myanmar(happy12), by = "country"
)
happy <- full_join(x = happy, y = to_myanmar(happy09), by = "country")

# Strip currency symbols / thousands separators from the GDP columns.
# parse_number() expects character input; as.character() keeps this working
# if a column was read as a factor or as a plain number.
happy$GDP09 <- parse_number(as.character(happy$GDP09))
happy$GDP12 <- parse_number(as.character(happy$GDP12))
happy$GDP16 <- parse_number(as.character(happy$GDP16))
```

## Tidy Data
```{r}
# Reshape to one row per country-year with HPI and GDP as separate columns,
# then attach the per-year indicators from `var`.

# Stack the six year-suffixed measurement columns. They are listed by name
# rather than by position (3:8) so a change in column order upstream cannot
# silently pull in the wrong columns.
happy <- gather(
  data = happy, key = year, value = Index,
  HPI16, GDP16, HPI12, GDP12, HPI09, GDP09
)
# Split e.g. "HPI16" into variable = "HPI" and year = "16". Both prefixes are
# three characters long, so split after position 3 counted from the left.
# A negative position is avoided on purpose: its meaning changed in
# tidyr 1.0.0, where sep = -3 would yield "HP" / "I16".
happy <- separate(happy, year, into = c("variable", "year"), sep = 3)
# Expand the two-digit suffix to a four-digit year so it matches `var$year`
happy$year <- paste0("20", happy$year)
happy <- spread(data = happy, key = variable, value = Index)
happy <- left_join(x = happy, y = var, by = c("country", "year"))
``` 

## Visualization
```{r}
# Build the map tables: world polygons joined to the happiness data,
# once for all years and once per year.

# Base world map polygons
World <- map_data("world")
# Rename the columns so the country identifier is called `country`,
# matching the happiness dataset
colnames(World) <- c("long", "lat", "group", "order", "country", "subregion")

# Align the map's country spellings with the happiness dataset
World$country <- recode(
  World$country,
  "USA" = "United States of America",
  "UK" = "United Kingdom"
)
# Hong Kong is identified through the subregion column
World$country[which(World$subregion == "Hong Kong")] <- "Hong Kong"
# The two islands are separate map entries but one country in the data
World$country[World$country %in% c("Trinidad", "Tobago")] <-
  "Trinidad and Tobago"

# Map joined with every year of happiness data
final <- World %>% full_join(happy, by = "country")

# One happiness table per year ...
final09 <- happy %>% filter(year == "2009")
final12 <- happy %>% filter(year == "2012")
final16 <- happy %>% filter(year == "2016")

# ... and the matching per-year map tables
map09 <- World %>% full_join(final09, by = "country")
map12 <- World %>% full_join(final12, by = "country")
map16 <- World %>% full_join(final16, by = "country")
```

## General Maps
```{r}
# Four 2009 world maps (log GDP, HPI, life expectancy, corruption) combined
# into one interactive 2 x 2 plotly grid.

# Styling shared by all four maps: rainbow fill scale with the legend hidden.
# "none" replaces the deprecated `guides(fill = FALSE)` form.
map_style <- list(
  scale_fill_gradientn(colours = rainbow(7)),
  guides(fill = "none")
)

GDPMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = log(GDP)),
    color = "white", size = 0.2
  ) +
  map_style
lifeMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = lifeExpect),
    color = "white", size = 0.2
  ) +
  map_style
corrupMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = corruption),
    color = "white", size = 0.2
  ) +
  map_style
happyMap09 <- ggplot() +
  geom_polygon(
    data = map09,
    aes(x = long, y = lat, group = group, fill = HPI),
    color = "white", size = 0.2
  ) +
  map_style

# Builds one panel caption, positioned in paper (whole-figure) coordinates.
panel_label <- function(text, x, y, xanchor) {
  list(
    yanchor = "middle", xanchor = xanchor, align = "center",
    x = x, y = y, text = text, showarrow = FALSE,
    xref = "paper", yref = "paper"
  )
}

# NOTE: from here on `map09` is the plotly widget, no longer the joined data
# frame. The ggplot objects above were already built from the data frame.
map09 <- subplot(
  ggplotly(GDPMap09),
  ggplotly(happyMap09),
  ggplotly(lifeMap09),
  ggplotly(corrupMap09),
  nrows = 2, shareX = TRUE, shareY = TRUE, titleY = FALSE, titleX = FALSE
) %>%
  layout(
    title = "Happy Planet Index 2009 ",
    annotations = list(
      panel_label("GDP per capita", x = 0.15, y = 0.54, xanchor = "left"),
      panel_label("HPI", x = 0.75, y = 0.54, xanchor = "right"),
      panel_label("Life Expectancy", x = 0.15, y = 0.0, xanchor = "left"),
      panel_label("Corruption", x = 0.8, y = 0.0, xanchor = "right")
    )
  )
map09
```
 
# Scatterplots
```{r}
# Scatterplots of Happiness (HPI) vs Life Expectancy (lifeExpect), log(GDP)
# and corruption, for each of the three years
year_frames <- list("2009" = final09, "2012" = final12, "2016" = final16)
for (yr in names(year_frames)) {
  yearly <- year_frames[[yr]]
  # ggplot objects built inside a loop are not auto-printed, so print() each
  print(
    ggplot(data = yearly) +
      geom_point(aes(x = lifeExpect, y = HPI, colour = region)) +
      labs(
        title = paste("Happiness vs Life Expectancy", yr),
        x = "Life Expectancy", y = "Happiness Index"
      )
  )
  print(
    ggplot(data = yearly) +
      geom_point(aes(x = log(GDP), y = HPI, colour = region)) +
      labs(
        title = paste("Happiness vs log(GDP per capita)", yr),
        x = "log(GDP per capita)", y = "Happiness Index"
      )
  )
  print(
    ggplot(data = yearly) +
      geom_point(aes(x = corruption, y = HPI, colour = region)) +
      labs(
        title = paste("Happiness vs Corruption", yr),
        x = "Corruption", y = "Happiness Index"
      )
  )
}

# Happiness by year
ggplot(data = happy) +
  geom_point(aes(x = year, y = HPI, colour = region)) +
  labs(title = "Happiness by year", x = "Year", y = "Happiness Index")

# Integer colour codes per region for the base-graphics 3D plot below.
# Going through factor() makes this work whether `region` is a character
# column or already a factor; as.numeric() on region names alone gives NA.
region <- as.numeric(factor(final09$region))

# Scatterplot of life expectancy (lifeExpect) and GDP.
# `region` inside aes() refers to the column of final16, not the vector above.
ggplot(data = final16) +
  geom_point(aes(x = log(GDP), y = lifeExpect, colour = region))

# 3D scatterplot with Happiness predicted by GDP and life expectancy
scatterplot3d(
  log(final09$GDP), final09$lifeExpect, final09$HPI,
  color = region, type = "h", main = "3D Scatter Plot",
  xlab = "GDP (logged)", ylab = "Life Expectancy", zlab = "Happiness"
)
```

We used log GDP here because most of the data lies under \$25000 per capita, but there are outliers extending to more than \$75000 per capita. The relationship between log(GDP) and HPI seems weakly linear, with high variance.
LifeExpect and HPI seem to have a linear relationship as well for all three years. While this relationship is stronger than that of HPI and log(GDP), there is still a lot of variance.
Corruption and HPI seem to be uncorrelated, as most countries have very high levels of corruption but varying levels of happiness.
This makes us believe that, since log(GDP) and lifeExpect are strongly related (as can be observed from the graph), each variable is confounding HPI's relationship with the other. So, we created a 3D scatterplot to visualize their relationship together. The graph shows a strong relationship between the three variables, with countries from Europe (region marked by green) being the exception: they have high GDP and life expectancy but medium levels of happiness.
Finally, the dotplot of happiness (HPI) by year shows that, while the variance of HPI was higher in 2009 and 2012, countries in 2016 were in general less happy. Moreover, countries from the same region had similar levels of happiness.
Segregating the data by region allowed us to observe the relationship between each variable and region. Region was closely associated with GDP, happiness and life expectancy. While corruption was not as strongly associated with region, countries from some regions depicted similar levels of corruption.