diff --git a/Data_Analysis_Markdown.Rmd b/Data_Analysis_Markdown.Rmd index 00a3659..5d476da 100644 --- a/Data_Analysis_Markdown.Rmd +++ b/Data_Analysis_Markdown.Rmd @@ -204,7 +204,7 @@ Figure 3 shows the mean levels of schooling for the residents with the lightest #####ordered by the size of the average difference between the two. #################################################################################################################### -#prep for Figure 3, create a new dataframe pais_edu with weighted means and CI's +#prep for Figure 3, create a new dataframe pais_ed with weighted means and CIs pais_ed <- colorr_recode_subset %>% filter(!is.na(colorr_recode), !is.na(ed), tone != "medium") %>% group_by(pais, tone) %>% @@ -313,7 +313,7 @@ eight_pais <- colorr_recode_subset %>% #model 1 eight_pais$newid <- paste(eight_pais$upm, eight_pais$estratopri) -svy <- svydesign(ids=~newid, strata=~ estratopri, data=eight_pais, weights=~weight1500) +svy <- svydesign(ids = ~ newid, strata = ~ estratopri, data = eight_pais, weights = ~ weight1500) model1.graph <- svyglm(scale(ed) ~ scale(colorr) + scale(parent_occ) + scale(Urban) + @@ -357,7 +357,7 @@ fig4 <- ggplot(frame, aes(y = coefficient, x = variable.order)) + geom_point() + ylab("95 Percent C.I. (Design -Effects Based)") + xlab("") + theme_classic() + ggtitle("Effects of Skin Color and Other Factors on\n\n Educational Attainment in Select Latin American \n\nCountries") + - theme(plot.title = element_text(lineheight =.6, face ="bold", size = 12), + theme(plot.title = element_text(lineheight = .6, face ="bold", size = 12), axis.text = element_text(colour = "black", face = "bold"), axis.title = element_text(face = "bold")) @@ -369,7 +369,7 @@ fig4.note <- arrangeGrob(fig4, ***Here is our expansion:*** -We re-run a model sepratly for each country, so that we can see the disaggregated coefficients. +We re-run a model separately for each country, so that we can see the disaggregated coefficients. 
**Here is our new graphic:** @@ -499,5 +499,117 @@ country_grid <- arrangeGrob(country.fig.note.brazil, ``` +In order to explore other possible ways skin color could predict inequality, we decided to look at the income variable. +**Here is our new graphic:** + +![](Images/Figure6_ext.jpeg) + +**Here is our code:** + +```{r} +#Grab income variable from the main data frame and creat new dataframe colorr_income + +data <- read.dta("1626360926english_merge_2010_americasbarometer_v14v3.dta") +new_data <- data %>% + filter(pais != "Canada", pais != "United States", pais != "Haiti" ) + +colorr_recode_subset$income <- new_data$q10 +colorr_income <- colorr_recode_subset %>% + filter(pais %in% c("Brazil", "Mexico", "Guatemala", "Colombia", + "Ecuador", "Bolivia", "Peru", "Dominican Republic")) %>% + filter(!is.na(pais), !is.na(income), !is.na(ed), !is.na(q2), !is.na(q1), + !is.na(parent_occ), !is.na(ur), !is.na(colorr)) %>% + mutate(ed = as.numeric(ed), + income = as.numeric(income), + parent_occ = as.numeric(parent_occ), + age = as.numeric(q2), + Female = ifelse(q1 == "Female", 1, ifelse(q1 == "Male", 0, NA)), + Urban = ifelse(ur == "Urban", 1, ifelse(ur == "Rural", 0, NA)), + Brazil = ifelse(pais == "Brazil", 1, 0), + Mexico = ifelse(pais == "Mexico", 1, 0), + Guatemala = ifelse(pais == "Guatemala", 1, 0), + Colombia = ifelse(pais == "Colombia", 1, 0), + Ecuador = ifelse(pais == "Ecuador", 1, 0), + Bolivia = ifelse(pais == "Bolivia", 1, 0), + Peru = ifelse(pais == "Peru", 1, 0), + Dominican_Republic = ifelse(pais == "Dominican Republic", 1, 0)) + + +``` + +In the data, income is an ordinal variable, rated on a scale of 1-10. This violates one of the assumptions of linear regression that the interval between values be equal. In the case of income, a value of 1 corresponds to an income of $0-25, while a value of 2 corresponds to an income of $25-50 and so on. However, the intervals are not equal. 
In order to run a linear regression on this variable, one method is to pick the mid-point of the range that the summary value represents. + +```{r} + +#Pick mid-values. For the highest value (above $750), pick $800. + + colorr_income <- colorr_income %>% + mutate(mid_income = ifelse(income == 0, 0, + ifelse(income == 1, 12.5, + ifelse(income == 2, 37.5, + ifelse(income == 3, 75, + ifelse(income == 4, 125, + ifelse(income == 5, 175, + ifelse(income == 6, 250, + ifelse(income == 7, 350, + ifelse(income == 8, 450, + ifelse(income == 9, 625, + ifelse(income == 10, 800, NA)))))))))))) + +colorr_income <- colorr_income %>% + mutate(mid_income = as.numeric(mid_income)) + +#Specify survey design +colorr_income$newid <- paste(colorr_income$upm, colorr_income$estratopri) +svy_income <- svydesign(ids = ~ newid, strata = ~ estratopri, data = colorr_income, weights = ~ weight1500) + +#Run regression +model2_graph <- svyglm(scale(mid_income) ~ scale(colorr) + + scale(ed) + + scale(parent_occ) + + scale(Urban) + + scale(q2) + + scale(Female) + + scale(Mexico) + + scale(Guatemala) + + scale(Colombia) + + scale(Ecuador) + + scale(Bolivia) + + scale(Peru) + + scale(Dominican_Republic), svy_income) + +#Add confidence intervals and coefficients + +model2_coef <- summary(model2_graph)$coefficients[2:7, 1] +model2_ci <- confint(model2_graph) +model2_cilower <- model2_ci[2:7, 1] +model2_ciupper <- model2_ci[2:7, 2] + +model2_labels <- c("Skin Color", "Education", "Parental Occupation", "Urban", "Age", "Female") + +#Create dataframe +model2_frame <- data.frame(variable = model2_labels, + coefficient = model2_coef, + ci_lower = model2_cilower, + ci_upper = model2_ciupper) + +#Create Figure 5 +fig5 <- ggplot(data = model2_frame, aes(x = variable, y = coefficient)) + geom_point() + + geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) + + geom_hline(yintercept = 0, color ="dark green", size = 1) + + coord_flip() + + ylab("95 Percent C.I. 
(Design-Effects Based)") + xlab("") + + theme_classic() + + ggtitle("Effects of Skin Color and Other Factors on\n\n Income in Select Latin American \n\nCountries") + + theme(plot.title = element_text(lineheight =.6, face ="bold", size = 12), + axis.text = element_text(colour = "black", face = "bold"), + axis.title = element_text(face = "bold")) + +fig5_note <- arrangeGrob(fig5, sub = textGrob("Source: Americas Barometer by LAPOP", x = .1, hjust = 0, vjust = 0, gp = gpar(fontface = "italic", fontsize = 10))) + +``` + +The results above are significant for the study because they bolster Telles and Steele's claim "that the bulk of countries in Latin America and the Caribbean may be safely characterized as pigmentocracies" (2012:6). From Figure 5, it is evident that skin color and education are important predictors of income while other variables such as gender, age, urban/rural domicile, and parental occupation do not yield any significant association. + diff --git a/Data_Analysis_Markdown.html b/Data_Analysis_Markdown.html index 762d795..5f36846 100644 --- a/Data_Analysis_Markdown.html +++ b/Data_Analysis_Markdown.html @@ -85,17 +85,22 @@

March 15, 2015

##### Appendix: OLS Models Predicting Years of Schooling, 2010 ##### ##### Figure 4: Effects of Skin Color and Other Factors on Educational Attainment ##### ##################################################################################################################### -library(survey) -library(foreign) -library(ggplot2) -library(gridExtra) -library(xtable) -library(dplyr) -library(grid) +library(survey) +
## Warning: package 'survey' was built under R version 3.1.2
+
library(foreign)
+
## Warning: package 'foreign' was built under R version 3.1.2
+
library(ggplot2)
+
## Warning: package 'ggplot2' was built under R version 3.1.2
+
library(gridExtra)
+
## Warning: package 'gridExtra' was built under R version 3.1.2
+
library(xtable)
+
## Warning: package 'xtable' was built under R version 3.1.2
+
library(dplyr)
+
## Warning: package 'dplyr' was built under R version 3.1.2
+
library(grid)
 
 clean.2010 <- read.dta("clean2010data.dta")
-
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
-## else paste0(labels, : duplicated levels in factors are deprecated
+
## Warning: duplicated levels in factors are deprecated
colorr_recode_subset <- read.dta("colorr_recode_subset.dta")
####################################################################################################################
 ##### Creating Figure 2: Relation between Skin Color and Educational Attainment in Latin America and the Caribbean
@@ -225,7 +230,7 @@ 

March 15, 2015

#####ordered by the size of the average difference between the two. #################################################################################################################### -#prep for Figure 3, create a new dataframe pais_edu with weighted means and CI's +#prep for Figure 3, create a new dataframe pais_ed with weighted means and CIs pais_ed <- colorr_recode_subset %>% filter(!is.na(colorr_recode), !is.na(ed), tone != "medium") %>% group_by(pais, tone) %>% @@ -323,7 +328,7 @@

March 15, 2015

#model 1 eight_pais$newid <- paste(eight_pais$upm, eight_pais$estratopri) -svy <- svydesign(ids=~newid, strata=~ estratopri, data=eight_pais, weights=~weight1500) +svy <- svydesign(ids = ~ newid, strata = ~ estratopri, data = eight_pais, weights = ~ weight1500) model1.graph <- svyglm(scale(ed) ~ scale(colorr) + scale(parent_occ) + scale(Urban) + @@ -367,7 +372,7 @@

March 15, 2015

ylab("95 Percent C.I. (Design -Effects Based)") + xlab("") + theme_classic() + ggtitle("Effects of Skin Color and Other Factors on\n\n Educational Attainment in Select Latin American \n\nCountries") + - theme(plot.title = element_text(lineheight =.6, face ="bold", size = 12), + theme(plot.title = element_text(lineheight = .6, face ="bold", size = 12), axis.text = element_text(colour = "black", face = "bold"), axis.title = element_text(face = "bold")) @@ -376,7 +381,7 @@

March 15, 2015

x = .1, hjust = 0, vjust = 0, gp = gpar(fontface = "italic", fontsize = 10)))

Here is our expansion:

-

We re-run a model sepratly for each country, so that we can see the disaggregated coefficients.

+

We re-run a model separately for each country, so that we can see the disaggregated coefficients.

Here is our new graphic:

Here is our code:

@@ -496,6 +501,107 @@

March 15, 2015

gp = gpar(fontface = "bold", fontsize = 20)), sub = textGrob("95 Percent C.I. (Design -Effects Based)", gp = gpar(fontface = "bold", fontsize = 18))) +

In order to explore other possible ways skin color could predict inequality, we decided to look at the income variable. Here is our new graphic:

+

+

Here is our code:

+
#Grab the income variable (q10) from the main data frame and create a new data frame, colorr_income
+
+data <- read.dta("1626360926english_merge_2010_americasbarometer_v14v3.dta")
+
## Warning: duplicated levels in factors are deprecated
+## Warning: duplicated levels in factors are deprecated
+## Warning: duplicated levels in factors are deprecated
+## Warning: duplicated levels in factors are deprecated
+## Warning: duplicated levels in factors are deprecated
+
new_data <- data %>%
+  filter(pais != "Canada", pais != "United States", pais != "Haiti" )
+# NOTE(review): q10 is copied by row position, so this assumes new_data has the same rows in the same order as colorr_recode_subset -- confirm.
+colorr_recode_subset$income <- new_data$q10
+colorr_income <- colorr_recode_subset %>%
+  filter(pais %in% c("Brazil", "Mexico", "Guatemala", "Colombia", 
+                     "Ecuador", "Bolivia", "Peru", "Dominican Republic")) %>%
+   filter(!is.na(pais), !is.na(income), !is.na(ed), !is.na(q2), !is.na(q1), 
+          !is.na(parent_occ), !is.na(ur), !is.na(colorr)) %>%
+  mutate(ed = as.numeric(ed),
+         income = as.numeric(income),
+         parent_occ = as.numeric(parent_occ),
+         age = as.numeric(q2), 
+         Female = ifelse(q1 == "Female", 1, ifelse(q1 == "Male", 0, NA)), 
+         Urban = ifelse(ur == "Urban", 1, ifelse(ur == "Rural", 0, NA)),
+         Brazil = ifelse(pais == "Brazil", 1, 0),
+         Mexico = ifelse(pais == "Mexico", 1, 0),
+         Guatemala = ifelse(pais == "Guatemala", 1, 0),
+         Colombia = ifelse(pais == "Colombia", 1, 0),
+         Ecuador = ifelse(pais == "Ecuador", 1, 0),
+         Bolivia = ifelse(pais == "Bolivia", 1, 0),
+         Peru = ifelse(pais == "Peru", 1, 0),
+         Dominican_Republic = ifelse(pais == "Dominican Republic", 1, 0))
+

In the data, income is an ordinal variable, rated on a scale of 1-10. Using it directly would violate the assumption of linear regression that the intervals between values are equal: a value of 1 corresponds to an income of $0-25 and a value of 2 to an income of $25-50, but the higher categories span wider ranges. In order to run a linear regression on this variable, one method is to replace each category with the mid-point of the income range that it represents.

+
#Map each income code (0-10) to a dollar mid-value. For the highest code (10, above $750), pick $800; any other code becomes NA.
+
+ colorr_income <- colorr_income  %>% 
+  mutate(mid_income = ifelse(income == 0, 0, 
+                      ifelse(income == 1, 12.5, 
+                      ifelse(income == 2, 37.5, 
+                      ifelse(income == 3, 75,
+                      ifelse(income == 4, 125,
+                      ifelse(income == 5, 175,
+                      ifelse(income == 6, 250,
+                      ifelse(income == 7, 350,
+                      ifelse(income == 8, 450,
+                      ifelse(income == 9, 625,
+                      ifelse(income == 10, 800, NA))))))))))))
+# Store mid_income as a numeric column.
+colorr_income <- colorr_income %>%
+  mutate(mid_income = as.numeric(mid_income))
+
+#Specify the survey design: cluster ids are upm/estratopri pairs (newid), strata are estratopri, weights are weight1500
+colorr_income$newid <- paste(colorr_income$upm, colorr_income$estratopri)
+svy_income <- svydesign(ids = ~ newid, strata = ~ estratopri, data = colorr_income, weights = ~ weight1500)
+
+#Run the regression of standardized mid_income on standardized predictors under the survey design; no Brazil dummy is in the formula. NOTE(review): uses q2 rather than the age column derived from it earlier -- confirm this is intended.
+model2_graph <- svyglm(scale(mid_income) ~ scale(colorr) + 
+                        scale(ed) +
+                        scale(parent_occ) + 
+                        scale(Urban) + 
+                        scale(q2) + 
+                        scale(Female) + 
+                        scale(Mexico) + 
+                        scale(Guatemala) + 
+                        scale(Colombia) + 
+                        scale(Ecuador) + 
+                        scale(Bolivia) + 
+                        scale(Peru) + 
+                        scale(Dominican_Republic), svy_income)
+
+#Pull the estimates and confidence limits for coefficient rows 2:7 only (the six predictors named in model2_labels)
+
+model2_coef <- summary(model2_graph)$coefficients[2:7, 1]
+model2_ci <- confint(model2_graph)
+model2_cilower <- model2_ci[2:7, 1]
+model2_ciupper <- model2_ci[2:7, 2]
+# Labels are listed in the same order as the first six terms of the model formula.
+model2_labels <- c("Skin Color", "Education", "Parental Occupation", "Urban", "Age", "Female")
+
+#Create the plotting data frame: one row per labelled predictor
+model2_frame <- data.frame(variable = model2_labels,
+                           coefficient = model2_coef,
+                           ci_lower = model2_cilower,
+                           ci_upper = model2_ciupper)
+
+#Create Figure 5: point estimates with confidence-interval ranges, a reference line at zero, and flipped axes
+fig5 <- ggplot(data = model2_frame, aes(x = variable, y = coefficient)) + geom_point() +
+  geom_pointrange(aes(ymin = ci_lower,  ymax = ci_upper)) +
+  geom_hline(yintercept = 0, color ="dark green", size = 1) +
+  coord_flip() +
+  ylab("95 Percent C.I. (Design-Effects Based)") + xlab("") +
+  theme_classic() +
+  ggtitle("Effects of Skin Color and Other Factors on\n\n Income in Select Latin American \n\nCountries") +
+  theme(plot.title = element_text(lineheight =.6, face ="bold", size = 12),
+        axis.text = element_text(colour = "black", face = "bold"), 
+        axis.title = element_text(face = "bold"))
+# Combine the plot with an italic source-note text grob.
+fig5_note <- arrangeGrob(fig5, sub = textGrob("Source: Americas Barometer by LAPOP", x = .1, hjust = 0, vjust = 0, gp = gpar(fontface = "italic", fontsize = 10)))
+

The results above are significant for the study because they bolster Telles and Steele’s claim “that the bulk of countries in Latin America and the Caribbean may be safely characterized as pigmentocracies” (2012:6). From Figure 5, it is evident that skin color and education are important predictors of income while other variables such as gender, age, urban/rural domicile, and parental occupation do not yield any significant association.

diff --git a/Images/Figure6_ext.jpeg b/Images/Figure6_ext.jpeg new file mode 100644 index 0000000..6ad6da9 Binary files /dev/null and b/Images/Figure6_ext.jpeg differ