-
Notifications
You must be signed in to change notification settings - Fork 18
/
looping.R
124 lines (96 loc) · 4.51 KB
/
looping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
setwd("C:/Users/KASUTAJA/Documents/RLadies/Looping")
library(dplyr)
library(tidyr)
install.packages("countrycode") #codes country names into intnl org's classifications
install.packages("countrycode")
library(countrycode)
install.packages("tibble")
library(tibble) #tibble - devt of dataframe
library(ggplot2)
require(dplyr)
require(tidyr)
#download data from FAO website (fao.org) -> Data -> Crops
#Looping
wheat<-read.csv("wheat_yield.csv", header=T)
wheat[,c(1:3, 5:9, 11, 13:14)] <- NULL #get rid of these columns
#long > wide -- spread, create new df
wheat<- spread(wheat, key=Area, value=Value) #key will become column header
wheat<- wheat[, c(2:134)] / 10000 #divide all columns, get rid of 1st column (year)
#create separate year vector
year <- 1961:2014
year
class(year)
class(wheat)
#want to see the wheat yield trends
#two method for looping
#matrix method for looping and performing linear regression
#create new df-s first, can be used as a matrix for diff calculations
df <- matrix(NA, nrow=ncol(wheat), ncol=1) #first fill w NA -> this will be for r2
cf <- matrix(NA, nrow=ncol(wheat), ncol=2) #coefficients - need two values
pval <- matrix(NA, nrow=ncol(wheat), ncol=2) #also need two values
for (i in 1:ncol(wheat)) { #go through column 1 to end
result <- lm(wheat[[i]]~year) #double brackets - go through columns, regresses on yr, will put the lm into object called result, will do it repeatedly for each country
df[i] = summary(result)$r.squared
cf [i,] = coefficients(result) #put in i-th row, both columns; slope & intercept
pval[i,] = summary(result)$coef[,4] #4th column bc pvalue's position is 4th, all rows
}
result
#rbind looping - slow if have a lot of data
r_sqr <- data.frame()
coeff<- data.frame()
pvalues<- data.frame()
for (i in 1:ncol(wheat)) { #go through column 1 to end
result <- lm(wheat[[i]]~year) #double brackets - go through columns, regresses on yr, will put the lm into object called result, will do it repeatedly for each country
df = summary(result)$r.squared
cf = coefficients(result) #put in i-th row, both columns; slope & intercept
pval = summary(result)$coef[,4] #4th column bc pvalue's position is 4th, all rows
r_sqr <- rbind(r_sqr, df)
coeff <- rbind(coeff, cf)
pvalues <- rbind(pvalues,pval)
}
reg_test <- cbind(r_sqr, coeff, pvalues)
countries<-colnames(wheat)
rownames(reg_test)<-countries
columns <- c("r2_linear", "intercept", "slope", "p_intercept", "p_slope")
colnames(reg_test)<- columns #get linear regr results for all countries in one set
#polynomial regression
r_sqr <- data.frame()
coeff<- data.frame()
pvalues<- data.frame()
for (j in 1:ncol(wheat)) { #go through column 1 to end
result <- lm(wheat[[j]] ~ year + I(year^2)) #double brackets - go through columns, regresses on yr, will put the lm into object called result, will do it repeatedly for each country+polyn regr
df = summary(result)$r.squared
cf = coefficients(result) #put in i-th row, both columns; slope & intercept
pval = summary(result)$coef[,4] #4th column bc pvalue's position is 4th, all rows
r_sqr <- rbind(r_sqr, df)
coeff <- rbind(coeff, cf)
pvalues <- rbind(pvalues,pval)
}
reg.test2 <- cbind(r_sqr, coeff, pvalues)
#names(summaryobject) gives a list in lm model
#third loop to get f-test
ftest_p <- matrix(NA, nrow=ncol(wheat), ncol=1)
#when predfeine as matrix, have to put the position in the loop
#pf gives the f-distribution
#unname command gives a value and attribute of that value
for (k in 1:133) {
result <- lm(wheat[[k]] ~year + I(year^2))
sm <- summary(result)
ftest_p[k] <- unname(pf(sm$fstatistic[1], sm$fstatistic[2], sm$fstatistics[3], lower.tail=F))
}
#pf calculates f-test value based
#import population and gdp data and join to reg_test3
world_pop <- read.csv("world_pop.csv", header=T, na.strings="..") #na.strings for WB NA data
world_pop[, c(1:3)] <- NULL #all rows of col 1- 3 should be deleted
world_pop <- world_pop[1:217,]
pop_countries <- world_pop$country.Code
pop_countries2 <- as.character(pop_countries)
world_pop <- world_pop[,2:57]
rownames(world_pop)<- pop_countries2
#r doesnt like numbers for column names
#summarise_each is from dplyr - by columns
#mutate - to create col from 2 cols, transmute - same thing, but want to have the outcome as sep df
#country.name is a parameter in countrycode pckg
#join - column names have to be same in tibble pckg: rownames to col
#innerjoin - join 2 datasets, remove rows if they are not the same
?subset