/
data_analysis_with_R.Rmd
214 lines (165 loc) · 7.05 KB
/
data_analysis_with_R.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
---
title: "Intro to Data Analysis with R"
author: "Pri Oberoi"
date: "May 16, 2016"
output: html_document
---
# Intro to Data Analysis with R
```{r, echo=TRUE, message=FALSE}
# install.packages("ggplot2")
# install.packages("reshape2")
# install.packages("Rmisc")
library(ggplot2)
library(reshape2)
library(Rmisc)
```
## Visualization
`qplot()` is good for quick plots and is similar to `plot()`.
`ggplot()` is more verbose, but it offers more functionality.
### Scatter plots
```{r, echo = TRUE}
# Quick route: qplot() produces the whole scatter plot from a single call.
quick_scatter <- qplot(
  carat, price,
  data = diamonds,
  size = I(1),
  alpha = I(1/10),
  main = "qplot scatter plot"
)

# Layered route: the same plot, built up piece by piece with ggplot().
# It is kept in a variable so the later examples can extend it.
layered_scatter <- ggplot(data = diamonds, aes(x = carat, y = price)) +
  geom_point(size = 1, alpha = 1/10) +
  ggtitle("ggplot scatter plot")

# Draw the two versions side by side.
multiplot(quick_scatter, layered_scatter, cols = 2)

# With ggplot you can keep adding layers, e.g. a smoothed fit curve with its
# confidence region
layered_scatter +
  geom_smooth()

# or draw one panel per value of a categorical variable, like diamond cut
layered_scatter +
  geom_smooth() +
  facet_wrap(~ cut, ncol = 3)

# Your turn!
# Update that last plot so the colour of the scatterplot points varies with
# the value of "cut". You can do this by adding a "colour = " argument to the
# aes() mapping and passing in the variable name (cut).
ggplot(data = diamonds, aes(x = carat, y = price, colour = cut)) +
  geom_point(size = 1, alpha = 1/10) +
  ggtitle("ggplot scatter plot") +
  geom_smooth() +
  facet_wrap(~ cut, ncol = 3)
```
### Histograms and Bar Charts
```{r, echo = TRUE}
# Bar chart: geom_bar() counts the rows in each cut category.
cut_bars <- ggplot(diamonds, aes(x = cut)) +
  geom_bar() +
  labs(title = "Bar Chart of Diamond Cuts")

# Histogram: prices grouped into bins that are 1000 wide.
price_hist <- ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  labs(title = "Histogram of Diamond Prices")

# Show both charts next to each other.
multiplot(cut_bars, price_hist, cols = 2)
```
### Boxplots and violin plots
```{r, echo = TRUE}
# Both panels plot log(price) against cut, so the data and the aesthetic
# mapping are set up once and each geom is added on top of that shared base.
log_price_by_cut <- ggplot(data = diamonds, aes(x = cut, y = log(price)))

# Boxplot panel
box_panel <- log_price_by_cut +
  geom_boxplot() +
  ggtitle("Boxplot of Diamond Prices by Cut")

# Violin panel; scale = "count" makes each violin's area proportional to the
# number of observations in that cut.
violin_panel <- log_price_by_cut +
  geom_violin(scale = "count") +
  ggtitle("Violin plot of Diamond Prices by Cut")

multiplot(box_panel, violin_panel, cols = 2)
```
## National Broadband Dataset from NTIA
We are going to apply the data analysis toolkit we are building to NTIA's broadband availability data, by provider, for the DC region. The data are from June 2014 and were pulled from <http://www2.ntia.doc.gov/June_2014_datasets>.
We will try to figure out whether certain broadband providers show a discrepancy between their advertised upload/download speeds and their typical upload/download speeds.
```{r, echo = TRUE}
# Load the pipe-delimited wireless broadband file directly from GitHub.
data <- read.csv(
  file = "https://github.com/prioberoi/R_intro_to_data_analysis/raw/master/DC-NBM-Wireless-CSV-JUN-2014.csv",
  sep = "|"
)

# Your turn!
# Create a histogram of max advertised download speeds (maxaddown).
# You may have to set the binwidth manually.
ggplot(data, aes(x = maxaddown)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Histogram of Max Advertised Download Speeds")
```
### Clean
Column headers are values, not variable names
```{r, echo = TRUE}
# maxaddown, maxadup, typicdown and typicup are really values of a single
# "kind of speed" variable rather than variables in their own right, so the
# data are reshaped from wide to long form with melt().
id_columns <- c("provname", "dbaname", "hoconame", "endusercat...")
speed_columns <- c("maxaddown", "maxadup", "typicdown", "typicup")

# When melt() is not told which columns are ids, it guesses from the column
# types (factor and character columns become ids, everything else is
# measured). Check how provname was read in ...
is.factor(data$provname)
# ... and confirm that the four speed columns are numeric
str(data[, c("provname", speed_columns)])

# Name the id and measure columns explicitly so the result does not depend on
# how read.csv() typed each column, and so melt() has nothing to guess.
data_clean <- melt(
  data[, c(id_columns, speed_columns)],
  id.vars = id_columns,
  measure.vars = speed_columns
)

# One box per speed column: 'variable' holds the former column name and
# 'value' holds the speed.
ggplot(data_clean, aes(x = variable, y = value)) +
  geom_boxplot()
```
Multiple variables are stored in one column
```{r, echo = TRUE}
# The 'variable' column in data_clean packs two facts into one value: the
# direction of the speed (upload or download) and where the figure comes from
# (advertised or typical). Derive one column for each, then drop 'variable'.
is_upload <- data_clean$variable %in% c("maxadup", "typicup")
is_typical <- data_clean$variable %in% c("typicup", "typicdown")

data_clean$speedDirection <- ifelse(is_upload, "upload", "download")
data_clean$speedSource <- ifelse(is_typical, "typical", "advertised")
data_clean$variable <- NULL

# Advertised and typical speeds side by side within each direction.
ggplot(data_clean, aes(x = speedDirection, y = value, colour = speedSource)) +
  geom_boxplot(position = "dodge")
```
### Summarize data within groups
What are the mean advertised and typical speeds for each direction (download and upload)?
```{r, echo = TRUE}
# Split the long data by direction once, then average 'value' within each
# speedSource group (advertised, typical) for each direction in turn.
download_rows <- data_clean[data_clean$speedDirection == "download", ]
upload_rows <- data_clean[data_clean$speedDirection == "upload", ]

# Mean download speed by source
aggregate(download_rows$value, list(download_rows$speedSource), mean)
# Mean upload speed by source
aggregate(upload_rows$value, list(upload_rows$speedSource), mean)
```
### Correlation
Is there a correlation between the advertised and typical download speeds?
```{r}
# Scatterplot of the raw values. Observations that share the same
# (maxaddown, typicdown) pair are drawn on top of each other.
ggplot(data, aes(x = maxaddown, y = typicdown)) +
  geom_point()

# Same plot with a little random noise added to each position so that
# overlapping observations become visible. geom_jitter() draws the points
# itself, so it is used instead of geom_point() rather than in addition to it
# (using both would draw every observation twice: once raw, once jittered).
ggplot(data, aes(x = maxaddown, y = typicdown)) +
  geom_jitter()

# Correlation test between advertised and typical download speed
cor.test(data$maxaddown, y = data$typicdown)
```
### Comparing Samples
```{r}
# Null hypothesis: the typical mean upload and download speeds for broadband
# providers in Washington, DC are the same as the advertised speeds.
#
# Two-sample t-tests comparing 'value' between the two speedSource groups
# (advertised vs. typical), run separately for each direction. The formula
# names columns of data_clean, and 'subset' picks the rows for one direction,
# so the printed "data:" line reads "value by speedSource".

# Download speeds
t.test(
  value ~ speedSource,
  data = data_clean,
  subset = speedDirection == "download"
)

# Upload speeds
t.test(
  value ~ speedSource,
  data = data_clean,
  subset = speedDirection == "upload"
)
```
### Analysis for model selection and feature selection
```{r}
# Scatterplot matrix of the four speed columns plus spectrum
matrix_columns <- c("maxaddown", "maxadup", "typicdown", "typicup", "spectrum")
pairs(data[, matrix_columns])

# Choosing a model to predict typicdown: plot it against two candidate
# features, maxaddown and spectrum, and compare the panels side by side.
advertised_panel <- ggplot(data, aes(x = typicdown, y = maxaddown)) +
  geom_point(alpha = 0.9) +
  labs(title = "Scatterplot of typicdown and maxaddown")

spectrum_panel <- ggplot(data, aes(x = typicdown, y = spectrum)) +
  geom_point() +
  labs(title = "Scatterplot of typicdown and spectrum")

multiplot(advertised_panel, spectrum_panel, cols = 2)
```
```{r}
# Violin plots to see whether the continuous variables are all on the same
# scale and whether there is a lot of missing data.
dataset <- read.csv(
  "https://github.com/prioberoi/R_intro_to_data_analysis/raw/master/injuries.csv",
  sep = ","
)

# Your turn!
# Use str() to find all the continuous variables
str(dataset)
continuousVariable <- c("psu", "weight", "age", "length_of_stay")

# Use melt() to create a long data frame of the continuous variables, with the
# variable name stored as a categorical column. Every selected column is a
# measure, so say so explicitly instead of letting melt() guess.
dataset2 <- melt(dataset[continuousVariable], measure.vars = continuousVariable)
ggplot(dataset2, aes(x = variable, y = value)) +
  geom_violin(scale = "count") +
  ggtitle("Violin plot of continuous variables in Injuries dataset")

# Normalize the data: subtract each column's mean AND divide by its standard
# deviation. Centering alone (scale = FALSE) would leave every variable in its
# original units, so the violins still could not be compared on one axis.
# as.numeric() drops the one-column matrix that scale() returns, so each
# column stays a plain numeric vector.
dataset_normalized <- dataset
dataset_normalized[continuousVariable] <- lapply(
  dataset[continuousVariable],
  function(column) as.numeric(scale(column, center = TRUE, scale = TRUE))
)

dataset2 <- melt(
  dataset_normalized[continuousVariable],
  measure.vars = continuousVariable
)
ggplot(dataset2, aes(x = variable, y = value)) +
  geom_violin(scale = "count") +
  ggtitle("Violin plot of normalized continuous variables in Injuries dataset")
```