/
RQ3_analysis.R
584 lines (506 loc) · 35.9 KB
/
RQ3_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
library(extrafont)
library(ggplot2)
library(questionr)
library(dplyr)
library(plyr)
library(tibble)
library(reshape2)
library(RColorBrewer)
library(vioplot)
# Register the fonts imported by extrafont with R's output devices.
loadfonts()
# The CSV paths below are relative to the current working directory, so the
# script must be started from the repository root. (The former `setwd(".")`
# call was a no-op and has been removed.)
# `header` is spelled out in full: the original `head = TRUE` only worked
# through partial argument matching.
data <- read.csv(
  file = "./labelledData/labelled_commits.csv",
  header = TRUE, sep = ",", stringsAsFactors = TRUE
)
dataMS <- read.csv(
  file = "./labelledData/labelled_commits_MobileSoft_2018.csv",
  header = TRUE, sep = ",", stringsAsFactors = TRUE
)
##################### Data import and checks
# The top-level category "Debug" is reported as "Testing & debugging":
# rewrite the factor levels of the three level-1 label columns of both datasets.
data[c("level1_1", "level1_2", "level1_3")] <- lapply(
  data[c("level1_1", "level1_2", "level1_3")],
  function(labels) {
    levels(labels) <- gsub("Debug", "Testing & debugging", levels(labels))
    labels
  }
)
dataMS[c("level1_1", "level1_2", "level1_3")] <- lapply(
  dataMS[c("level1_1", "level1_2", "level1_3")],
  function(labels) {
    levels(labels) <- gsub("Debug", "Testing & debugging", levels(labels))
    labels
  }
)
# Sanity check: commits whose first main category is blank (printed for manual
# inspection). subset() drops NA comparisons, exactly like which() would.
emptyMainTag <- subset(data, level1_1 == "")
print(emptyMainTag$commitMessage)
emptyMainTag2 <- subset(dataMS, level1_1 == "")
print(emptyMainTag2$commitMessage)
# Sanity check: repositories that have no app category assigned
emptyCategory <- subset(data, appCategory == "")
print(unique(emptyCategory$repo))
emptyCategory2 <- subset(dataMS, appCategory == "")
print(unique(emptyCategory2$repo))
# Renames the levels of an activity-label factor so that they read as in the
# paper (e.g. "NewFeature" -> "A.1 - New feature").
#
# The (pattern, label) pairs are applied one after the other with gsub(), in
# exactly this order. The order and the "^" anchors matter, because some raw
# labels are substrings of labels produced by earlier steps (e.g. "UI" vs
# "GUI", "Image" vs "E.3 - Images", "Test" vs "I - Testing & debugging").
#
# level: a factor of raw activity labels.
# Returns the same factor with renamed levels.
.renameActivityLevels <- function(level) {
  labelMap <- matrix(c(
    "Enhancement",             "A - App enhancement",
    "NewFeature",              "A.1 - New feature",
    "Changes",                 "A.2 - Feature changes",
    "Usability",               "A.3 - Usability",
    "Language",                "A.4 - Language",
    "Lifecycle",               "A.5 - Android lifecycle",
    "Monetization",            "A.6 - Monetization",
    "Utility",                 "A.7 - Utility",
    "Bug Fix",                 "B - Bug fixing",
    "App Specific",            "B.1 - App specific",
    "Performance",             "B.2 - Performance",
    "Security",                "B.3 - Security",
    "Crash",                   "B.4 - Crash",
    "Energy",                  "B.5 - Energy",
    "Project management",      "C - Project management",
    "MetaGithub",              "C.1 - GitHub-related",
    "Release",                 "C.2 - Release management",
    "TODO",                    "C.3 - Todo item",
    "Documentation",           "C.4 - Documentation",
    "Build",                   "C.5 - Build",
    "Manifest",                "C.6 - Manifest",
    "IDE",                     "C.7 - IDE",
    "Code Re-Organization",    "D - Code re-organization",
    "Refactoring",             "D.1 - Refactoring",
    "Cleanup",                 "D.2 - Code cleanup",
    "FeatureRemove",           "D.3 - Feature removal",
    "Resize",                  "D.4 - Reduce app size",
    "CodeDead",                "D.5 - Dead code elimination",
    "^UI",                     "E - User experience improvement",
    "GUI",                     "E.1 - GUI",
    "Strings",                 "E.2 - Strings",
    "Icon",                    "E.3 - Images",
    "Gesture",                 "E.4 - Gesture",
    "Orientation",             "E.5 - Orientation",
    "Dialog",                  "E.6 - Dialog",
    "Menu",                    "E.7 - Menu",
    "Storage",                 "F - Storage management",
    "Settings",                "F.1 - Settings",
    "DB",                      "F.2 - Local database",
    "FileSystem",              "F.3 - File system",
    "Sensing & communication", "G - Sensing & communication",
    "Networking",              "G.1 - Network",
    "Audio",                   "G.2 - Audio",
    "^Image",                  "G.3 - Image",
    "Sensors",                 "G.4 - Sensor",
    "Camera",                  "G.5 - Camera",
    "Messaging",               "G.6 - Messaging",
    "Call",                    "G.7 - Call",
    "Mic",                     "G.8 - Microphone",
    "API Management",          "H - API management",
    "Library",                 "H.1 - Library",
    "API_Android",             "H.2 - Android API",
    "API_REST",                "H.3 - REST API",
    "Deprecation",             "H.4 - Deprecation",
    "^Testing & debugging",    "I - Testing & debugging",
    "^Test",                   "I.1 - Testing",
    "Logging",                 "I.2 - Logging",
    "Debug",                   "I.3 - Debugging"
  ), ncol = 2, byrow = TRUE, dimnames = list(NULL, c("pattern", "label")))
  for (i in seq_len(nrow(labelMap))) {
    levels(level) <- gsub(labelMap[i, "pattern"], labelMap[i, "label"], levels(level))
  }
  level
}

# Reshapes a labelled-commit data frame from wide to long format: one row per
# (commit, label slot), with the slot's labels in columns "level_1" (main
# category) and "level_2" (sub-category). Rows whose two labels are both blank
# are dropped; they are only a by-product of the reshape.
#
# commits: data frame with columns commit_sha, level1_1..level1_3 and
#          level2_1..level2_3.
.reshapeLabelsToLong <- function(commits) {
  long <- reshape(
    commits,
    varying = c("level2_1", "level2_2", "level2_3", "level1_1", "level1_2", "level1_3"),
    direction = "long", idvar = "commit_sha", sep = "_"
  )
  # reshape() names the stacked columns "level1"/"level2"; give them the "_"
  # separator needed by the second reshape. They are renamed by name rather
  # than by position so the code does not depend on the column count.
  names(long)[names(long) == "level1"] <- "level_1"
  names(long)[names(long) == "level2"] <- "level_2"
  long[which(long$level_1 != "" | long$level_2 != ""), ]
}

# Stacks "level_1" and "level_2" into a single factor column "level", after
# discarding commits whose main category is "Turtle". The levels are renamed
# as in the paper and sorted in decreasing alphabetical order.
#
# commitsLong: output of .reshapeLabelsToLong().
.collapseActivityLevels <- function(commitsLong) {
  # Remove all items with main category = Turtle
  commitsLong <- commitsLong[which(commitsLong$level_1 != "Turtle"), ]
  # Drop the "time" column of the first reshape; the second one re-creates it
  commitsLong <- commitsLong[, names(commitsLong) != "time", drop = FALSE]
  # commit_sha is no longer unique, so reshape's default row names
  # ("<id>.<time>") would clash: supply one integer row name per output row
  # (two stacked copies of the input).
  stacked <- reshape(
    commitsLong,
    varying = c("level_1", "level_2"),
    direction = "long", idvar = "commit_sha", sep = "_",
    new.row.names = seq_len(2L * nrow(commitsLong))
  )
  stacked <- stacked[which(stacked$level != ""), ]
  stacked$level <- .renameActivityLevels(stacked$level)
  # Order the levels alphabetically so the results are easy to check
  stacked$level <- factor(
    stacked$level,
    levels = sort(as.character(levels(stacked$level)), decreasing = TRUE)
  )
  stacked
}

# Compares the distribution of developer activities in `data` (ICSME dataset,
# possibly filtered) against `dataMS` (MobileSoft dataset).
#
# data, dataMS: labelled-commit data frames (see .reshapeLabelsToLong()).
# totalICSME, totalMS: denominators of the percentages stored in the debug
#   table `a`. The defaults are the constants hard-coded in the original
#   script; presumably the sizes of the two labelled samples - TODO confirm.
#
# Returns (and prints) a named numeric vector: "chisquare" is the p-value of
# the Chi Square test on the Source x activity contingency table, "cramer" is
# its Cramer's V.
#
# Side effects, kept for debugging purposes: the global `dataLong` receives
# the long-format ICSME data (before the "Turtle" filter) and the global `a`
# receives the per-activity counts and percentages of both datasets plus
# their difference.
analyzePlotData <- function(data, dataMS, totalICSME = 2112, totalMS = 5000) {
  ##################### Data reshaping
  dataLong <- .reshapeLabelsToLong(data)
  dataLongMS <- .reshapeLabelsToLong(dataMS)
  dataLong <<- dataLong

  ##################### It's plotting time!
  dataLong <- .collapseActivityLevels(dataLong)
  dataLongMS <- .collapseActivityLevels(dataLongMS)

  # counts for each type of developer activity for both datasets
  countsICSME <- plyr::count(dataLong$level)
  countsMS <- plyr::count(dataLongMS$level)
  # let's check them visually first
  print(countsICSME)
  print(countsMS)

  # The following categories were empty in the ICSME dataset, so they get an
  # explicit zero count. Only those really absent are added: this avoids
  # duplicated rows when a (filtered) dataset does contain one of them, and
  # keeps `freq` numeric instead of coercing the column to character.
  countsICSME$x <- as.character(countsICSME$x)
  emptyInICSME <- c("G.6 - Messaging", "G.7 - Call", "B.5 - Energy", "G.8 - Microphone")
  missingInICSME <- setdiff(emptyInICSME, countsICSME$x)
  if (length(missingInICSME) > 0) {
    countsICSME <- rbind(
      countsICSME,
      data.frame(x = missingInICSME, freq = 0L, stringsAsFactors = FALSE)
    )
  }
  countsICSME$x <- as.factor(countsICSME$x)

  # here we store the percentages
  countsICSME$percICSME <- (countsICSME$freq / totalICSME) * 100
  countsMS$percMS <- (countsMS$freq / totalMS) * 100
  names(countsICSME)[names(countsICSME) == "freq"] <- "freqICSME"
  names(countsMS)[names(countsMS) == "freq"] <- "freqMS"

  # debugging aid: the global `a` holds the join of the two count tables and,
  # in `diff`, the ICSME percentage minus the MobileSoft one
  comparison <- inner_join(countsICSME, countsMS, by = "x")
  comparison$diff <- comparison$percICSME - comparison$percMS
  a <<- comparison

  # we tag each row with its Source ("ICSME" or "MS", i.e. MobileSoft) and
  # merge the two datasets into a single data frame
  dataLong$Source <- "ICSME"
  dataLongMS$Source <- "MS"
  dataTOTAL <- rbind(
    dataLong[, c("level", "Source")],
    dataLongMS[, c("level", "Source")]
  )
  # we clean the level column by removing unused levels
  dataTOTAL$level <- droplevels(dataTOTAL$level)

  # contingency table Source x activity, its Chi Square test and Cramer's V
  tableTOTAL <- table(dataTOTAL$Source, dataTOTAL$level)
  chiSquareTest <- chisq.test(tableTOTAL, correct = TRUE, simulate.p.value = FALSE)
  cramersV <- cramer.v(tableTOTAL)

  # we print and return the obtained Chi Square p-value and its Cramer's V
  result <- c("chisquare" = chiSquareTest$p.value, "cramer" = cramersV)
  print(result)
  result
}
# here we will store all Chi Square p-values and Cramer's Vs, one entry per
# maintainability issue type plus "ALL"
chisquare.pvalues <- c()
cramer <- c()
######### in the following loop we compute the Chi Square values and Cramer's Vs for each type of maintainability issue
# NOTE(review): `snapshots` is not created in the visible part of this script;
# it is expected to exist already, with its 32nd column holding the snapshot
# identifier and one "outliers.up.<type>_density" flag column per issue type.
for (issueType in c("MT", "US", "UI", "MC", "UC", "DP")) {
  outlierColumn <- paste0("outliers.up.", tolower(issueType), "_density")
  # keep only the snapshots flagged as outliers for this issue type
  filteredSnapshots <- snapshots[which(snapshots[[outlierColumn]] == 1), ]
  names(filteredSnapshots)[32] <- "snapshotId"
  filteredSnapshots <- subset(filteredSnapshots, select = c(snapshotId))
  # The join column is listed once: repeating it
  # (by = c("snapshotId", "snapshotId")) is redundant and recent dplyr
  # versions are expected to reject duplicated join columns.
  filteredData <- inner_join(filteredSnapshots, data, by = "snapshotId")
  # The MT stanza used to overwrite filteredData with the unfiltered `data`
  # right before this call, so "MT" was silently computed on the whole dataset
  # (i.e. the same input as "ALL"). The filter is now honoured for every type.
  result <- analyzePlotData(filteredData, dataMS)
  chisquare.pvalues[issueType] <- result[1]
  cramer[issueType] <- result[2]
}
# same analysis on the commits of all snapshots, without any outlier filter
filteredSnapshots <- snapshots
names(filteredSnapshots)[32] <- "snapshotId"
filteredSnapshots <- subset(filteredSnapshots, select = c(snapshotId))
filteredData <- inner_join(filteredSnapshots, data, by = "snapshotId")
result <- analyzePlotData(filteredData, dataMS)
chisquare.pvalues["ALL"] <- result[1]
cramer["ALL"] <- result[2]
# this function creates the large heatmap in which we visually correlate issue types and developer's activities
buildTiles <- function() {
# Let's reshape the data from wide to long format
dataLong <- reshape(data, varying=c("level2_1", "level2_2", "level2_3", "level1_1", "level1_2", "level1_3"), direction="long", idvar="commit_sha", sep="_")
# change the names of the added columns so to have the "_" separator
names(dataLong)[10] <- "level_1"
names(dataLong)[9] <- "level_2"
# Remove all items for which there is no level1 or level2 defined, it is a consequence of the reshape instruction
dataLong <- dataLong[which(dataLong$level_1 != "" | dataLong$level_2 != ""),]
# Remove all items with main category = Turtle
dataLong <- dataLong[which(dataLong$level_1 != "Turtle"),]
# Remove the "time" column
dataLong <- subset(dataLong, select = -c(time) )
# we combine the level_1 and level_2 variables
dataLong <- reshape(dataLong, varying=c("level_1", "level_2"), direction="long", idvar="commit_sha", sep="_", new.row.names=c(1:10000000))
#names(dataLong)[8] <- "level"
dataLong <- dataLong[which(dataLong$level != ""),]
dataLong <- subset(dataLong, select = c(ID, snapshotId, level))
# we rename all levels of dataLong$level to make them more readable
levels(dataLong$level) <- gsub("Enhancement", "A - App enhancement", levels(dataLong$level))
levels(dataLong$level) <- gsub("NewFeature", "A.1 - New feature", levels(dataLong$level))
levels(dataLong$level) <- gsub("Changes", "A.2 - Feature changes", levels(dataLong$level))
levels(dataLong$level) <- gsub("Usability", "A.3 - Usability", levels(dataLong$level))
levels(dataLong$level) <- gsub("Language", "A.4 - Language", levels(dataLong$level))
levels(dataLong$level) <- gsub("Lifecycle", "A.5 - Android lifecycle", levels(dataLong$level))
levels(dataLong$level) <- gsub("Monetization", "A.6 - Monetization", levels(dataLong$level))
levels(dataLong$level) <- gsub("Utility", "A.7 - Utility", levels(dataLong$level))
levels(dataLong$level) <- gsub("Bug Fix", "B - Bug fixing", levels(dataLong$level))
levels(dataLong$level) <- gsub("App Specific", "B.1 - App specific", levels(dataLong$level))
levels(dataLong$level) <- gsub("Performance", "B.2 - Performance", levels(dataLong$level))
levels(dataLong$level) <- gsub("Security", "B.3 - Security", levels(dataLong$level))
levels(dataLong$level) <- gsub("Crash", "B.4 - Crash", levels(dataLong$level))
levels(dataLong$level) <- gsub("Energy", "B.5 - Energy", levels(dataLong$level))
levels(dataLong$level) <- gsub("Project management", "C - Project management", levels(dataLong$level))
levels(dataLong$level) <- gsub("MetaGithub", "C.1 - GitHub-related", levels(dataLong$level))
levels(dataLong$level) <- gsub("Release", "C.2 - Release management", levels(dataLong$level))
levels(dataLong$level) <- gsub("TODO", "C.3 - Todo item", levels(dataLong$level))
levels(dataLong$level) <- gsub("Documentation", "C.4 - Documentation", levels(dataLong$level))
levels(dataLong$level) <- gsub("Build", "C.5 - Build", levels(dataLong$level))
levels(dataLong$level) <- gsub("Manifest", "C.6 - Manifest", levels(dataLong$level))
levels(dataLong$level) <- gsub("IDE", "C.7 - IDE", levels(dataLong$level))
levels(dataLong$level) <- gsub("Code Re-Organization", "D - Code re-organization", levels(dataLong$level))
levels(dataLong$level) <- gsub("Refactoring", "D.1 - Refactoring", levels(dataLong$level))
levels(dataLong$level) <- gsub("Cleanup", "D.2 - Code cleanup", levels(dataLong$level))
levels(dataLong$level) <- gsub("FeatureRemove", "D.3 - Feature removal", levels(dataLong$level))
levels(dataLong$level) <- gsub("Resize", "D.4 - Reduce app size", levels(dataLong$level))
levels(dataLong$level) <- gsub("CodeDead", "D.5 - Dead code elimination", levels(dataLong$level))
levels(dataLong$level) <- gsub("^UI", "E - User experience improvement", levels(dataLong$level))
levels(dataLong$level) <- gsub("GUI", "E.1 - GUI", levels(dataLong$level))
levels(dataLong$level) <- gsub("Strings", "E.2 - Strings", levels(dataLong$level))
levels(dataLong$level) <- gsub("Icon", "E.3 - Images", levels(dataLong$level))
levels(dataLong$level) <- gsub("Gesture", "E.4 - Gesture", levels(dataLong$level))
levels(dataLong$level) <- gsub("Orientation", "E.5 - Orientation", levels(dataLong$level))
levels(dataLong$level) <- gsub("Dialog", "E.6 - Dialog", levels(dataLong$level))
levels(dataLong$level) <- gsub("Menu", "E.7 - Menu", levels(dataLong$level))
levels(dataLong$level) <- gsub("Storage", "F - Storage management", levels(dataLong$level))
levels(dataLong$level) <- gsub("Settings", "F.1 - Settings", levels(dataLong$level))
levels(dataLong$level) <- gsub("DB", "F.2 - Local database", levels(dataLong$level))
levels(dataLong$level) <- gsub("FileSystem", "F.3 - File system", levels(dataLong$level))
levels(dataLong$level) <- gsub("Sensing & communication", "G - Sensing & communication", levels(dataLong$level))
levels(dataLong$level) <- gsub("Networking", "G.1 - Network", levels(dataLong$level))
levels(dataLong$level) <- gsub("Audio", "G.2 - Audio", levels(dataLong$level))
levels(dataLong$level) <- gsub("^Image", "G.3 - Image", levels(dataLong$level))
levels(dataLong$level) <- gsub("Sensors", "G.4 - Sensor", levels(dataLong$level))
levels(dataLong$level) <- gsub("Camera", "G.5 - Camera", levels(dataLong$level))
levels(dataLong$level) <- gsub("Messaging", "G.6 - Messaging", levels(dataLong$level))
levels(dataLong$level) <- gsub("Call", "G.7 - Call", levels(dataLong$level))
levels(dataLong$level) <- gsub("Mic", "G.8 - Microphone", levels(dataLong$level))
levels(dataLong$level) <- gsub("API Management", "H - API management", levels(dataLong$level))
levels(dataLong$level) <- gsub("Library", "H.1 - Library", levels(dataLong$level))
levels(dataLong$level) <- gsub("API_Android", "H.2 - Android API", levels(dataLong$level))
levels(dataLong$level) <- gsub("API_REST", "H.3 - REST API", levels(dataLong$level))
levels(dataLong$level) <- gsub("Deprecation", "H.4 - Deprecation", levels(dataLong$level))
levels(dataLong$level) <- gsub("^Testing & debugging", "I - Testing & debugging", levels(dataLong$level))
levels(dataLong$level) <- gsub("^Test", "I.1 - Testing", levels(dataLong$level))
levels(dataLong$level) <- gsub("Logging", "I.2 - Logging", levels(dataLong$level))
levels(dataLong$level) <- gsub("Debug", "I.3 - Debugging", levels(dataLong$level))
# let's order the levels so that they will be ordered also in the heatmap
dataLong$level <- factor(dataLong$level, levels = sort(as.character(levels(dataLong$level)), decreasing = F))
# here we select and collect all snapshots which are outlier for each specific maintainability issue type
filteredMT <- snapshots[which(snapshots[["outliers.up.mt_density"]] == 1),]
filteredUS <- snapshots[which(snapshots[["outliers.up.us_density"]] == 1),]
filteredUI <- snapshots[which(snapshots[["outliers.up.ui_density"]] == 1),]
filteredMC <- snapshots[which(snapshots[["outliers.up.mc_density"]] == 1),]
filteredUC <- snapshots[which(snapshots[["outliers.up.uc_density"]] == 1),]
filteredDP <- snapshots[which(snapshots[["outliers.up.dp_density"]] == 1),]
# we rename the ID column of each filtered dataframe so that later we can join it with dataLong
names(filteredMT)[32] <- "snapshotId"
names(filteredUS)[32] <- "snapshotId"
names(filteredUI)[32] <- "snapshotId"
names(filteredMC)[32] <- "snapshotId"
names(filteredUC)[32] <- "snapshotId"
names(filteredDP)[32] <- "snapshotId"
# we keep only the column containing the snapshot IDs
filteredMT <- subset(filteredMT, select = c(snapshotId) )
filteredUS <- subset(filteredUS, select = c(snapshotId) )
filteredUI <- subset(filteredUI, select = c(snapshotId) )
filteredMC <- subset(filteredMC, select = c(snapshotId) )
filteredUC <- subset(filteredUC, select = c(snapshotId) )
filteredDP <- subset(filteredDP, select = c(snapshotId) )
# we join the filtered dataframes and the full one by snapshotId, in this way we are selecting all snapshots which are outliers, for each maintainability issue type
filteredDataLongMT <- inner_join(filteredMT, dataLong, by=c("snapshotId", "snapshotId"))
filteredDataLongUS <- inner_join(filteredUS, dataLong, by=c("snapshotId", "snapshotId"))
filteredDataLongUI <- inner_join(filteredUI, dataLong, by=c("snapshotId", "snapshotId"))
filteredDataLongMC <- inner_join(filteredMC, dataLong, by=c("snapshotId", "snapshotId"))
filteredDataLongUC <- inner_join(filteredUC, dataLong, by=c("snapshotId", "snapshotId"))
filteredDataLongDP <- inner_join(filteredDP, dataLong, by=c("snapshotId", "snapshotId"))
# we add a dedicated column containing the abbreviated name of the issue type
filteredDataLongMT$IssueType <- factor("MT",levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
filteredDataLongUS$IssueType <- factor("US",levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
filteredDataLongUI$IssueType <- factor("UI",levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
filteredDataLongMC$IssueType <- factor("MC",levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
filteredDataLongUC$IssueType <- factor("UC",levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
filteredDataLongDP$IssueType <- factor("DP",levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
# we bind all filtered dataset, so that we will be ready to build the final heatmap
labelledTrends <- rbind(filteredDataLongMT, filteredDataLongUS, filteredDataLongUI, filteredDataLongMC, filteredDataLongUC, filteredDataLongDP)
# Issue types transformed into upper case
labelledTrends$IssueType <- toupper(labelledTrends$IssueType)
# each occurrence counts 1
labelledTrends$value <- 1
# we remove all unused columns
labelledTrends <- subset(labelledTrends, select = c(IssueType, level, value))
# Let's reshape the data from wide to long format
labelledTrends.long <- melt(labelledTrends, id.vars = c("IssueType", "level"))
labelledTrends.long$value <- as.numeric(labelledTrends.long$value)
labelledTrends.long$IssueType <- factor(as.character(labelledTrends.long$IssueType),levels=rev(c("MT", "US", "UC", "UI", "MC", "DP")))
# custom sum function returns NA when all values in set are NA,
# in a set mixed with NAs, NAs are removed and remaining summed.
# Source: http://www.roymfrancis.com/a-guide-to-elegant-tiled-heatmaps-in-r/
naSum <- function(x)
{
if(all(is.na(x))) val <- sum(x,na.rm=F)
if(!all(is.na(x))) val <- sum(x,na.rm=T)
return(val)
}
# sums the occurrences for each (IssueType, level) combination
labelledTrends.long <- ddply(labelledTrends.long,c("IssueType","level"),value=round(naSum(value),0),summarise)
g <- ggplot(labelledTrends.long,aes(x=level,y=IssueType,fill=value)) +
#draw the tiles (the white border of line thickness 0.25 is set by the second geom_tile below)
geom_tile()+
#redrawing tiles to remove cross lines from legend
geom_tile(colour="white",size=0.25, show_guide=FALSE)+
#remove axis labels, add title
labs(x="",y="",title="")+
#remove extra space
scale_y_discrete(expand=c(0,0))+
scale_fill_gradientn(colours = c("#eef8ed", "#fee08b", "#fdae61", "#d5643f","#d53e4f")) +
#equal aspect ratio x and y axis
coord_fixed()+
#set base size for all font elements
theme_grey(base_size=10)+
#theme options
theme(
#remove legend title
legend.title=element_blank(),
#remove legend margin
legend.margin = grid::unit(0,"cm"),
#change legend text properties
legend.text=element_text(colour="black",size=8,face="plain"),
#change legend key height
legend.key.height=grid::unit(0.5,"cm"),
#set a slim legend
legend.key.width=grid::unit(0.2,"cm"),
#set x axis text size and colour and define what should be bold or plain text
axis.text.x=element_text(size=8,colour="black", angle = 45, hjust = 1, face = rev(c('plain', 'plain', 'plain', 'plain', 'plain',
'bold', 'plain', 'plain', 'plain', 'plain',
'bold', 'plain', 'plain', 'plain', 'plain',
'plain', 'plain', 'bold', 'plain', 'plain',
'plain', 'bold', 'plain', 'plain', 'plain',
'plain', 'plain', 'plain', 'plain', 'bold',
'plain', 'plain', 'plain', 'plain', 'plain',
'bold', 'plain', 'plain', 'plain', 'plain',
'plain', 'plain', 'plain', 'bold', 'plain',
'plain', 'plain', 'plain', 'plain', 'bold',
'plain', 'plain', 'plain', 'plain', 'plain',
'plain', 'plain', 'bold'))),
#set y axis text colour and adjust vertical justification
axis.text.y=element_text(size=8, vjust = 0.2, colour="black", face = rev(c('bold', 'plain', 'plain', 'plain', 'plain', 'plain'))),
#change axis ticks thickness
axis.ticks=element_line(size=0.4),
#change title font, size, colour and justification
plot.title=element_text(colour="black",hjust=0,size=10,face="plain"),
#remove plot background
plot.background=element_blank(),
#remove plot border
panel.border=element_blank(),
plot.margin = unit(c(0, 0, 0, 0), "cm")
)
# we save the plot into a file
fileName <- "./plots/commitsTiles.pdf"
ggsave(fileName, scale = 2.2, height = 3, width = 15, unit = "cm", device=pdf)
embed_fonts(fileName, outfile=fileName)
}
# run the heatmap construction defined above; it saves the tile plot
# (level on x, IssueType on y) to ./plots/commitsTiles.pdf
buildTiles()
# Summary statistics for each issue type in the following blocks.
# Every block applies the same recipe to one outliers.up.*_density column of
# `snapshots`:
#   1. cross-tabulate shortName against the outlier column (built once and
#      reused by every statistic of the block),
#   2. take the second column of that table, i.e. one count per shortName,
#   3. show summary(), sd() and the coefficient of variation (sd / mean),
#   4. draw a violin plot of the counts,
#   5. stack the complete table under TOTAL.
# NOTE(review): the second table column is presumably the "is an outlier"
# level; confirm against how the outliers.up.* columns are encoded. Indexing
# with [, 2] fails if a column holds a single distinct value.
# The statistics stay as top-level expressions so that R auto-prints them.

# MT
s <- subset(snapshots, select = c(ID, shortName, outliers.up.mt_density))
outlierTab <- table(s$shortName, s$outliers.up.mt_density)
outlierCounts <- outlierTab[, 2]
summary(outlierCounts)
sd(outlierCounts)
sd(outlierCounts) / mean(outlierCounts)
vioplot(outlierCounts)
TOTAL <- outlierTab

# US
s <- subset(snapshots, select = c(ID, shortName, outliers.up.us_density))
outlierTab <- table(s$shortName, s$outliers.up.us_density)
outlierCounts <- outlierTab[, 2]
summary(outlierCounts)
sd(outlierCounts)
sd(outlierCounts) / mean(outlierCounts)
vioplot(outlierCounts)
TOTAL <- rbind(TOTAL, outlierTab)

# UC
s <- subset(snapshots, select = c(ID, shortName, outliers.up.uc_density))
outlierTab <- table(s$shortName, s$outliers.up.uc_density)
outlierCounts <- outlierTab[, 2]
summary(outlierCounts)
sd(outlierCounts)
sd(outlierCounts) / mean(outlierCounts)
vioplot(outlierCounts)
TOTAL <- rbind(TOTAL, outlierTab)

# UI
s <- subset(snapshots, select = c(ID, shortName, outliers.up.ui_density))
outlierTab <- table(s$shortName, s$outliers.up.ui_density)
outlierCounts <- outlierTab[, 2]
summary(outlierCounts)
sd(outlierCounts)
sd(outlierCounts) / mean(outlierCounts)
vioplot(outlierCounts)
TOTAL <- rbind(TOTAL, outlierTab)

# MC
s <- subset(snapshots, select = c(ID, shortName, outliers.up.mc_density))
outlierTab <- table(s$shortName, s$outliers.up.mc_density)
outlierCounts <- outlierTab[, 2]
summary(outlierCounts)
sd(outlierCounts)
sd(outlierCounts) / mean(outlierCounts)
vioplot(outlierCounts)
TOTAL <- rbind(TOTAL, outlierTab)

# DP
s <- subset(snapshots, select = c(ID, shortName, outliers.up.dp_density))
outlierTab <- table(s$shortName, s$outliers.up.dp_density)
outlierCounts <- outlierTab[, 2]
summary(outlierCounts)
sd(outlierCounts)
sd(outlierCounts) / mean(outlierCounts)
vioplot(outlierCounts)
TOTAL <- rbind(TOTAL, outlierTab)