-
Notifications
You must be signed in to change notification settings - Fork 0
/
Visualising_topics_turnbull_160627.R
138 lines (122 loc) · 5.87 KB
/
Visualising_topics_turnbull_160627.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#####################################################################
# Title: Visualising topics in the Turnbull speech @ Liberal rally 26/06/16
# and Shorten's speech at the Labor launch 19/06/16
# Author: Steph de Silva.
# Email: steph@rex-analytics.com
# Date created: 27/06/16
# Date last altered: 27/06/16
# Attributions and acknowledgment of derivation:
# This script is due to code, information, tips and advice given in:
# (1) ropenscilabs/gutenbergr from https://github.com/ropenscilabs/gutenbergr
# Accessed: 20/05/16
# (2) http://www.rdatamining.com/examples/text-mining
# Accessed: 21/05/16
# (3) this program was particularly important and much of the code is derived from here
# http://www.rdatamining.com/examples/social-network-analysis accessed 21/05/16
# Along with helpful code fixes and tweaks from:
# (1) http://stackoverflow.com/questions/25069798/r-tm-in-mclapplycontentx-fun-all-scheduled-cores-encountered-errors
# (2) http://www.inside-r.org/packages/cran/tm/docs/as.TermDocumentMatrix
# (3) https://stat.ethz.ch/pipermail/r-help/2012-May/313013.html
# (4) http://stackoverflow.com/questions/29358571/termdocumentmatrix-raises-error-in-r
# All accessed 21/05/16
# Also:
# code help: http://www.inside-r.org/packages/cran/tm/docs/as.TermDocumentMatrix
# https://stat.ethz.ch/pipermail/r-help/2012-May/313013.html
# Size of nodes look here: http://www.shizukalab.com/toolkits/sna/plotting-networks-pt-2
# useful information: http://www.r-bloggers.com/going-viral-with-rs-igraph-package/
# colours in R: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
# Purpose: This script analyses the transcript of the speech given by M. Turnbull
# at the liberal party campaign launch/rally on 26/06/16
# and the shorten speech at the labor campaign launch 19/06/16
#########################################################################
# Data Used: Transcript of Turnbull's speech from his website.
# Source: http://www.malcolmturnbull.com.au/media/prime-ministers-address-to-the-2016-federal-campaign-rally
# Specifically:
# Translation by:
# Date Accessed: 27/06/16
# Gutenberg Number: NA
#########################################################################
# Data Used: Transcript of Shorten's speech from Australian politics website.
# Source: http://australianpolitics.com/2016/06/19/shorten-alp-campaign-launch.html
# Specifically:
# Translation by:
# Date Accessed: 27/06/16
# Gutenberg Number: NA
#########################################################################
# Script Outline:
# 1. Load Libraries, load data, clean data
# 2. Term document matrix
# 3. Create Social Network Analysis
#########################################################################
# 1. Load libraries, load data
#########################################################################
rm(list=ls(all=TRUE))
library(dplyr)
library(tidytext)
library(tm)
library(NLP)
library(SnowballC)
library(RWeka)
library(rJava)
library(RWekajars)
library(igraph)
cname <- file.path("~", "Desktop", "texts")
dir(cname)
transcript <- Corpus(DirSource(cname))
savetoauspol<-"~/Documents/Rex Analytics/Blog/auspol/network_campaign_launches_160627"
#########################################################################
# 2. Term Document Matrix
#########################################################################
myCorpus <- transcript
myCorpus <- tm_map(myCorpus,
content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')),
mc.cores=1)
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myStopwords <- c(stopwords('english'))
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus<-tm_map(myCorpus, stripWhitespace)
myCorpus <- tm_map(myCorpus, stemDocument,lazy=TRUE)
myCorpus<- tm_map(myCorpus, PlainTextDocument)
# Lenore taylor made a great argument for things that matter in this election, I've used that as a basis. https://www.theguardian.com/australia-news/2016/jun/17/debate-forces-leaders-off-script-but-small-targets-still-rule-the-campaign
campaign_dictionary<-c("jobs", "growth","housing", "childcare", "superannuation", "health", "education", "borders", "immigration", "tax", "medicare", "climate change", "marriage equality", "offshore processing", "environment", "boats", "asylum", "business", "bulk billing")
ctrl<-list(minWordLength=1, dictionary=campaign_dictionary)
mydtm<-TermDocumentMatrix(myCorpus,control=ctrl)
termDocMatrix <- is.matrix(mydtm)
termDocMatrix <- as.matrix(mydtm)
dim(termDocMatrix)
# change it to a Boolean matrix
termDocMatrix[termDocMatrix>=1] <- 1
# transform into a term-term adjacency matrix
termMatrix <- termDocMatrix %*% t(termDocMatrix)
#########################################################################
# 3.network analysis
#########################################################################
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
set.seed(1234)
vertex.label=c(vertex.label.dist=1,
vertex.label.color="black")
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight)+.4)
label.size <- 1
V(g)$label.cex <- log(scale(degree(g)) + max(abs(scale(degree(g))))+ label.size)
# Label handling see here: https://trinkerrstuff.wordpress.com/2012/06/30/igraph-and-sna-an-amateurs-dabbling/
layout1 <- layout.kamada.kawai(g)
V(g)$color <- "lightblue"
opar <- par()$mar; par(mar=rep(0, 4)) # same link as above.
plot(g, layout=layout1,
vertex.color="lightblue",
vertex.frame.color= "grey",
vertex.label.color = "black",
vertex.label.family = "sans",
edge.width=1,
edge.color="grey")
# Other options you can try:
# layout1 <- layout.fruchterman.reingold(g)
# layout1 <- layout.random(g)
# layout1 <- layout.kamada.kawai(g)