In [None]:
# This R environment comes with all of CRAN preinstalled, as well as many other helpful packages
# The environment is defined by the kaggle/rstats docker image: https://github.com/kaggle/docker-rstats
# For example, here are several helpful packages to load:

library(readr) # CSV file I/O, e.g. the read_csv function
library(tm)
library(RColorBrewer)
library(wordcloud) #Importing wordcloud package
# Input data files are available in the "../input/" directory.

In [None]:
# Load the debate transcript; strings stay character vectors (no factors).
debate <- read.csv(
  file = "../input/debate.csv",
  stringsAsFactors = FALSE
)
# Show the structure of the loaded data frame.
str(debate)

In [None]:
# Split the transcript by speaker. `%in%` yields FALSE for missing Speaker
# values, so such rows are left out of both subsets.
clinton <- debate[debate$Speaker %in% "Clinton", , drop = FALSE]
trump <- debate[debate$Speaker %in% "Trump", , drop = FALSE]

# Show the structure of each speaker's subset.
str(clinton)
str(trump)

In [None]:
# Drop columns 1, 2 and 4 ('Line', 'Speaker' and 'Date') from both subsets.
clinton <- clinton[, -c(1, 2, 4), drop = FALSE]
trump <- trump[, -c(1, 2, 4), drop = FALSE]

# Re-encode the text: convert to UTF-8, then let iconv() substitute any bytes
# it cannot convert (sub = "byte") instead of returning NA for the string.
clinton$Text <- iconv(
  enc2utf8(clinton$Text),
  sub = "byte"
)
trump$Text <- iconv(
  enc2utf8(trump$Text),
  sub = "byte"
)

# Show the structure of each cleaned-up subset.
str(clinton)
str(trump)

In [None]:
# Build one corpus per speaker; each element of Text becomes a document.
clinton_corpus <- VCorpus(x = VectorSource(x = clinton[["Text"]]))
trump_corpus <- VCorpus(x = VectorSource(x = trump[["Text"]]))

# Print a summary of each corpus followed by its first document.
print(clinton_corpus)
as.character(clinton_corpus[[1]])
print(trump_corpus)
as.character(trump_corpus[[1]])

In [None]:
# Normalise Clinton's corpus for word counting.
#
# Lower-casing must run BEFORE stop-word removal: stopwords() returns
# lower-case words and removeWords matches case-sensitively, so capitalised
# stop words such as "The" or "We" would otherwise survive the filter.
# Stop words are removed before punctuation so that contractions such as
# "don't" still match their entries in the stop-word list.
clinton_clean <- tm_map(clinton_corpus, content_transformer(tolower)) # Lower case
clinton_clean <- tm_map(clinton_clean, removeWords, stopwords("en")) # English stop words
clinton_clean <- tm_map(clinton_clean, removeNumbers) # Digits
clinton_clean <- tm_map(clinton_clean, removePunctuation) # Punctuation
clinton_clean <- tm_map(clinton_clean, stripWhitespace) # Collapse repeated whitespace
# Show the first cleaned document as a sanity check.
as.character(clinton_clean[[1]])

In [None]:
# Normalise Trump's corpus for word counting.
#
# Lower-casing must run BEFORE stop-word removal: stopwords() returns
# lower-case words and removeWords matches case-sensitively, so capitalised
# stop words such as "The" or "We" would otherwise survive the filter.
# Stop words are removed before punctuation so that contractions such as
# "don't" still match their entries in the stop-word list.
trump_clean <- tm_map(trump_corpus, content_transformer(tolower)) # Lower case
trump_clean <- tm_map(trump_clean, removeWords, stopwords("en")) # English stop words
trump_clean <- tm_map(trump_clean, removeNumbers) # Digits
trump_clean <- tm_map(trump_clean, removePunctuation) # Punctuation
trump_clean <- tm_map(trump_clean, stripWhitespace) # Collapse repeated whitespace
# Show the first cleaned document as a sanity check.
as.character(trump_clean[[1]])

In [None]:
# Draw one word cloud per speaker, limited to words occurring at least
# 20 times; random.order = FALSE plots words in decreasing frequency.
wordcloud(words = clinton_clean, min.freq = 20, random.order = FALSE)
wordcloud(words = trump_clean, min.freq = 20, random.order = FALSE)