<a href="https://colab.research.google.com/github/stephenfrein/csc8491/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# caret is slow to install, so start it early (it is loaded later for
# confusionMatrix()); skip the install when the package is already available
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}

In [None]:
# the text mining (tm) package is used to build a corpus (body of docs);
# install it only when it is missing, then attach it
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)

In [None]:
# read the sms data into the sms_raw data frame; stringsAsFactors = FALSE
# keeps the message text as character no matter which default the running
# R version uses (the label column is converted to a factor explicitly later)
sms_raw <- read.csv(
  "https://csc8491.s3.amazonaws.com/sms_spam.csv",
  stringsAsFactors = FALSE
)
# examine the structure of the sms data
str(sms_raw)

In [None]:
# the type column carries the spam/ham label; store it as a factor
sms_raw[["type"]] <- factor(sms_raw[["type"]])
# look at the factor's structure, then count the messages in each class
str(sms_raw$type)
table(sms_raw$type)

In [None]:
# build a VCorpus (volatile, i.e. in-memory, corpus) from the message text
sms_corpus <- VCorpus(x = VectorSource(x = sms_raw$text))
# print the corpus, then inspect its first two documents
print(sms_corpus)
inspect(sms_corpus[1:2])
# text content of the first document
as.character(sms_corpus[[1]])
# lapply() calls as.character() on each selected document and returns a list
lapply(X = sms_corpus[1:2], FUN = as.character)




In [None]:
# corpus clean-up is done with tm_map(), one transformation at a time
# first step: lowercase every document (tolower is wrapped in
# content_transformer() so tm_map() can apply it to the documents)
sms_corpus_clean <- tm_map(
  sms_corpus,
  content_transformer(tolower)
)
# compare the first message before and after lowercasing
as.character(sms_corpus[[1]])
as.character(sms_corpus_clean[[1]])

In [None]:
# strip digits
sms_corpus_clean <- tm_map(sms_corpus_clean, FUN = removeNumbers)
# strip punctuation
sms_corpus_clean <- tm_map(sms_corpus_clean, FUN = removePunctuation)
# drop stop words, using the list returned by stopwords() with no arguments
# NOTE(review): punctuation is removed before stop words, so contractions
# (e.g. "don't" -> "dont") presumably no longer match the stop-word list;
# if this order is changed, change the new-message pipeline the same way
sms_corpus_clean <- tm_map(sms_corpus_clean, FUN = removeWords, stopwords())


In [None]:
# illustration of word stemming
# install SnowballC only when it is missing, then attach it
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC")
}
library(SnowballC)
# stem four inflected forms of the same word
wordStem(c("learn", "learned", "learning", "learns"))

In [None]:
# apply stemming to every document of the real corpus
sms_corpus_clean <- tm_map(sms_corpus_clean, FUN = stemDocument)
# collapse the extra whitespace left behind by the earlier removals
sms_corpus_clean <- tm_map(sms_corpus_clean, FUN = stripWhitespace)
# first three messages: original text, then the fully cleaned version
lapply(X = sms_corpus[1:3], FUN = as.character)
lapply(X = sms_corpus_clean[1:3], FUN = as.character)


In [None]:
# create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
# see the first rows of the result; the sparse matrix is subset BEFORE it is
# converted with as.matrix(), so only those few rows are ever densified
# instead of the whole documents x terms matrix
as.matrix(sms_dtm[seq_len(min(6L, nrow(sms_dtm))), ])

In [None]:
# display the document-term matrix with tm's inspect()
inspect(sms_dtm)

In [None]:
# split the document-term matrix into training and test parts
train_pct <- 0.8
# fixed seed so the random split is reproducible
set.seed(123)
# row indices of the training documents; all remaining rows form the test set
train <- sample(seq_len(nrow(sms_dtm)), size = train_pct * nrow(sms_dtm))
sms_dtm_train <- sms_dtm[train, ]
sms_dtm_test <- sms_dtm[-train, ]
# number of documents in each part
nrow(sms_dtm_train)
nrow(sms_dtm_test)

In [None]:
# keep the class labels that go with the training and test rows
sms_train_labels <- sms_raw$type[train]
sms_test_labels <- sms_raw$type[-train]
# the share of spam vs ham should be similar in both parts
prop.table(table(sms_train_labels))
prop.table(table(sms_test_labels))

In [None]:
# split the messages into spam and ham groups
# (rows are taken from all of sms_raw, not only from the training split)
spam <- sms_raw[which(sms_raw$type == "spam"), , drop = FALSE]
ham <- sms_raw[which(sms_raw$type == "ham"), , drop = FALSE]

In [None]:
# word cloud visualization
# install wordcloud only when it is missing, then attach it
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)

In [None]:
# word cloud of the spam messages, limited to at most 20 words
wordcloud(
  VCorpus(VectorSource(spam$text)),
  max.words = 20,
  scale = c(10, 0.5)
)

In [None]:
# word cloud of the ham messages, limited to at most 20 words
wordcloud(
  VCorpus(VectorSource(ham$text)),
  max.words = 20,
  scale = c(10, 0.5)
)

In [None]:
# indicator features for frequent words
# collect the terms found by findFreqTerms() with a lower bound of 5 in the
# training DTM into a character vector
sms_freq_words <- findFreqTerms(sms_dtm_train, lowfreq = 5)
# show the terms, then a compact summary of the vector
sms_freq_words
str(sms_freq_words)

In [None]:
# keep only the frequent-term columns in both the training and test DTMs
sms_dtm_freq_train <- sms_dtm_train[, sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[, sms_freq_words]
# look at the reduced training DTM
inspect(sms_dtm_freq_train)

In [None]:
# show the cleaned text of a single document
# NOTE(review): index 2046 is hard-coded; presumably picked from the
# inspect() output of the previous cell — confirm it is still meaningful
as.character(sms_corpus_clean[[2046]])

In [None]:
# Map term counts to presence indicators: how often a term occurs is not
# important, only whether it occurs at all.
#   x: numeric counts (vector or matrix)
#   returns "Yes" where the count is above zero and "No" otherwise
convert_counts <- function(x) {
  term_present <- x > 0
  ifelse(term_present, "Yes", "No")
}
# run convert_counts() over every column (MARGIN = 2) of the training and
# test DTMs to turn counts into "Yes"/"No" indicators
sms_train <- apply(X = sms_dtm_freq_train, MARGIN = 2, FUN = convert_counts)
sms_test <- apply(X = sms_dtm_freq_test, MARGIN = 2, FUN = convert_counts)
# first rows of the converted training data
head(sms_train)

In [None]:
# install the naive bayes package (e1071) only when it is missing
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}

In [None]:
# fit the model ----
library(caret)
library(e1071)
# naive Bayes classifier: indicator features as x, spam/ham labels as y
sms_classifier <- naiveBayes(x = sms_train, y = sms_train_labels)
# evaluate on the held-out messages ----
sms_test_pred <- predict(sms_classifier, newdata = sms_test)
# confusion matrix with "spam" as the positive class, reported in
# precision/recall mode
confusionMatrix(
  data = sms_test_pred,
  reference = sms_test_labels,
  positive = "spam",
  mode = "prec_recall"
)

In [None]:
# predict with new messages
new_sms1 <- "what time is the movie? 7:30?"
new_sms2 <- "Free vacation if you text back"
# put both into the same character vector (c() rather than rbind(), which
# would build a 2 x 1 character matrix instead of a vector)
new_sms <- c(new_sms1, new_sms2)
new_corpus <- VCorpus(VectorSource(new_sms))
# apply the same cleaning steps, in the same order, as for the training corpus
new_corpus <- tm_map(new_corpus, content_transformer(tolower))
new_corpus <- tm_map(new_corpus, removeNumbers)
new_corpus <- tm_map(new_corpus, removePunctuation)
new_corpus <- tm_map(new_corpus, removeWords, stopwords())
# stem as well: the training corpus was stemmed, so unstemmed words here
# would not line up with the terms the classifier was trained on
new_corpus <- tm_map(new_corpus, stemDocument)
new_corpus <- tm_map(new_corpus, stripWhitespace)
# cleaned text of the two new messages
as.character(new_corpus[[1]])
as.character(new_corpus[[2]])

In [None]:
# new DTM for these messages, restricted to the frequent training terms
# (sms_freq_words) so its columns are the same features the train/test
# matrices use; a term that does not occur in a message gets a count of 0
new_dtm <- DocumentTermMatrix(
  new_corpus,
  control = list(dictionary = sms_freq_words)
)
new_dtm
inspect(new_dtm)

In [None]:
# turn the term counts of the new messages into "Yes"/"No" indicators,
# column by column, exactly as was done for the training and test data
# (despite its name, new_dtm_df comes from apply(), not from a data frame)
new_dtm_df <- apply(X = new_dtm, MARGIN = 2, FUN = convert_counts)
# predicted class for each of the two new messages
new_sms_pred <- predict(sms_classifier, newdata = new_dtm_df)
new_sms_pred[1]
new_sms_pred[2]

In [None]:
# same prediction, but type = "raw" asks for per-class probabilities
# rather than class labels
new_sms_pred <- predict(sms_classifier, newdata = new_dtm_df, type = "raw")
new_sms_pred
