# Classifying News Articles

Created by Wei Xin Tan

### Method: TF-IDF + Stacked LDA (Latent Dirichlet Allocation) + SVM

In [1]:
# Import libraries
library(e1071)
library(tm)
library(lda)
library(stm)
library(dplyr)

Loading required package: NLP
stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help. 
 Papers, resources, and other materials at structuraltopicmodel.com

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



### 0. Macro function

In [2]:
# Macro-averaged F1 score.
#
# predicted:  vector (or factor) of predicted class labels.
# true.label: vector (or factor) of reference class labels, same length.
#
# Returns the unweighted mean of the per-class F1 scores, taken over the
# classes that occur in true.label. A class that is never predicted, or never
# predicted correctly, contributes an F1 of 0 instead of NaN.
macro.f1 <- function(predicted, true.label) {
    stopifnot(length(predicted) == length(true.label))

    if (is.factor(predicted) || is.factor(true.label)) {
        # Compare factors by label: integer level codes are only comparable
        # when both factors happen to share exactly the same level set.
        predicted <- as.character(predicted)
        true.label <- as.character(true.label)
    } else {
        predicted <- as.integer(predicted)
        true.label <- as.integer(true.label)
    }

    classes <- unique(true.label)

    per.class.f1 <- vapply(classes, function(class) {
        actual.positive <- sum(true.label == class)
        total.predicted <- sum(predicted == class)
        true.positive <- sum(predicted == class & true.label == class)
        # 2PR / (P + R) simplifies to 2TP / (actual + predicted). The
        # denominator is at least 1 because `class` occurs in true.label,
        # so no 0/0 can arise.
        2 * true.positive / (actual.positive + total.predicted)
    }, numeric(1))

    mean(per.class.f1)
}

### 1. Loading the text and extracting each document

In [24]:
# Helpers shared by the training and testing readers.

# Drop blank (or whitespace-only) lines and the 'EOD' end-of-document markers.
drop.filler.lines <- function(lines) {
    lines[nzchar(trimws(lines)) & lines != 'EOD']
}

# Pair up the remaining lines: odd positions are document ids, even positions
# are the corresponding document text.
pair.id.text <- function(lines) {
    if (length(lines) %% 2 != 0) {
        stop("expected alternating id/text lines, got an odd number of lines",
             call. = FALSE)
    }
    is.id.line <- rep(c(TRUE, FALSE), length.out = length(lines))
    data.frame('doc_id' = lines[is.id.line],
               'text' = lines[!is.id.line],
               stringsAsFactors = FALSE)
}

# reading data
train.text <- drop.filler.lines(readLines('./training_docs.txt'))
train.docs <- pair.id.text(train.text)
train.label <- read.table('./training_labels_final.txt',stringsAsFactors=FALSE)
names(train.label) <- c('document','class')

# read test data
test.text <- drop.filler.lines(readLines('./testing_docs.txt'))
test.docs <- pair.id.text(test.text)
# Number of test documents; they occupy the first rows of the combined frame.
cut.point <- nrow(test.docs)

# Test documents first, then training documents, so both share one vocabulary.
docs <- rbind(test.docs,train.docs)

In [None]:
# Wrap the combined doc_id/text data frame in a tm corpus
docs <- Corpus(DataframeSource(docs))

# Words to discard: a few corpus-specific filler words plus the standard
# English stop-word list
noise.words <- c('text','will',"say", "said", 'can','saying','tell', stopwords("english"))

# Preprocessing, applied in this order:
docs <- tm_map(docs, removeNumbers)                  # strip digits
docs <- tm_map(docs, stripWhitespace)                # collapse runs of whitespace
docs <- tm_map(docs, content_transformer(tolower))   # lower-case everything
docs <- tm_map(docs, removeWords, noise.words)       # first stop-word pass
docs <- tm_map(docs, removePunctuation)              # strip punctuation
docs <- tm_map(docs, stemDocument)                   # reduce words to their stems
docs <- tm_map(docs, removeWords, noise.words)       # second pass, after stemming

# Raw term counts: input for LDA
lda.dtm <- DocumentTermMatrix(docs)

# Unnormalised TF-IDF weights: input for the SVM. Terms whose sparsity exceeds
# 0.98 are dropped.
tfidf.unnormalised <- function(x) weightTfIdf(x, normalize = FALSE)
svm.dtm <- DocumentTermMatrix(docs, control = list(weighting = tfidf.unnormalised))
svm.dtm <- removeSparseTerms(svm.dtm, 0.98)

In [None]:
# Print the raw-count document-term matrix built for LDA
lda.dtm

In [None]:
# Print the TF-IDF document-term matrix (after sparse-term removal) built for the SVM
svm.dtm

### 3. Topic modelling with LDA

In [None]:
# Further processing into the document format required by the lda package
# Rebuild a two-row document matrix as an integer matrix, with 1 subtracted
# from every entry of the first row and the second row left as is.
minusone <- function(item) {
    shifted.first.row <- item[1, ] - 1
    second.row <- item[2, ]
    values <- as.integer(c(shifted.first.row, second.row))
    return(matrix(values, nrow = 2, byrow = TRUE))
}

# Convert the count matrix into stm's documents/vocab list structure
slam.dtm <- readCorpus(lda.dtm, type = "slam")

# Sum every document's matrix; a total of 0 marks a document to discard
rowTotals <- lapply(slam.dtm$documents, sum)
non.empty <- rowTotals > 0

# Keep the non-empty documents and apply minusone() to each of them
slam.dtm$documents <- lapply(slam.dtm$documents[non.empty], minusone)

In [None]:
# Train LDA
# One model is fitted per topic count; the per-document topic proportions of
# all models are stacked side by side to form the LDA feature set.
topic.counts <- c(23, 55, 110, 160, 210, 260)

# Document ids, in the same order as the documents handed to the sampler
doc.names <- gsub('ID ','', names(slam.dtm$documents))
# Logical mask of test documents. Unlike a negative grep() index, this stays
# correct when nothing matches.
# NOTE(review): 'te+' matches "te" anywhere in the id; this assumes training
# ids never contain "te" -- confirm against the id format.
is.test <- grepl('te+', doc.names)

gamma.blocks <- vector('list', length(topic.counts))
for (i in seq_along(topic.counts)) {
    num.topics <- topic.counts[i]
    message('Training',num.topics)
    result <- lda.collapsed.gibbs.sampler(slam.dtm$documents,
                                          num.topics,  ## Num clusters
                                          slam.dtm$vocab,
                                          num.iterations = 100,  ## Num iterations
                                          alpha = 50/num.topics,
                                          eta = 0.1,
                                          compute.log.likelihood=FALSE,
                                          trace=1L)
    # Getting the Gammas/Posteriors: transpose to one row per document, then
    # divide each row by that document's total so the row sums to 1.
    gamma <- t(result$document_sums) / colSums(result$document_sums)
    gamma.blocks[[i]] <- gamma
}

# Bind all topic blocks at once instead of growing a data frame in the loop
stacked.gamma <- do.call(cbind, gamma.blocks)
gamma.all <- data.frame(document = doc.names, stacked.gamma,
                        stringsAsFactors = FALSE)

# rename the columns: 'document' followed by 1..(total number of topics)
colnames(gamma.all) <- c('document', seq_len(sum(topic.counts)))

lda.test.data <- gamma.all[is.test, ]
lda.train.data <- gamma.all[!is.test, ]

### 4. Merging the Data

In [None]:
# extract tfidf data for training svm
m <- as.matrix(svm.dtm)
df.m <- data.frame(document=rownames(m),m,row.names = NULL,stringsAsFactors=FALSE)
df.m$document <- gsub('ID ','', df.m$document)

# The corpus was built as rbind(test.docs, train.docs), so the first cut.point
# rows are the test documents. A logical mask (rather than 1:cut.point) keeps
# the split correct even when cut.point is 0.
is.test.row <- seq_len(nrow(df.m)) <= cut.point
svm.test.data <- df.m[is.test.row,]
svm.all.train.data <- df.m[!is.test.row,]

# Merging the posterior from lda and tf-idf. all=TRUE keeps documents present
# in only one of the two tables; their missing features become 0.
new.data <- merge(lda.train.data , svm.all.train.data , by="document",all=TRUE)
new.data[is.na(new.data)] <- 0

# Attach the class labels; documents without a label are dropped by this
# inner merge. The label is the last column of the merged frame.
new.data.w.label <- merge(new.data , train.label , by="document")
label.col <- ncol(new.data.w.label)
new.label <- new.data.w.label[,label.col]
# Features are everything except the leading 'document' column and the label
new.features <- new.data.w.label[,-c(1, label.col)]

### 5. Sampling and splitting data into train and validation datasets

In [None]:
# Sample portion of the data to train our classifier.
# Indices are drawn from the rows actually present in new.features (the label
# merge may have dropped documents), and the sample size is capped at the
# number of available rows so sampling cannot fail or produce NA rows.
# Note: this overwrites new.features/new.label, so re-running the cell
# re-samples from the already reduced data.
n.available <- nrow(new.features)
sam.idx <- sample.int(n.available, min(60000, n.available))
new.features <- new.features[sam.idx,]
new.label <- new.label[sam.idx]

In [None]:
# Randomly assign 75% of the sampled rows to training; the remainder is
# held out for validation.
n.sampled <- nrow(new.features)
smp_size <- floor(0.75 * n.sampled)
train_ind <- sample(seq_len(n.sampled), size = smp_size)

# Labels, converted to factors for classification
to.train.label <- as.factor(new.label[train_ind])
val.label <- as.factor(new.label[-train_ind])

# Matching feature rows
to.train.data <- new.features[train_ind, ]
val.data <- new.features[-train_ind, ]

### 6. Training SVM multi-class classifier (One vs One) 

In [None]:
# Train the model.
# `type` is the svm() argument that selects C-classification. The previous
# `method=` is not an svm() argument and was silently ignored; the result was
# the same only because svm() defaults to C-classification for a factor
# response.
svm_model <- svm(x=to.train.data,y=to.train.label,type="C-classification", kernel="sigmoid",cachesize = 400)

# Score the held-out validation rows with macro-averaged F1
pred <- predict(svm_model,val.data)
macro.f1(pred,val.label)
#system('say -r 180 hey jarvis, your model has finished training')

### 7. Prediction

In [None]:
# Build the test feature table: LDA posteriors joined with TF-IDF features on
# the document id, keeping documents found in either table
new.test.data <- merge(x = lda.test.data, y = svm.test.data, by = "document", all = TRUE)

# Features missing on one side of the join are filled with 0
missing.cells <- is.na(new.test.data)
new.test.data[missing.cells] <- 0

In [None]:
# Predict on every column except the leading 'document' id
test.pred <- predict(svm_model, new.test.data[-1])

# Numeric part of each test id, used to put the predictions in document order
doc.number <- as.numeric(gsub('te_doc_','', new.test.data$document))
pred.df <- data.frame('doc'=new.test.data$document,'pred'=test.pred,'idx'=doc.number)
row.order <- order(pred.df$idx)
pred.df <- pred.df[row.order,]

In [None]:
# Write one "<doc> <pred>" line per test document, with no header, no row
# names and no quoting
to.write <- pred.df[, c('doc','pred')]
write.table(
    to.write,
    file = "testing_labels_pred.txt",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
)