# Assignment - 2

## Author: Sudarshan Suresh Srikant - B00808452 - sudarshan.suresh@dal.ca

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.collocations import *
import nltk
import re
import pandas as pd
import numpy as np

In [2]:
nltk.download('averaged_perceptron_tagger')

#get english stopwords
en_stopwords = set(stopwords.words('english'))

#function to filter for VB/ADJ bigrams
def filterTypes(ngram):
    acceptable_types = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS')
    second_type = ( 'JJ', 'JJR', 'JJS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sudarshansuresh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
newsgroups = fetch_20newsgroups()
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
newsgroups = fetch_20newsgroups(categories=categories, subset='all')
print(list(newsgroups.target_names))
print(len(newsgroups.data))

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
3387


## 1. a. Tokenizing and performing POS tagging

In [5]:
tokens = []
tokenizer = RegexpTokenizer(r'[^_\W0-9]+') # Extract only alphabetical characters
tokens = [tokenizer.tokenize(token) for token in newsgroups.data]
pos_tagged_words = [nltk.pos_tag(token) for token in tokens] # Perform POS tagging on the tokens
print(pos_tagged_words[0])

[('From', 'IN'), ('healta', 'JJ'), ('saturn', 'NN'), ('wwc', 'NN'), ('edu', 'NN'), ('Tammy', 'NNP'), ('R', 'NNP'), ('Healy', 'NNP'), ('Subject', 'NNP'), ('Re', 'NNP'), ('who', 'WP'), ('are', 'VBP'), ('we', 'PRP'), ('to', 'TO'), ('judge', 'VB'), ('Bobby', 'NNP'), ('Lines', 'NNPS'), ('Organization', 'NNP'), ('Walla', 'NNP'), ('Walla', 'NNP'), ('College', 'NNP'), ('Lines', 'NNP'), ('In', 'IN'), ('article', 'NN'), ('Apr', 'NNP'), ('ultb', 'JJ'), ('isc', 'NN'), ('rit', 'NN'), ('edu', 'NN'), ('snm', 'NN'), ('ultb', 'JJ'), ('isc', 'NN'), ('rit', 'NN'), ('edu', 'NN'), ('S', 'NNP'), ('N', 'NNP'), ('Mozumder', 'NNP'), ('writes', 'VBZ'), ('From', 'IN'), ('snm', 'NN'), ('ultb', 'JJ'), ('isc', 'NN'), ('rit', 'NN'), ('edu', 'NN'), ('S', 'NNP'), ('N', 'NNP'), ('Mozumder', 'NNP'), ('Subject', 'NNP'), ('Re', 'NNP'), ('who', 'WP'), ('are', 'VBP'), ('we', 'PRP'), ('to', 'TO'), ('judge', 'VB'), ('Bobby', 'NNP'), ('Date', 'NNP'), ('Wed', 'NNP'), ('Apr', 'NNP'), ('GMT', 'NNP'), ('In', 'IN'), ('article', 'NN

## 1. b. Extract bigram collocations and apply techniques

In [6]:
# PMI measure to check the occurance of a bigram
text = ''.join(newsgroups.data)
text = re.sub('[^ A-Za-z]+', '', text)
word_tokens = nltk.wordpunct_tokenize(text)
finder = BigramCollocationFinder.from_words(word_tokens)
bigram_measures = nltk.collocations.BigramAssocMeasures()

pmi_df = pd.DataFrame(list(finder.score_ngrams(bigram_measures.pmi)), columns=['bigram', 'pmi']).sort_values(by='pmi', ascending=False)
pmi_df.head(20).reset_index(drop=True)


Unnamed: 0,bigram,pmi
0,"(AALines, RAPTURE)",19.802676
1,"(hideously, expensiveMary)",19.802676
2,"(hetverschuldigde, bedrag)",19.802676
3,"(hestalking, aboutCarl)",19.802676
4,"(hermanrsilbigerSubject, ANSIAIIM)",19.802676
5,"(herjohan, engevik)",19.802676
6,"(herethanksFrom, MORIARTYNDSUVMBITNETSubject)",19.802676
7,"(hereping, ICMP)",19.802676
8,"(herd, Thevictor)",19.802676
9,"(herbertRetro, AerospaceFrom)",19.802676


In [7]:
# Chi-square measure to check the occurance of a bigram
chi_sq_df = pd.DataFrame(list(finder.score_ngrams(bigram_measures.chi_sq)), columns=['bigram', 'chi-sq']).sort_values(by='chi-sq',ascending=False)
chi_sq_df.head(20).reset_index(drop=True)


Unnamed: 0,bigram,chi-sq
0,"(AAAA, BBBB)",914533.0
1,"(havewith, weaponsFYI)",914533.0
2,"(helmet, cams)",914533.0
3,"(helixmedunceduReplyTo, cptullymedunceduOrgani...",914533.0
4,"(hejust, rattles)",914533.0
5,"(heikomathfuberlindemaggiaethzch, pubinetray)",914533.0
6,"(hedendaagsetechnologie, stelt)",914533.0
7,"(heatis, reradiated)",914533.0
8,"(hearken, onlyof)",914533.0
9,"(heardthe, gasp)",914533.0


In [8]:
# Frequency with filter

raw_freq_with_filter = pd.DataFrame(list(finder.score_ngrams(bigram_measures.raw_freq)), columns=['bigram', 'filtered-freq']).sort_values(by='filtered-freq',ascending=False)
filtered_raw_freq_bi = raw_freq_with_filter[raw_freq_with_filter.bigram.map(lambda x: filterTypes(x))]
filtered_raw_freq_bi.head(20).reset_index(drop=True)

Unnamed: 0,bigram,filtered-freq
0,"(be, able)",0.000187
1,"(is, available)",0.000174
2,"(is, true)",9.5e-05
3,"(are, available)",7.7e-05
4,"(is, wrong)",6.2e-05
5,"(be, more)",6e-05
6,"(are, many)",5.8e-05
7,"(is, possible)",5.6e-05
8,"(be, wrong)",5.2e-05
9,"(is, good)",5e-05


In [9]:
# T-test with filter

ttest_with_filter = pd.DataFrame(list(finder.score_ngrams(bigram_measures.student_t)), columns=['bigram','filtered-t-test']).sort_values(by='filtered-t-test', ascending=False)

filtered_ttest_bi = ttest_with_filter[ttest_with_filter.bigram.map(lambda x: filterTypes(x))]
filtered_ttest_bi.head(20).reset_index(drop=True)


Unnamed: 0,bigram,filtered-t-test
0,"(be, able)",12.934417
1,"(is, available)",11.842059
2,"(is, true)",8.516765
3,"(are, available)",7.917714
4,"(is, wrong)",6.783755
5,"(be, wrong)",6.579869
6,"(are, many)",6.519642
7,"(is, possible)",6.347374
8,"(be, nice)",6.180663
9,"(be, true)",5.993936


## 1. c. Top 20 results from each bigram measures

According to the results above, we see that there is a good amount of overlap among the measures that are filtered. It makes sense to take the union of the filtered measures because the filtering is done based on which word types form a bigram.

I have selected a filter function called filterTypes(x) which filters the bigrams to contain pairs of adjectives and verbs only. This way the top 20 combinations of (verb, adjective) pairs can be selected.

## 2. a. Text cleaning

The text cleaning process has been carried out by removing non-alphabetic characters, stop words and by stemming the final result. The removal of non-alphabetic characters has been done in Cell #5 by providing the regular expression to the RegExpTokenizer.

In [10]:
stop_words=set(stopwords.words("english"))
filtered_tokens=[]
for token in tokens:
    for w in token:
        if w not in stop_words:
            filtered_tokens.append(w)

ps = PorterStemmer()

stemmed_words=[]
for w in filtered_tokens:
    stemmed_words.append(ps.stem(w))
    
print(f"Stemmed tokens: {stemmed_words[0:200]}")

Stemmed tokens: ['from', 'healta', 'saturn', 'wwc', 'edu', 'tammi', 'R', 'heali', 'subject', 'Re', 'judg', 'bobbi', 'line', 'organ', 'walla', 'walla', 'colleg', 'line', 'In', 'articl', 'apr', 'ultb', 'isc', 'rit', 'edu', 'snm', 'ultb', 'isc', 'rit', 'edu', 'S', 'N', 'mozumd', 'write', 'from', 'snm', 'ultb', 'isc', 'rit', 'edu', 'S', 'N', 'mozumd', 'subject', 'Re', 'judg', 'bobbi', 'date', 'wed', 'apr', 'gmt', 'In', 'articl', 'healta', 'saturn', 'wwc', 'edu', 'healta', 'saturn', 'wwc', 'edu', 'tammi', 'R', 'heali', 'write', 'bobbi', 'I', 'would', 'like', 'take', 'liberti', 'quot', 'christian', 'writer', 'name', 'ellen', 'G', 'white', 'I', 'hope', 'said', 'help', 'edit', 'remark', 'group', 'futur', 'Do', 'set', 'standard', 'Do', 'make', 'opinion', 'view', 'duti', 'interpret', 'scriptur', 'criterion', 'other', 'heart', 'condemn', 'come', 'ideal', 'thought', 'fromth', 'mount', 'bless', 'p', 'I', 'hope', 'quot', 'make', 'atheist', 'gag', 'I', 'think', 'ellen', 'white', 'put', 'better', 'I',

## 2. b. and 2. c. SVM and Multinomial Naive Bayes

The corpus has been converted into a bag-of-words tf-idf weighted vector representation using the TfidfVectorizer(). In order to remove the stop words and non-alhpabetic characters from the data, TfidfVectorizer takes stop_words and token_pattern as parameters to simplify the cleaning process. After the vectorization process has been complete, the vectors are trained onto two ML models as shown below:

In [11]:
data = np.array(newsgroups.data)
labels = np.array(newsgroups.target)

data = np.reshape(data, (data.shape[0],1))
labels = np.reshape(labels, (labels.shape[0],1))

newsgroups_train, newsgroups_test, newsgroups_target_train, newsgroups_target_test = train_test_split(data, labels, test_size=0.3, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-zA-Z]+')
vectors = vectorizer.fit_transform(newsgroups_train.ravel())

vectors_test = vectorizer.transform(newsgroups_test.ravel())

svm_clf = SVC(gamma='auto', kernel='rbf')
svm_clf.fit(vectors, newsgroups_target_train.ravel())
pred = svm_clf.predict(vectors_test)
print(f"SVC results with rbf kernel:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")


nb_clf = MultinomialNB()
nb_clf.fit(vectors, newsgroups_target_train.ravel())
pred = nb_clf.predict(vectors_test)
print(f"Multinomial Naive Bayes Classifier results:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")

print(f"\nTotal Vocabulary length: {len(vectorizer.vocabulary_)}")

SVC results with rbf kernel:
 [[  0   0 224   0]
 [  0   0 297   0]
 [  0   0 307   0]
 [  0   0 189   0]]
Multinomial Naive Bayes Classifier results:
 [[221   0   2   1]
 [  1 293   3   0]
 [  0   7 300   0]
 [ 47   7  11 124]]

Total Vocabulary length: 31500


In [12]:
svm_clf = SVC(gamma='auto', kernel='linear')
svm_clf.fit(vectors, newsgroups_target_train.ravel())
pred = svm_clf.predict(vectors_test)
print(f"SVC results with linear kernel:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")

SVC results with linear kernel:
 [[213   0   2   9]
 [  0 297   0   0]
 [  2   9 295   1]
 [ 11   4   3 171]]


## Accuracy Results:

From the above cells, we can see that the accuracy of Multinomial Naive Bayes is much higher than that of SVM. The reason for this being that Naive Bayes is based on a probablistic model and works well when combined with TfidfVectorizer in calculating the probablities of occurance of each word. When vectorizied with TfidfVectorizer, the importance of every word is obtained along with the frequency of occurance. Naive Bayes then calculates the probability of occurance of these words by treating every word in the sentence to be independent from one another. This way a high accuracy with Naive Bayes was obtained [1].

Initially when the model was trained on SVM with an 'rbf' kernel, the accuracy of the model seemed to be very less. From the confusion matrix with SVM(kernel='rbf), the classifier was mistaken in prediciting the right word class for every word. However, when the same SVM model had linear kernel, the accuracy of the model was almost the same as that of the multinomial naive bayes model. The reason for this is because of the 'kernel trick' [2]. The RBF kernel divides the data using a non-linear hyperplane in an infinite dimensional space, whereas a linear kernel will divide the data based on a linear hyperplane. Since the dataset is quite large, a linear kernel would be effective in classifiying the words, hence the reason for higher accuracy with a linear kernel.

In [13]:
tokenizer = RegexpTokenizer(r'[^_\W0-9]+')
tokens_data = [tokenizer.tokenize(token) for token in newsgroups.data]
pos_tagged_words_data = [nltk.pos_tag(token) for token in tokens_data]


In [14]:
noun_data = []
nouns = ['NN','NNS','NNP', 'NNPS']

noun_words = []

for i in range(len(pos_tagged_words_data)):
    noun_data = []
    for j in range(len(pos_tagged_words_data[i])):
        if (pos_tagged_words_data[i][j][1] in nouns):
            noun_data.append(pos_tagged_words_data[i][j][0])
    noun_words.append(noun_data)
    
noun_words = [" ".join(record) for record in noun_words] 

In [15]:
newsgroups_noun_train, newsgroups_noun_test, newsgroups_target_noun_train, newsgroups_target_noun_test = train_test_split(noun_words, newsgroups.target, test_size=0.3, random_state=42)


vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-zA-Z]+')
vectors = vectorizer.fit_transform(newsgroups_noun_train)

vectors_test = vectorizer.transform(newsgroups_noun_test)

svm_clf = SVC(gamma='auto', kernel='rbf')
svm_clf.fit(vectors, newsgroups_target_noun_train)
pred = svm_clf.predict(vectors_test)
print(f"SVC results with rbf kernel:\n {metrics.confusion_matrix(newsgroups_target_noun_test, pred)}")


nb_clf = MultinomialNB()
nb_clf.fit(vectors, newsgroups_target_noun_train)
pred = nb_clf.predict(vectors_test)
print(f"Multinomial Naive Bayes Classifier results:\n {metrics.confusion_matrix(newsgroups_target_noun_test, pred)}")

print(f"\nVocabulary length of nouns: {len(vectorizer.vocabulary_)}")

      

SVC results with rbf kernel:
 [[  0   0 224   0]
 [  0   0 297   0]
 [  0   0 307   0]
 [  0   0 189   0]]
Multinomial Naive Bayes Classifier results:
 [[219   0   2   3]
 [  1 295   1   0]
 [  1   5 301   0]
 [ 42   5  14 128]]

Vocabulary length of nouns: 23779


In [16]:
svm_clf = SVC(gamma='auto', kernel='linear')
svm_clf.fit(vectors, newsgroups_target_noun_train)
pred = svm_clf.predict(vectors_test)
print(f"SVC results with linear kernel:\n {metrics.confusion_matrix(newsgroups_target_noun_test, pred)}")
      

SVC results with linear kernel:
 [[212   0   2  10]
 [  0 296   1   0]
 [  0   8 297   2]
 [ 12   2   2 173]]


## 2. d. Accuracy Results for bag-of-words tf-idf weighted vector representation using only nouns

As seen above the accuracies reported for tf-idf weighted noun vectors are almost the same as that of the entire corpus. However, the size of the vocabularies are different: Noun vocabulary size: 23779, Total vocabulary size: 31500. 

## References:

[1]"Applying Multinomial Naive Bayes to NLP Problems: A Practical Explanation", Medium, 2019. [Online]. Available: https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf. [Accessed: 16- Jul- 2019].

[2]H. SVM, "How to understand effect of RBF SVM", Cross Validated, 2019. [Online]. Available: https://stats.stackexchange.com/questions/58585/how-to-understand-effect-of-rbf-svm. [Accessed: 16- Jul- 2019].

[3]A. Syntax, L. Ryan, N. Kronenfeld and G. Maynard, "Advanced Nested List Comprehension Syntax", Stack Overflow, 2019. [Online]. Available: https://stackoverflow.com/questions/3766711/advanced-nested-list-comprehension-syntax. [Accessed: 16- Jul- 2019].
