In [1]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import re
import pandas as pd
import numpy as np

In [2]:
newsgroups = fetch_20newsgroups()
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
newsgroups = fetch_20newsgroups(categories=categories, subset='all')
print(list(newsgroups.target_names))
print(len(newsgroups.data))

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
3387


In [4]:
from nltk.tokenize import RegexpTokenizer

tokens = []
tokenizer = RegexpTokenizer(r'[^_\W0-9]+')
tokens = [tokenizer.tokenize(token) for token in newsgroups.data]
pos_tagged_words = [nltk.pos_tag(token) for token in tokens]
print(pos_tagged_words[0:21])

[[('From', 'IN'), ('healta', 'JJ'), ('saturn', 'NN'), ('wwc', 'NN'), ('edu', 'NN'), ('Tammy', 'NNP'), ('R', 'NNP'), ('Healy', 'NNP'), ('Subject', 'NNP'), ('Re', 'NNP'), ('who', 'WP'), ('are', 'VBP'), ('we', 'PRP'), ('to', 'TO'), ('judge', 'VB'), ('Bobby', 'NNP'), ('Lines', 'NNPS'), ('Organization', 'NNP'), ('Walla', 'NNP'), ('Walla', 'NNP'), ('College', 'NNP'), ('Lines', 'NNP'), ('In', 'IN'), ('article', 'NN'), ('Apr', 'NNP'), ('ultb', 'JJ'), ('isc', 'NN'), ('rit', 'NN'), ('edu', 'NN'), ('snm', 'NN'), ('ultb', 'JJ'), ('isc', 'NN'), ('rit', 'NN'), ('edu', 'NN'), ('S', 'NNP'), ('N', 'NNP'), ('Mozumder', 'NNP'), ('writes', 'VBZ'), ('From', 'IN'), ('snm', 'NN'), ('ultb', 'JJ'), ('isc', 'NN'), ('rit', 'NN'), ('edu', 'NN'), ('S', 'NNP'), ('N', 'NNP'), ('Mozumder', 'NNP'), ('Subject', 'NNP'), ('Re', 'NNP'), ('who', 'WP'), ('are', 'VBP'), ('we', 'PRP'), ('to', 'TO'), ('judge', 'VB'), ('Bobby', 'NNP'), ('Date', 'NNP'), ('Wed', 'NNP'), ('Apr', 'NNP'), ('GMT', 'NNP'), ('In', 'IN'), ('article', 'N

In [5]:
from nltk.collocations import *
text = ''.join(newsgroups.data)
text = re.sub('[^\\sA-Za-z]+', '', text)
word_tokens = nltk.wordpunct_tokenize(text)
finder = BigramCollocationFinder.from_words(word_tokens)
bigram_measures = nltk.collocations.BigramAssocMeasures()

pmi_df = pd.DataFrame(list(finder.score_ngrams(bigram_measures.pmi)), columns=['bigram', 'pmi']).sort_values(by='pmi', ascending=False)
pmi_df.head(20)



Unnamed: 0,bigram,pmi
0,"(AAO, AngloAustralian)",19.916301
1453,"(charIMAGEHEIGHT, IMAGEWIDTHOriginalImagefp)",19.916301
1447,"(ccraignmtedu, Catherine)",19.916301
1448,"(censoring, overbearing)",19.916301
1449,"(cerebral, edema)",19.916301
1450,"(ceremonially, unclean)",19.916301
1451,"(chandrabpasbicom, Chandra)",19.916301
1452,"(chandrasbicom, jonsbicom)",19.916301
1454,"(chaveycswiscedu, Darrah)",19.916301
1088,"(SnapshoT, explorer)",19.916301


In [6]:
chi_sq_df = pd.DataFrame(list(finder.score_ngrams(bigram_measures.chi_sq)), columns=['bigram', 'chi-sq']).sort_values(by='chi-sq',ascending=False)
chi_sq_df.head(20)

Unnamed: 0,bigram,chi-sq
0,"(AAAA, BBBB)",989473.0
2097,"(coelomate, deuterostome)",989473.0
2075,"(charIMAGEHEIGHT, IMAGEWIDTHOriginalImagefp)",989473.0
2076,"(charles, boesel)",989473.0
2077,"(charsetiso, ContentTransferEncoding)",989473.0
2078,"(chaveycswiscedu, Darrah)",989473.0
2079,"(checksum, Unique)",989473.0
2080,"(chemscn, ppmblend)",989473.0
2081,"(chipset, HWfuncs)",989473.0
2082,"(chprismgatechEDU, claye)",989473.0


In [7]:
finder.apply_freq_filter(2)

raw_freq_with_filter = pd.DataFrame(list(finder.score_ngrams(bigram_measures.raw_freq)), columns=['bigram', 'filtered-freq']).sort_values(by='filtered-freq',ascending=False)
raw_freq_with_filter.head(20)


Unnamed: 0,bigram,filtered-freq
0,"(of, the)",0.005465
1,"(in, the)",0.003126
2,"(Subject, Re)",0.002459
3,"(In, article)",0.001982
4,"(to, the)",0.001876
5,"(to, be)",0.001722
6,"(on, the)",0.001672
7,"(is, a)",0.001525
8,"(for, the)",0.001306
9,"(it, is)",0.001293


In [8]:
finder.apply_freq_filter(2)

ttest_with_filter = pd.DataFrame(list(finder.score_ngrams(bigram_measures.student_t)), columns=['bigram','filtered-t-test']).sort_values(by='filtered-t-test', ascending=False)
ttest_with_filter.head(20)


Unnamed: 0,bigram,filtered-t-test
0,"(of, the)",59.688562
1,"(Subject, Re)",49.13522
2,"(in, the)",46.409497
3,"(In, article)",44.121113
4,"(to, be)",37.401619
5,"(on, the)",34.617501
6,"(it, is)",31.977423
7,"(Lines, In)",31.498479
8,"(is, a)",30.966998
9,"(I, have)",28.842104


In [None]:
pd.concat([pmi_df,chi_sq_df,ttest_with_filter,raw_freq_with_filter], axis=1).head(20)

Unnamed: 0,bigram,pmi,bigram.1,chi-sq,bigram.2,filtered-t-test,bigram.3,filtered-freq
0,"(AAO, AngloAustralian)",19.916301,"(AAAA, BBBB)",989473.0,"(of, the)",59.688562,"(of, the)",0.005465
1,"(ACAD, nffsff)",19.916301,"(AAH, EXAMINER)",989473.0,"(Subject, Re)",49.13522,"(in, the)",0.003126
2,"(ACCENT, Accent)",19.916301,"(AAO, AngloAustralian)",989473.0,"(in, the)",46.409497,"(Subject, Re)",0.002459
3,"(ACTIVIST, NEWSLETTER)",19.916301,"(ABORTED, ABORT)",989473.0,"(In, article)",44.121113,"(In, article)",0.001982
4,"(ACTIVITY, Cary)",19.916301,"(ABPSoft, mehl)",989473.0,"(to, be)",37.401619,"(to, the)",0.001876
5,"(AEA, GEGno)",19.916301,"(ACAD, nffsff)",989473.0,"(on, the)",34.617501,"(to, be)",0.001722
6,"(AEAyesBEBno, GEGyes)",19.916301,"(ACCENT, Accent)",989473.0,"(it, is)",31.977423,"(on, the)",0.001672
7,"(AFDDBEFgargravarrccutexasedu, XXXDate)",19.916301,"(ACTIVIST, NEWSLETTER)",989473.0,"(Lines, In)",31.498479,"(is, a)",0.001525
8,"(AGE, GLORY)",19.916301,"(ACTIVITY, Cary)",989473.0,"(is, a)",30.966998,"(for, the)",0.001306
9,"(AKIRA, KIMURA)",19.916301,"(ADMINISTRATION, PASADENA)",989473.0,"(I, have)",28.842104,"(it, is)",0.001293


In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words=set(stopwords.words("english"))
filtered_tokens=[]
for token in tokens:
    for w in token:
        if w not in stop_words:
            filtered_tokens.append(w)

ps = PorterStemmer()

stemmed_words=[]
for w in filtered_tokens:
    stemmed_words.append(ps.stem(w))
    
print(f"Stemmed tokens: {stemmed_words[0:200]}")

Stemmed tokens: ['from', 'healta', 'saturn', 'wwc', 'edu', 'tammi', 'R', 'heali', 'subject', 'Re', 'judg', 'bobbi', 'line', 'organ', 'walla', 'walla', 'colleg', 'line', 'In', 'articl', 'apr', 'ultb', 'isc', 'rit', 'edu', 'snm', 'ultb', 'isc', 'rit', 'edu', 'S', 'N', 'mozumd', 'write', 'from', 'snm', 'ultb', 'isc', 'rit', 'edu', 'S', 'N', 'mozumd', 'subject', 'Re', 'judg', 'bobbi', 'date', 'wed', 'apr', 'gmt', 'In', 'articl', 'healta', 'saturn', 'wwc', 'edu', 'healta', 'saturn', 'wwc', 'edu', 'tammi', 'R', 'heali', 'write', 'bobbi', 'I', 'would', 'like', 'take', 'liberti', 'quot', 'christian', 'writer', 'name', 'ellen', 'G', 'white', 'I', 'hope', 'said', 'help', 'edit', 'remark', 'group', 'futur', 'Do', 'set', 'standard', 'Do', 'make', 'opinion', 'view', 'duti', 'interpret', 'scriptur', 'criterion', 'other', 'heart', 'condemn', 'come', 'ideal', 'thought', 'fromth', 'mount', 'bless', 'p', 'I', 'hope', 'quot', 'make', 'atheist', 'gag', 'I', 'think', 'ellen', 'white', 'put', 'better', 'I',

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics

raw_data = np.array(newsgroups.data)
raw_labels = np.array(newsgroups.target)

raw_data = np.reshape(raw_data, (raw_data.shape[0],1))
raw_labels = np.reshape(raw_labels, (raw_labels.shape[0],1))

newsgroups_train, newsgroups_test, newsgroups_target_train, newsgroups_target_test = train_test_split(raw_data, raw_labels, test_size=0.3, random_state=42)


vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-zA-Z]+')
vectors = vectorizer.fit_transform(newsgroups_train.ravel())

vectors_test = vectorizer.transform(newsgroups_test.ravel())

svm_clf = SVC(gamma='auto', kernel='rbf')
svm_clf.fit(vectors, newsgroups_target_train.ravel())
pred = svm_clf.predict(vectors_test)
print(f"SVC results with rbf kernel:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")


nb_clf = MultinomialNB()
nb_clf.fit(vectors, newsgroups_target_train.ravel())
pred = nb_clf.predict(vectors_test)
print(f"Multinomial Naive Bayes Classifier results:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")



SVC results with rbf kernel:
 [[  0   0 224   0]
 [  0   0 297   0]
 [  0   0 307   0]
 [  0   0 189   0]]
Multinomial Naive Bayes Classifier results:
 [[221   0   2   1]
 [  1 293   3   0]
 [  0   7 300   0]
 [ 47   7  11 124]]


In [None]:
svm_clf = SVC(gamma='auto', kernel='linear')
svm_clf.fit(vectors, newsgroups_target_train.ravel())
pred = svm_clf.predict(vectors_test)
print(f"SVC results with linear kernel:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")

SVC results with linear kernel:
 [[213   0   2   9]
 [  0 297   0   0]
 [  2   9 295   1]
 [ 11   4   3 171]]


In [None]:
tokenizer = RegexpTokenizer(r'[^_\W0-9]+')
tokens_data = [tokenizer.tokenize(token) for token in newsgroups.data]
pos_tagged_words_data = [nltk.pos_tag(token) for token in tokens_data]

noun_data = []
nouns = ['NN','NNS','NNP', 'NNPS']

noun_word = []
for i in range(len(pos_tagged_words)):
    noun_word.append(noun_data)
    for j in range(len(pos_tagged_words[i])):
        for k in range(len(nouns)):
            if pos_tagged_words[i][j][1] in nouns[k]:
                noun_data.append(pos_tagged_words[i][j])
# len(vectorizer.vocabulary_)
# noun_data = list(set(noun_data))

# noun_target = []
# pos_tagged_words_target = pos_tagged_words_data[0:100]
# # for i in range(len(pos_tagged_words)):
# #     for j in range(len(pos_tagged_words[i])):
# #         for k in range(len(nouns)):
# #             if pos_tagged_words[i][j][1] in nouns[k]:
# #                 noun_words.append(pos_tagged_words[i][j])
# noun_target = [item[0] for pos_tagged_word_sublist in pos_tagged_words_target for item in pos_tagged_word_sublist for noun in nouns if item[1] in noun]
# # len(vectorizer.vocabulary_)
# noun_target = list(set(noun_target))




In [None]:
print(newsgroups.target.shape)
print(len(noun_word[0]))

In [None]:
# tokens_target = [tokenizer.tokenize(token) for token in newsgroups.target]
# # pos_tagged_words_target = [nltk.pos_tag(token) for token in tokens_target]

raw_data = np.array(noun_word)
raw_labels = np.array(newsgroups.target)

raw_data = np.reshape(raw_data, (raw_data.shape[0],1))
raw_labels = np.reshape(raw_labels, (raw_labels.shape[0],1))

print(raw_data.shape)
print(raw_labels.shape)

# newsgroups_noun_train, newsgroups_noun_test, newsgroups_target_noun_train, newsgroups_target_noun_test = train_test_split(raw_data, raw_labels, test_size=0.3, random_state=42)


# vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-zA-Z]+')
# vectors = vectorizer.fit_transform(newsgroups_noun_train.ravel())

# vectors_test = vectorizer.transform(newsgroups_noun_test.ravel())

# svm_clf = SVC(gamma='auto', kernel='rbf')
# svm_clf.fit(vectors, newsgroups_target_noun_train.ravel())
# pred = svm_clf.predict(vectors_test)
# print(f"SVC results with rbf kernel:\n {metrics.f1_score(newsgroups_target_noun_test.ravel(), pred, average='macro')}")


# nb_clf = MultinomialNB()
# nb_clf.fit(vectors, newsgroups_target_noun_train.ravel())
# pred = nb_clf.predict(vectors_test)
# print(f"Multinomial Naive Bayes Classifier results:\n {metrics.f1_score(newsgroups_target_noun_test.ravel(), pred, average='macro')}")
      