In [52]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import re
import pandas as pd
import numpy as np

In [2]:
newsgroups = fetch_20newsgroups()
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [26]:
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
newsgroups = fetch_20newsgroups(categories=categories, subset='all')
print(list(newsgroups.target_names))
print(len(newsgroups.data))

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
3387


In [4]:
from nltk.tokenize import RegexpTokenizer

tokens = []
tokenizer = RegexpTokenizer(r'[^_\W0-9]+')
tokens = [tokenizer.tokenize(token) for token in newsgroups.data]
pos_tagged_words = [nltk.pos_tag(token) for token in tokens]
print(pos_tagged_words[0:21])

[[('From', 'IN'), ('rych', 'JJ'), ('festival', 'NN'), ('ed', 'NN'), ('ac', 'NN'), ('uk', 'JJ'), ('R', 'NNP'), ('Hawkes', 'NNP'), ('Subject', 'NNP'), ('DS', 'NNP'), ('Where', 'NNP'), ('did', 'VBD'), ('all', 'PDT'), ('the', 'DT'), ('texture', 'NN'), ('rules', 'NNS'), ('go', 'VBP'), ('Lines', 'NNPS'), ('Hi', 'NNP'), ('I', 'PRP'), ('ve', 'VBP'), ('noticed', 'VBN'), ('that', 'IN'), ('if', 'IN'), ('you', 'PRP'), ('only', 'RB'), ('save', 'VB'), ('a', 'DT'), ('model', 'NN'), ('with', 'IN'), ('all', 'DT'), ('your', 'PRP$'), ('mapping', 'NN'), ('planes', 'NNS'), ('positioned', 'VBD'), ('carefully', 'RB'), ('to', 'TO'), ('a', 'DT'), ('DS', 'NNP'), ('file', 'NN'), ('that', 'WDT'), ('when', 'WRB'), ('you', 'PRP'), ('reload', 'VBP'), ('it', 'PRP'), ('after', 'IN'), ('restarting', 'VBG'), ('DS', 'NNP'), ('they', 'PRP'), ('are', 'VBP'), ('given', 'VBN'), ('a', 'DT'), ('default', 'NN'), ('position', 'NN'), ('and', 'CC'), ('orientation', 'NN'), ('But', 'CC'), ('if', 'IN'), ('you', 'PRP'), ('save', 'VBP'

In [5]:
from nltk.collocations import *
text = ''.join(newsgroups.data)
text = re.sub('[^\\sA-Za-z]+', '', text)
word_tokens = nltk.wordpunct_tokenize(text)
finder = BigramCollocationFinder.from_words(word_tokens)
bigram_measures = nltk.collocations.BigramAssocMeasures()

pmi_df = pd.DataFrame(list(finder.score_ngrams(bigram_measures.pmi)), columns=['bigram', 'pmi']).sort_values(by='pmi', ascending=False)
pmi_df.head(20)



Unnamed: 0,bigram,pmi
0,"(AAH, EXAMINER)",19.133962
1502,"(claicerintintinColoradoEDU, Farmer)",19.133962
1496,"(chertdungeonlonestarorg, PANIC)",19.133962
1497,"(chipset, HWfuncs)",19.133962
1498,"(chprismgatechEDU, claye)",19.133962
1499,"(cicerocsumassedu, texturetemp)",19.133962
1500,"(cindysolansolanunitno, Cynthia)",19.133962
1501,"(cjkCsyGKonetcomcom, cjknetcomcom)",19.133962
1503,"(cliche, derision)",19.133962
1494,"(chaveycswiscedu, Darrah)",19.133962


In [6]:
chi_sq_df = pd.DataFrame(list(finder.score_ngrams(bigram_measures.chi_sq)), columns=['bigram', 'chi-sq']).sort_values(by='chi-sq',ascending=False)
chi_sq_df.head(20)

Unnamed: 0,bigram,chi-sq
0,"(AAAA, BBBB)",575303.0
1819,"(catastrophically, deflated)",575303.0
1811,"(cainfsubitnet, scricain)",575303.0
1812,"(calm, carefullywritten)",575303.0
1813,"(capitals, EMERGENCY)",575303.0
1814,"(cardboard, cutout)",575303.0
1815,"(carlosnphoenixprincetonedu, spacephoenixprinc...",575303.0
1816,"(carnage, chaplains)",575303.0
1817,"(castlabengrwiscedu, pubxdtarZ)",575303.0
1818,"(catalogtelnet, bisonacsubuffaloedu)",575303.0


In [7]:
finder.apply_freq_filter(2)

raw_freq_with_filter = pd.DataFrame(list(finder.score_ngrams(bigram_measures.raw_freq)), columns=['bigram', 'filtered-freq']).sort_values(by='filtered-freq',ascending=False)
raw_freq_with_filter.head(20)


Unnamed: 0,bigram,filtered-freq
0,"(of, the)",0.005425
1,"(in, the)",0.003071
2,"(Subject, Re)",0.002533
3,"(In, article)",0.001978
4,"(to, the)",0.001836
5,"(to, be)",0.001763
6,"(on, the)",0.001723
7,"(is, a)",0.001486
8,"(it, is)",0.001364
9,"(that, the)",0.00133


In [8]:
finder.apply_freq_filter(2)

ttest_with_filter = pd.DataFrame(list(finder.score_ngrams(bigram_measures.student_t)), columns=['bigram','filtered-t-test']).sort_values(by='filtered-t-test', ascending=False)
ttest_with_filter.head(20)


Unnamed: 0,bigram,filtered-t-test
0,"(of, the)",45.444401
1,"(Subject, Re)",38.018105
2,"(in, the)",34.856437
3,"(In, article)",33.608131
4,"(to, be)",28.900686
5,"(on, the)",26.937322
6,"(it, is)",25.210055
7,"(Lines, In)",24.592866
8,"(is, a)",23.28236
9,"(I, dont)",22.512556


In [9]:
pd.concat([pmi_df,chi_sq_df,ttest_with_filter,raw_freq_with_filter], axis=1).head(20)

Unnamed: 0,bigram,pmi,bigram.1,chi-sq,bigram.2,filtered-t-test,bigram.3,filtered-freq
0,"(AAH, EXAMINER)",19.133962,"(AAAA, BBBB)",575303.0,"(of, the)",45.444401,"(of, the)",0.005425
1,"(AAO, AngloAustralian)",19.133962,"(AAH, EXAMINER)",575303.0,"(Subject, Re)",38.018105,"(in, the)",0.003071
2,"(ABPSoft, mehl)",19.133962,"(AAO, AngloAustralian)",575303.0,"(in, the)",34.856437,"(Subject, Re)",0.002533
3,"(ACAD, nffsff)",19.133962,"(ABORTED, ABORT)",575303.0,"(In, article)",33.608131,"(In, article)",0.001978
4,"(ACCENT, Accent)",19.133962,"(ABPSoft, mehl)",575303.0,"(to, be)",28.900686,"(to, the)",0.001836
5,"(ACCOMMODATION, Accommodation)",19.133962,"(ACAD, nffsff)",575303.0,"(on, the)",26.937322,"(to, be)",0.001763
6,"(ACTIVIST, NEWSLETTER)",19.133962,"(ACCENT, Accent)",575303.0,"(it, is)",25.210055,"(on, the)",0.001723
7,"(ACTIVITY, Cary)",19.133962,"(ACCOMMODATION, Accommodation)",575303.0,"(Lines, In)",24.592866,"(is, a)",0.001486
8,"(ADRG, Arc)",19.133962,"(ACTIVIST, NEWSLETTER)",575303.0,"(is, a)",23.28236,"(it, is)",0.001364
9,"(AEA, GEGno)",19.133962,"(ACTIVITY, Cary)",575303.0,"(I, dont)",22.512556,"(that, the)",0.00133


In [10]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words=set(stopwords.words("english"))
filtered_tokens=[]
for token in tokens:
    for w in token:
        if w not in stop_words:
            filtered_tokens.append(w)

ps = PorterStemmer()

stemmed_words=[]
for w in filtered_tokens:
    stemmed_words.append(ps.stem(w))
    
print(f"Stemmed tokens: {stemmed_words[0:200]}")

Stemmed tokens: ['from', 'rych', 'festiv', 'ed', 'ac', 'uk', 'R', 'hawk', 'subject', 'DS', 'where', 'textur', 'rule', 'go', 'line', 'Hi', 'I', 'notic', 'save', 'model', 'map', 'plane', 'posit', 'care', 'DS', 'file', 'reload', 'restart', 'DS', 'given', 'default', 'posit', 'orient', 'but', 'save', 'prj', 'file', 'posit', 'orient', 'preserv', 'doe', 'anyon', 'know', 'inform', 'store', 'DS', 'file', 'noth', 'explicitli', 'said', 'manual', 'save', 'textur', 'rule', 'prj', 'file', 'I', 'like', 'abl', 'read', 'textur', 'rule', 'inform', 'anyon', 'format', 'prj', 'file', 'Is', 'cel', 'file', 'format', 'avail', 'somewher', 'rych', 'rychard', 'hawk', 'email', 'rych', 'festiv', 'ed', 'ac', 'uk', 'virtual', 'environ', 'laboratori', 'dept', 'psycholog', 'tel', 'univ', 'edinburgh', 'fax', 'subject', 'Re', 'biblic', 'back', 'koresh', 'tape', 'cite', 'enclos', 'from', 'kmcvay', 'oneb', 'almanac', 'bc', 'ca', 'ken', 'mcvay', 'organ', 'the', 'old', 'frog', 'almanac', 'line', 'In', 'articl', 'apr', 'utar

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics

raw_data = np.array(newsgroups.data)
raw_labels = np.array(newsgroups.target)

raw_data = np.reshape(raw_data, (raw_data.shape[0],1))
raw_labels = np.reshape(raw_labels, (raw_labels.shape[0],1))

newsgroups_train, newsgroups_test, newsgroups_target_train, newsgroups_target_test = train_test_split(raw_data, raw_labels, test_size=0.3, random_state=42)


vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-zA-Z]+')
vectors = vectorizer.fit_transform(newsgroups_train.ravel())

vectors_test = vectorizer.transform(newsgroups_test.ravel())

svm_clf = SVC(gamma='auto')
svm_clf.fit(vectors, newsgroups_target_train.ravel())
pred = svm_clf.predict(vectors_test)
print(f"SVC results:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")


nb_clf = MultinomialNB()
nb_clf.fit(vectors, newsgroups_target_train.ravel())
pred = nb_clf.predict(vectors_test)
print(f"Multinomial Naive Bayes Classifier results:\n {metrics.confusion_matrix(newsgroups_target_test.ravel(), pred)}")



SVC results:
 [[  0   0 224   0]
 [  0   0 297   0]
 [  0   0 307   0]
 [  0   0 189   0]]
Multinomial Naive Bayes Classifier results:
 [[221   0   2   1]
 [  1 293   3   0]
 [  0   7 300   0]
 [ 47   7  11 124]]
