In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import math
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
# 5 Classes
labelToNum = {'comp.graphics': 0, 'sci.med': 1, 'talk.politics.misc': 2, 'rec.sport.hockey': 3, 'sci.space': 4}

In [3]:
path = "20_newsgroups/"

docs = []
labels = []

# For subfolders in folders.
for doc in os.listdir(path):
    # only requested folders.
    if doc in labelToNum:
        #print(doc)
        subDir = os.listdir(path+"/"+doc)
        for ins_doc in subDir:
            #print(ins_doc)
            f = open(path+"/"+doc+"/"+ins_doc, 'r', encoding ="ascii", errors ="surrogateescape")
            docs.append(f.read())
            labels.append(labelToNum[doc])
        
docs = np.array(docs)
labels = np.array(labels)

### 1.

In [4]:
lemmatizer = WordNetLemmatizer() 
stopWords = set(stopwords.words('english'))

In [5]:
def preprocessing(sent):
    sent = sent.lower()
    sent = re.sub(r'[^a-zA-Z0-9]+', ' ', sent)
    sent = [w for w in sent.split() if w not in stopWords]
    sent = [w for w in sent if len(w) > 2]
    sent = [lemmatizer.lemmatize(w) for w in sent]
    return " ".join(sent).strip()

In [6]:
for i,d in enumerate(docs):
    docs[i] = preprocessing(d)

### 2.

In [7]:
#np.random.choice(80, size=50)
def trainTestSplit(ratio = 0.6):
    # Compute length
    len_of_train = int(docs.shape[0] * ratio)
    # Get train indices randomly.
    index_of_train = np.random.choice(docs.shape[0], size=len_of_train)
    
    # return data
    return (docs[index_of_train], 
            labels[index_of_train], 
            docs[np.setdiff1d(range(docs.shape[0]), index_of_train)], 
            labels[np.setdiff1d(range(docs.shape[0]), index_of_train)])

In [8]:
train, trainLabel, test, testLabel = trainTestSplit()


Train Data Size:  3000 3000
Test Data Size:  2752 2752


### 3.

In [9]:

ClassFreq = {}
for j,k in enumerate(train):
    for word in k.split(" "):
        if word not in ClassFreq:
            ClassFreq[word] = set()
        ClassFreq[word].add(labels[j])

In [10]:
InverseClassFreq = {}
length=len(ClassFreq[word]
for word in ClassFreq.keys():
    InverseClassFreq[word] = math.log(5/length)

In [11]:
def count_term_freq(term_freq,train,trainLabel):
    for i,sent in enumerate(train):
        for w in sent.split(" "):
            if w not in term_freq[trainLabel[i]]:
                term_freq[trainLabel[i]][w] = 0
            term_freq[trainLabel[i]][w] += 1

    return term_freq

In [12]:
term_freq = {0: {}, 1: {}, 2: {}, 3: {}, 4: {}}
term_freq = count_term_freq(term_freq,train,trainLabel)


In [13]:
tf_icf = {}
for i in range(0,5):
    tf_icf[i] = {}
    for w in term_freq[i]:
        tf_icf[i][w] = term_freq[i][w] * InverseClassFreq[w]

In [15]:
k = 200
features = set()
for i in range(5):
    z = list(dict(sorted(tf_icf[i].items(), key=lambda y: y[1], reverse=True)).keys())
    features = features.union(set(z[:k]))
vocab = features

In [16]:
def generateNewData(data, vocab):
    updatedData = []
    for text in data:
        newText = []
        for word in text.split(" "):
            if word in vocab:
                newText.append(word)
        updatedData.append(" ".join(newText).strip())
        
    return updatedData

In [17]:
def generateFeatures(train_data, test_data):
    vect = TfidfVectorizer()
    A = vect.fit_transform(train_data)
    B = vect.transform(test_data)
    return (A, B)

In [18]:
train = generateNewData(train, vocab)
test = generateNewData(test, vocab)
(train, test) = generateFeatures(train, test)

### 4.

In [32]:
from sklearn.naive_bayes import GaussianNB
naiveBayesModel = GaussianNB()

In [33]:
naiveBayesModel.fit(train.toarray(),trainLabel)

GaussianNB()

In [34]:
predict_train = naiveBayesModel.predict(train.toarray())
predict_test = naiveBayesModel.predict(test.toarray())


### 5.

In [35]:
print("Train Accuracy: ", sum(predict_train == trainLabel)/len(trainLabel))
print("Test Accuracy: ", sum(predict_test == testLabel)/len(testLabel))

Train Accuracy:  0.9731428571428572
Test Accuracy:  0.9218623481781376


In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(testLabel,y)

array([[471,  13,   0,   0,  12],
       [114, 371,   0,   0,  15],
       [ 22,   9, 474,   1,  10],
       [ 30,   0,   0, 451,   0],
       [ 68,   5,   3,   0, 401]], dtype=int64)

### 6.

##### 50-50 Split

In [43]:
trainX, trainy, testX, testy = trainTestSplit(0.5)

trainX = generateNewData(trainX, vocab)
testX = generateNewData(testX, vocab)
(trainX, testX) = generateFeatures(trainX, testX)

model = GaussianNB()
model.fit(trainX.toarray(), trainy)
x = model.predict(trainX.toarray())
y = model.predict(testX.toarray())
print("Train Accuracy: ", sum(x == trainy)/len(trainy))
print("Test Accuracy: ", sum(y == testy)/len(testy))
confusion_matrix(testy,y)

Train Accuracy:  0.9808
Test Accuracy:  0.9230769230769231


array([[528,  35,   0,   0,  32],
       [ 48, 522,  18,   1,   4],
       [  2,  19, 582,   3,  17],
       [  0,   2,   8, 585,   3],
       [ 20,  10,  10,   1, 579]], dtype=int64)

##### 70-30 split

In [44]:
trainX, trainy, testX, testy = trainTestSplit(0.7)

trainX = generateNewData(trainX, vocab)
testX = generateNewData(testX, vocab)
(trainX, testX) = generateFeatures(trainX, testX)

model = GaussianNB()
model.fit(trainX.toarray(), trainy)
x = model.predict(trainX.toarray())
y = model.predict(testX.toarray())

print("Train Accuracy: ", sum(x == trainy)/len(trainy))
print("Test Accuracy: ", sum(y == testy)/len(testy))
confusion_matrix(testy,y)

Train Accuracy:  0.9694285714285714
Test Accuracy:  0.9113100081366965


array([[452,  22,   0,   2,  29],
       [ 46, 427,  20,   0,  15],
       [  3,   9, 480,   0,  17],
       [  1,   3,   6, 470,   0],
       [ 18,   7,  20,   0, 411]], dtype=int64)