## Importing modules

In [3]:
import numpy as np
import pandas as pd
import os
import io

## Listing folders

In [8]:
folders=sorted(os.listdir(os.path.join('/media/root/par2/scripts/20_newsgroups'))) # os.listdir gives a list of all files in this path
folders

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [9]:
folders[19] 

'talk.religion.misc'

## Loading the data into kernel

In [10]:
DATA_DIR='/media/root/par2/scripts/20_newsgroup'

In [11]:
data={} # data is a dictionary of the form { folder1 : [doc1,doc2,....,doc1000] , folder2 : [doc1,doc2,doc3,....] }
for folder in folders:
    data[folder]=[]
    for file in os.listdir(os.path.join(DATA_DIR,folder)):
        with io.open(os.path.join(DATA_DIR,folder,file),encoding='latin-1') as opened_file:
            data[folder].append(opened_file.read())
print(len(data[folders[1]]))

1000


## Building vocabulary (feature set)

#### > Creating list of stop words 

In [12]:
from nltk.corpus import stopwords # Importing list of stop words from nltk
from string import punctuation # Importing list of punctuations from string
punctuations=list(punctuation)
stopWords=stopwords.words('english')
stopWords+=punctuations # Combined list of stop words

#### > Own list of stop words

In [13]:
# Common words throughout all docs play no part in classification ,so removing them
stopWords+=['subject:','from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:', 'organization:', 
            'would', 'writes:', 'references:', 'article', 'sender:', 'nntp-posting-host:', 'people', 
            'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first', 
            'anyone','world', 'really', 'since', 'right', 'believe', 'still', 
            "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'"]

#### > Building Vocab

In [14]:
vocab={}
# Creating a dictionary of words and their frequency
for i in range(len(data)): # For each key(newsgroup)
    for doc in data[folders[i]]: # For each document corresponding to key(newsgroup)
        for word in doc.split(): # For each word in that document
            if word.lower() not in stopWords and len(word.lower()) >= 5:
                if word.lower() not in vocab:
                    vocab[word.lower()]=1
                else:
                    vocab[word.lower()]+=1
len(vocab)

390170

In [17]:
# Sort the dictionary based on frequency of each 'possible' vocabulary word
import operator
sorted_vocab=sorted(vocab.items(),key=operator.itemgetter(1),reverse=True)

### Building final feature list from vocab

In [18]:
# Choosing top 2000 vocab words as features
feature_list=[]
for key in sorted_vocab:
    feature_list.append(key[0])
feature_list=feature_list[0:2000] # K = 2000 (number of words in vocab)

### Transforming data into X and Y 

In [19]:
Y=[] # list of newsgroups 
for i in range(len(data)):
    for doc in data[folders[i]]:
        Y.append(folders[i])
Y=np.array(Y)

In [20]:
type(data[folders[1]])

list

In [21]:
# Each row : one doc and each column : one word from feature_list
# Columns headers will be the names of features 
df = pd.DataFrame(columns = feature_list)

for folder in folders:
    # Insert each file as a new row 
    for file in os.listdir(os.path.join(DATA_DIR,folder)):
        # Add a new row for every file
        df.loc[len(df)] = np.zeros(len(feature_list))
        with io.open(os.path.join(DATA_DIR,folder,file),encoding='latin-1') as opened_file:
            for word in opened_file.read().split():
                if word.lower() in feature_list:
                    df[word.lower()][len(df)-1] += 1 #df[current_column][current_row]
df

Unnamed: 0,going,something,computer,system,might,please,reply-to:,using,never,can't,...,returned,split,argic,earth.,plastic,cellular,"back,",sure.,spencer),gordon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X=df.values

In [23]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Splitting X and Y into training and testing data

In [24]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=0,test_size=0.25)

### Using the inbuilt Multinomial Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
#add this line
Y_predict = clf.predict(x_train)

In [41]:
print("testing Accuracy =")
clf.score(x_test,y_test)

testing Accuracy =


0.8426

In [42]:
print("Traning Accuracy =")
clf.score(x_train,y_train)

Traning Accuracy =


0.8935120357404814

In [43]:
dictionary=fit(x_train,y_train)

In [44]:
y_pred=predict(x_test,dictionary)

In [45]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))

[[187   1   0   0   0   0   0   1   1   0   0   0   0   5   3   1   2   3
    1  62]
 [  0 204   7   6   1  10   2   1   0   0   0   2   4   3   7   0   0   1
    1   1]
 [  0   9 207   4   3  10   3   0   1   0   0   1   6   2   0   0   0   0
    1   0]
 [  0   6  12 203   5   2   3   0   0   2   0   1   3   0   1   0   1   0
    0   0]
 [  1   6   3  13 212   2   2   2   0   0   0   1   5   0   0   0   1   0
    0   0]
 [  0   4   5   3   0 207   2   1   0   0   0   4   0   1   1   0   0   0
    0   0]
 [  1   3   0   5   4   3 227   3  12   0   1   1   7   1   0   0   2   1
    1   0]
 [  1   3   2   0   4   1   5 238  12   3   0   1   3   2   3   0   3   1
    5   1]
 [  0   4   0   2   1   1   3   7 249   0   0   1   2   4   3   0   0   3
    2   2]
 [  0   1   0   0   0   0   1   1   2 240   0   0   0   0   2   0   1   0
    1   1]
 [  2   0   1   0   0   0   1   0   1   3 229   1   1   1   1   0   0   1
    1   0]
 [  0   2   2   0   0   1   1   2   0   0   0 206   1   0   0   0

[[187   1   0   0   0   0   0   1   1   0   0   0   0   5   3   1   2   3
    1  62]
 [  0 204   7   6   1  10   2   1   0   0   0   2   4   3   7   0   0   1
    1   1]
 [  0   9 207   4   3  10   3   0   1   0   0   1   6   2   0   0   0   0
    1   0]
 [  0   6  12 203   5   2   3   0   0   2   0   1   3   0   1   0   1   0
    0   0]
 [  1   6   3  13 212   2   2   2   0   0   0   1   5   0   0   0   1   0
    0   0]
 [  0   4   5   3   0 207   2   1   0   0   0   4   0   1   1   0   0   0
    0   0]
 [  1   3   0   5   4   3 227   3  12   0   1   1   7   1   0   0   2   1
    1   0]
 [  1   3   2   0   4   1   5 238  12   3   0   1   3   2   3   0   3   1
    5   1]
 [  0   4   0   2   1   1   3   7 249   0   0   1   2   4   3   0   0   3
    2   2]
 [  0   1   0   0   0   0   1   1   2 240   0   0   0   0   2   0   1   0
    1   1]
 [  2   0   1   0   0   0   1   0   1   3 229   1   1   1   1   0   0   1
    1   0]
 [  0   2   2   0   0   1   1   2   0   0   0 206   1   0   0   0