## Text Classification
Project Aim : To classify a given document into a category.

## Building Vocabulary
Before applying any kind of algorithm, we first build the vocabulory, that is getting the data ready in the desired form/shape.

In [1]:
# Imports
import os
from nltk.corpus import stopwords
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Feature Extraction 

In [2]:
DATASET_PATH = '20_newsgroups'

# List of folders that will be used later
folders = [folder for folder in os.listdir(DATASET_PATH)]

In [3]:
# 2D list of files, where ith index will store all filenames in the ith folder
files = []
for folder in folders:
    # Create paths to subdirectories
    folder_path = os.path.join(DATASET_PATH, folder)
    # Add all files in current folder to the list
    files.append([file for file in os.listdir(folder_path)])

# Total files 
sum(len(files[i]) for i in range(20))

19997

In [4]:
# List of pathnames to all files in our dataset
pathname_list = []
# For each folder
for folder in range(len(folders)):
    # For all files in current folder
    for file in files[folder]:
        # add path to pathname list
        pathname_list.append(os.path.join(DATASET_PATH, os.path.join(folders[folder], file)))

# check if all files path have been added
len(pathname_list)

19997

In [5]:
# Y array - this holds the output class for each document
Y = []
for folder in folders:
    folder_path = os.path.join(DATASET_PATH, folder)
    num_files = len(os.listdir(folder_path))
    # Add Y = name of folder for all files in current folder
    for i in range(num_files):
        Y.append(folder)

# Checking if all files have been given a category
len(Y)

19997

#### Splitting data into train and test

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(pathname_list, Y, random_state=42)
len(X_train), len(X_test), len(Y_train), len(Y_test)

(14997, 5000, 14997, 5000)

### Word Extraction from document

In [7]:
# Function to process a file and extract words from it
def preprocess_words(words): 
    
    # removing punctuation from each word
    mapping_table = str.maketrans('', '', string.punctuation)

    # clean words
    words = [w.translate(mapping_table) for w in words]

    # normalizing case - making all words of the same case
    words = [w.lower() for w in words]

    # removing any blank words that may have been left due to nature of the dataset
    words = [w for w in words if w] # if the word has some length then store else remove

    # removing numeric strings as they don't add that much value to our dataset
    words = [w for w in words if not w.isdigit()]

    # removing words with single character or two characters
    words = [w for w in words if len(w) > 2]
    
    return words

In [8]:
# Function to remove stop words
def remove_stopwords(words):
    stop_words = set(stopwords.words('english')) 
    block_words = ['newsgroups', 'xref', 'path', 'from', 'subject', 'sender', 'organisation', 'apr','gmt', 'last','better','never','every','even','two','good','used','first','need','going','must','really','might','well','without','made','give','look','try','far','less','seem','new','make','many','way','since','using','take','help','thanks','send','free','may','see','much','want','find','would','one','like','get','use','also','could','say','us','go','please','said','set','got','sure','come','lot','seems','able','anything','put', '--', '|>', '>>', '93', 'xref', 'cantaloupe.srv.cs.cmu.edu',
                   '20', '16', "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'", '21', '19', '10', '17', '24', 
                   'reply-to:', 'thu', 'nntp-posting-host:', 're:','25''18'"i'd"'>i''22''fri,''23''>the',
                   'references:','xref:','sender:','writes:','1993','organization:']
    return [w for w in words if not w in stop_words and not w in block_words]

In [9]:
# Function to remove metadata above each line

# The metadata for each document ends at the first occurence of a blank line

def remove_metadata(lines):
    # iterate over all lines
    for i in range(len(lines)):
        if lines[i]=='\n':
            # Blank line found
            start = i + 1
            break
    # discard data before the start valued line
    new_lines = lines[start:]
    return new_lines

In [10]:
# Function to convert a line into a list of words
def tokenize_line(line):
    # Splitting the line at each space encountered
    words = line[0:len(line)-1].strip().split(" ")
    
    # preprocess words
    words = preprocess_words(words)
    
    # remove stopwords
    words = remove_stopwords(words)
    
    return words

In [11]:
# Function to convert a document into list of words
def tokenize_doc(file_path):
    
    # Load document
    doc = open(file_path, 'r', encoding="ISO-8859-1")
    doc_lines = doc.readlines()
    
    # Remove metadata
    doc_lines = remove_metadata(doc_lines)
    
    # Intializing empty words array for doc
    doc_words = []
    
    for line in doc_lines:
        doc_words.append(tokenize_line(line))
    
    return doc_words

In [12]:
# Flatten the list of words i.e, 2D to 1D
def flatten(words):
    new_words = []
    for i in words:
        for j in i:
            new_words.append(j)
    return new_words

### Working on actual data

In [13]:
list_of_words = []
for doc in X_train:
    list_of_words.append(flatten(tokenize_doc(doc)))

#### List of words is a 2D array, where each row has words for that doc. And on flattening we get the complete vocabulary

In [14]:
# Converting the list of words to a numpy array
np_words = np.asarray(flatten(list_of_words))
np_words.shape

(1795356,)

In [15]:
# Number of unique words
words, counts = np.unique(np_words, return_counts=True)
len(words)

147032

In [16]:
# Sorting the unique words on basis of count and storing them in two lists
# one list holds the actual word and the other holds count of that word
freq, words_ = (list(i) for i in zip(*(sorted(zip(counts, words), reverse=True))))

### Transforming training data into proper shape

In [62]:
num_words_selected = 12300
features = words_[0:num_words_selected]
print(features)



### Building the vocabulary dictionary

In [63]:
# This dictionary will hold the vocabulory for each document and will help us build the training set later
# dict[doc][word] symbolised that a particular document has in it a particular words and the value of the key 
# is the frequency of that word in that particular doc
vocab = {}
curr_doc = 1
for curr_doc_words in list_of_words:
    vocab[curr_doc] = {}
    np_curr_doc_words = np.asarray(curr_doc_words)
    words_in_curr_doc, frequencies = np.unique(np_curr_doc_words, return_counts=True)
    for i in range(len(words_in_curr_doc)):
        # Add each words count for current doc
        vocab[curr_doc][words_in_curr_doc[i]] = frequencies[i]
    curr_doc = curr_doc + 1

vocab

{1: {'1993apr2616362711364csrduiucedu': 1,
  'abortion': 2,
  'afford': 2,
  'arise': 1,
  'article': 1,
  'birth': 3,
  'cant': 2,
  'case': 1,
  'choice': 1,
  'conlon': 1,
  'doesnt': 1,
  'done': 1,
  'evelyn': 1,
  'expensive': 1,
  'frank': 1,
  'gskinneruiucedu': 1,
  'hardly': 1,
  'hatching': 1,
  'hens': 1,
  'latter': 1,
  'meaning': 1,
  'mother': 2,
  'objection': 1,
  'odwyer': 1,
  'odwyersseie': 1,
  'parker': 1,
  'pay': 1,
  'paying': 1,
  'pregnancy': 3,
  'refused': 1,
  'sense': 1,
  'sensible': 1,
  'skinner': 1,
  'statement': 1,
  'statements': 1,
  'together': 1,
  'topic': 1,
  'unable': 1,
  'ways': 1,
  'writes': 1},
 2: {'\tyou': 1,
  '1ren9a94qmorrowstanfordedu': 1,
  '2nd': 1,
  'accept': 1,
  'actual': 2,
  'ago': 1,
  'altogether': 1,
  'ancient': 1,
  'anonymous': 2,
  'another': 2,
  'appear': 1,
  'applies': 1,
  'apply': 1,
  'aramaic': 1,
  'around': 1,
  'article': 2,
  'associates': 2,
  'assuredly': 1,
  'attempt': 1,
  'attested': 1,
  'backgro

### Using Vocab to build training dataset into 2D shape

In [64]:
# Building a 2D array as our training data using vocabulory
# First 7000 words will be treated as features, ith row will have the frequency of each of those features for 
# the ith doc
X_train_final = []
for key in vocab.keys():
    curr_doc_row = []
    for word in features:
        # if word present in vocab of curr_doc add it in training set
        if word in vocab[key].keys():
            curr_doc_row.append(vocab[key][word])
        # else put value as 0
        else:
            curr_doc_row.append(0)
    # add curr_doc row to training set
    X_train_final.append(curr_doc_row)

In [65]:
# Casting arrays to numpy arrays
X_train_final = np.asarray(X_train_final)
Y_train = np.asarray(Y_train)

In [66]:
X_train_final.shape, Y_train.shape

((14997, 12300), (14997,))

In [67]:
# Making a dataframe for better representation of data
df = pd.DataFrame(X_train_final)
df.columns = features
df.head()

Unnamed: 0,writes,article,dont,people,know,think,time,right,system,god,...,waits,wailing,wagons,vomiting,vols,visually,vii,viability,verbeek,venerable
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,2,0,1,1,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Preparing Test Data
Now that our training data is prepared , we will prepare the test data in a similar manner

In [68]:
list_of_words_test = []
for doc in X_test:
    list_of_words_test.append(flatten(tokenize_doc(doc)))

len(list_of_words_test)

5000

In [69]:
vocab_test = {}
curr_doc = 1
for curr_doc_words in list_of_words_test:
    vocab_test[curr_doc] = {}
    np_curr_doc_words = np.asarray(curr_doc_words)
    words_in_curr_doc, frequencies = np.unique(np_curr_doc_words, return_counts=True)
    for i in range(len(words_in_curr_doc)):
        # Add each words count for current doc
        vocab_test[curr_doc][words_in_curr_doc[i]] = frequencies[i]
    curr_doc = curr_doc + 1

vocab_test

{1: {'3point': 1,
  'agree': 1,
  'along': 1,
  'andrew': 1,
  'andrewfripwvtekcom': 1,
  'attack': 1,
  'back': 2,
  'belts': 2,
  'best': 1,
  'cargo': 1,
  'class': 1,
  'controversial': 1,
  'copies': 1,
  'cravanvoyager': 1,
  'eliminating': 1,
  'extra': 1,
  'ill': 1,
  'instead': 1,
  'killed': 1,
  'klossner': 1,
  'leg': 1,
  'mazda': 1,
  'middle': 1,
  'mpv': 2,
  'price': 1,
  'rear': 1,
  'room': 1,
  'seat': 3,
  'shoehorned': 1,
  'shove': 1,
  'small': 1,
  'space': 1,
  'suv': 1,
  'villager': 2,
  'villagerquest': 1},
 2: {'1993apr1703152013902clarinetcom': 1,
  'actual': 1,
  'algorithm': 1,
  'article': 1,
  'brad': 1,
  'bradclarinetcom': 1,
  'cellular': 1,
  'classified': 1,
  'decryption': 1,
  'doesnt': 1,
  'encrypting': 1,
  'encryption': 1,
  'end': 5,
  'hence': 1,
  'however': 1,
  'kept': 1,
  'keys': 1,
  'later': 1,
  'link': 1,
  'main': 1,
  'means': 1,
  'phones': 1,
  'radio': 2,
  'secure': 1,
  'sense': 1,
  'telco': 1,
  'templeton': 1,
  'think

In [70]:
#2D test dataset making
X_test_final = []
for key in vocab_test.keys():
    curr_doc_row = []
    for word in features:
        # if word present in vocab of curr_doc add it in training set
        if word in vocab_test[key].keys():
            curr_doc_row.append(vocab_test[key][word])
        # else put value as 0
        else:
            curr_doc_row.append(0)
    # add curr_doc row to training set
    X_test_final.append(curr_doc_row)

In [71]:
# Casting arrays to numpy arrays
X_test_final = np.asarray(X_test_final)
Y_test = np.asarray(Y_test)

In [72]:
X_test_final.shape, Y_test.shape

((5000, 12300), (5000,))

In [73]:
# Making a dataframe for better representation of data
df = pd.DataFrame(X_test_final)
df.columns = features
df.head()

Unnamed: 0,writes,article,dont,people,know,think,time,right,system,god,...,waits,wailing,wagons,vomiting,vols,visually,vii,viability,verbeek,venerable
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Text Classification (Using Naive Bayes - Multinomial)

#### Using Sklearn Inbuilt Naive Bayes

In [79]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

In [80]:
# fit on our data
clf.fit(X_train_final, Y_train)

MemoryError: 

In [None]:
Y_pred = clf.predict(X_test_final)

In [None]:
Y_pred.shape

In [None]:
clf.score(X_test_final, Y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(Y_test, Y_pred))

In [None]:
print(confusion_matrix(Y_test, Y_pred))

In [None]:
# Score on training data
clf.score(X_train_final, Y_train)

#### Self Implemented

In [None]:
# Self implementation of Multinomial Naive Bayes

In [None]:
# Fit function - this function will make the count dicitionary for us

# The dicitonary will hold the count of words for all distinct classes (not all documents)

# This dictionary will then be used to predict on the testing data

def fit(x, y):
    # Initializing with an empty dictionary
    result = {}
    classes = set(y)
    # Total data points
    result["total_data"] = len(y)
    
    for curr_class in classes:
        # Make a deeper level for current class
        result[curr_class] = {}
        # Rows where Y = curr_class
        current_class_rows = (y == curr_class)
        # Arrays with data for only curr_class
        X_curr_class = x[current_class_rows]
        Y_curr_class = y[current_class_rows]
        num_features = len(features) # Number of words selected to be features
        
        # for each word in features 
        for i in range (0, num_features):
            # Take sum of current words for all documents with output as current class
            result[curr_class][features[i]] = X_curr_class[:, i].sum()
        # Total word count for current class
        result[curr_class]["total_count"] = len(Y_curr_class)
    return result

In [None]:
# probability Function - this function will calculate log probabilities for all classes for a document using
# the dictionary made by the fit function

def probability(doc, dictionary, curr_class):
    
    # Probability that Y = curr_class
    output_prob = np.log(dictionary[curr_class]["total_count"]) - np.log(dictionary["total_data"])
    num_features = len(features) - 1 # -1 for total_data key
    # for each word in the dataset 
    for i in range (0, num_features):
        # Counts with laplace corrections
        count_word_curr_class = dictionary[curr_class][features[i]] + 1
        count_curr_class = dictionary[curr_class]["total_count"] + len(features)
        curr_word_curr_class_prob = np.log(count_word_curr_class) - np.log(count_curr_class)
        # adding the probability of the word the number of times it occurs in the current document
        for j in range(int(doc[i])):
            output_prob += curr_word_curr_class_prob
    print(output_prob)
    return output_prob

In [None]:
# predictSinglePoint function - this function will predict the output for a single document at a time

def predictSinglePoint(doc, dictionary):
    # Possible classes
    classes = dictionary.keys()
    # Some initialization paramaters
    best_p = -1000
    best_class = -1
    first_run = True
    # iterate over all classes
    for curr_class in classes:
        # if all classes exhausted
        if curr_class == "total_data":
            continue
        # Calculate probability of curr_class being output for the current data point
        p_curr_class = probability(doc, dictionary, curr_class)
        # if it is better than current best probability - update
        if (p_curr_class > best_p or first_run):
            best_p = p_curr_class
            best_class = curr_class
        first_run = False
    return best_class

In [None]:
# Predict Function - Will make the prediction using the dictionary that the fit function will build

def predict(x_test, dictionary):
    
    # We'll make a prediction for each document in the test data
    Y_pred = []
    j = 1
    for doc in x_test:
        print(j)
        j += 1
        doc_class = predictSinglePoint(doc, dictionary)
        Y_pred.append(doc_class)
    return Y_pred

## Training on self implemented multinomial NB

In [None]:
dictionary = fit(X_train_final, Y_train)

In [None]:
dictionary

## Predicting using the dictionary generated

In [None]:
Y_pred_own = predict(X_test_final, dictionary)

In [None]:
Y_pred_own

In [None]:
len(Y_pred_own)

## Scoring and other metrics for self implementation

In [None]:
print(classification_report(Y_test, Y_pred_own))

In [None]:
print(confusion_matrix(Y_test, Y_pred_own))

In [None]:
from sklearn.metrics import accuracy_score
print("Self implementation accuracy :" , accuracy_score(Y_test, Y_pred_own))
print("SKLearn accuracy :", accuracy_score(Y_test, Y_pred))

# adding some more block/stop words might help