# Text Classification

This notebook performs text classification on the popular <b>"20 newsgroups"</b> dataset.
<br>It uses <b>Multinomial Naive Bayes</b>, which is fast and effective for text classification.

In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import clear_output # to get realtime updations in model making
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Self Implementation

Dataset - http://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups

In [2]:
# function for reading dataset files
def readDatasetFiles(dataset_dir = r"./20_newsgroups", show_progress = True):
    """Read every document of the dataset into memory.

    The directory is expected to hold one sub-directory per category, each
    containing one file per document.

    Parameters
    ----------
    dataset_dir : str
        Root directory of the dataset (defaults to the path used by this notebook).
    show_progress : bool
        When True, keep a live "files read" counter in the cell output.

    Returns
    -------
    (X, Y) : tuple of two lists
        X[i] is a (file name, file content) pair and Y[i] is the category
        (sub-directory name) that document was read from.
    """
    # gather every document path up front so the total is known without listing the directories twice
    document_paths = []
    for category in sorted(os.listdir(dataset_dir)): # sorted -> same order on every run and platform
        category_dir = os.path.join(dataset_dir, category)
        if not os.path.isdir(category_dir): # skip stray files (e.g. .DS_Store) next to the category folders
            continue
        for document in sorted(os.listdir(category_dir)):
            document_paths.append((category, document, os.path.join(category_dir, document)))

    X = []
    Y = []
    total_files = len(document_paths)
    for total_files_readed, (category, document, path) in enumerate(document_paths):
        if show_progress:
            clear_output(wait = True)
            print('Total Files Readed:', total_files_readed, 'Out of', total_files)
        # posts may contain bytes that are not valid in the platform default encoding;
        # latin-1 maps every byte to a character, so decoding can never raise
        with open(path, "r", encoding = "latin-1") as file:
            X.append((document, file.read()))
        Y.append(category)
    return (X, Y)

### Tokenization of training data

<b>"Stopwords"</b> are words that appear in almost every document.<br>It is better to remove them from the dataset, because they carry no useful information for training.

In [3]:
# English stopwords that are excluded from the vocabulary, grouped by initial letter
stop_word = [
    # a
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    # b
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    # c - f
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    # h
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her",
    "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    # i
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    # l - o
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    # s
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    # t
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    # u - v
    "under", "until", "up",
    "very",
    # w
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's",
    "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    # y
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
]

In [4]:
# function to split a sentence into bag of words from every non-alphanumeric character or a group of same
def tokenize(x):
    """Lower-case ``x`` and split it on every run of non-word characters.

    "Non-word" is the regex class ``\\W``. When the text starts or ends with
    such a separator, the returned list carries an empty string at that end.
    """
    lowered = x.lower()
    separator = re.compile(r'\W+')
    return separator.split(lowered)

In [5]:
# function to make a dictionary of all words which are present in training dataset
def makeDictionary(x_train):
    """Count how often every usable word occurs across the training documents.

    ``x_train`` is a sequence whose items hold the document text at index 1.
    A token is counted only when it is purely alphabetic, is not a stopword
    and is longer than two characters. Returns a dict mapping word -> count.
    """
    word_counts = {}
    document_total = len(x_train)
    for documents_done, datapoint in enumerate(x_train): # iterate through all the training datapoints
        clear_output(wait = True)
        print('Features Extracted From', documents_done, 'Out of', document_total, 'Training Data')
        for word in tokenize(datapoint[1]):
            # keep the word only if it passes every purity check
            if word.isalpha() and word not in stop_word and len(word) > 2:
                word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

In [6]:
# function to select top few words(as features) from the dictionary which has the max frequency in training dataset
def prepareModelFeatures(x_train, max_features = 3000):
    """Pick the most frequent training words to use as model features.

    Parameters
    ----------
    x_train : sequence
        Training datapoints, passed unchanged to ``makeDictionary``.
    max_features : int or None
        Maximum number of words to keep (default 3000). ``None`` keeps every word.

    Returns
    -------
    list of str
        Words ordered from most to least frequent.

    Raises
    ------
    ValueError
        If ``max_features`` is negative (a negative slice bound would silently
        drop words from the end instead of limiting the count).
    """
    if max_features is not None and max_features < 0:
        raise ValueError("max_features must be a non-negative integer or None")
    dictionary = makeDictionary(x_train)
    # sorted() is stable, so words with equal counts keep the order in which they were first counted
    by_frequency = sorted(dictionary.items(), key = lambda item: item[1], reverse = True)
    return [word for word, _ in by_frequency[:max_features]]

After preparing the features (say, n features) for model training, the data is preprocessed as described below.
### Preprocessing of training and testing data
Build a matrix (NumPy array) whose n columns correspond to the n features and whose rows are the word counts of the training datapoints.
<br>Each row has length n, and the ith column of a row holds the number of times the ith feature (word) occurs in that datapoint.
<br>Do the same for the testing datapoints, using the same features that were extracted from the training datapoints.

In [7]:
# function for preprocessing of the training data
def preprocessTrainingData(x_train):
    """Extract the model features from the training data and build its word-count matrix.

    Parameters
    ----------
    x_train : sequence
        Training datapoints; the document text is read from index 1 of each item.

    Returns
    -------
    (x_train_dataset, features) : tuple
        ``x_train_dataset[i][j]`` is the number of times ``features[j]`` occurs in
        training datapoint ``i``; ``features`` is the list returned by
        ``prepareModelFeatures``.
    """
    features = prepareModelFeatures(x_train) # get features from training data
    # map every feature to its column once, so each token costs one dict lookup
    # instead of two linear scans of the feature list
    feature_column = {}
    for column, feature in enumerate(features):
        feature_column.setdefault(feature, column) # first occurrence wins, like list.index
    total_training_data = len(x_train)
    x_train_dataset = np.zeros([total_training_data, len(features)], int)
    for i, datapoint in enumerate(x_train): # iterating through each datapoint
        clear_output(wait = True)
        print('Training Data Processed on Obtained Features:', i, 'Out of', total_training_data)
        # count each feature in the datapoint and put it in the column of corresponding feature in the current row
        for token in re.split(r'\W+', datapoint[1].lower()):
            column = feature_column.get(token)
            if column is not None:
                x_train_dataset[i, column] += 1
    return (x_train_dataset, features)

In [8]:
# function for preprocessing of the testing data
def preprocessTestingData(x_test, features, show_progress = True):
    """Build the word-count matrix of the testing data over the given features.

    Parameters
    ----------
    x_test : sequence
        Testing datapoints; the document text is read from index 1 of each item.
    features : list of str
        Feature words, one per output column (normally those extracted from the training data).
    show_progress : bool
        When True, keep a live progress counter in the cell output.

    Returns
    -------
    numpy.ndarray of int, shape (len(x_test), len(features))
        Entry ``[i][j]`` is the number of times ``features[j]`` occurs in datapoint ``i``.
    """
    # map every feature to its column once, so each token costs one dict lookup
    # instead of two linear scans of the feature list
    feature_column = {}
    for column, feature in enumerate(features):
        feature_column.setdefault(feature, column) # first occurrence wins, like list.index
    total_testing_data = len(x_test)
    x_test_dataset = np.zeros([total_testing_data, len(features)], int)
    for i, datapoint in enumerate(x_test): # iterating through each datapoint
        if show_progress:
            clear_output(wait = True)
            print('Testing Data Processed on Obtained Features:', i, 'Out of', total_testing_data)
        # count each feature in the datapoint and put it in the column of corresponding feature in the current row
        for token in re.split(r'\W+', datapoint[1].lower()):
            column = feature_column.get(token)
            if column is not None:
                x_test_dataset[i, column] += 1
    return x_test_dataset

### Model Making

fit function - builds a dictionary whose keys are the model classes. Each value is another dictionary that maps every feature to its total count over the training datapoints of that class.

In [9]:
# model fit function
def fit(x_train, y_train, features):
    """Collect the per-class statistics needed by multinomial naive Bayes.

    Parameters
    ----------
    x_train : array-like, shape (n_datapoints, n_features)
        Word-count matrix; column ``i`` holds the counts of ``features[i]``.
    y_train : array-like, shape (n_datapoints,)
        Class label of every datapoint.
    features : list of str
        Feature word of every column.

    Returns
    -------
    dict
        ``result[cls]`` maps ``'total_count'`` to the number of datapoints of the class,
        every feature to its summed count within the class, and ``'total_words'`` to the
        sum of those feature counts. ``result['total_data']`` is the number of datapoints
        and ``result['dictionary']`` is ``features``.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    feature_total = len(features)
    result = {}
    # np.unique gives the classes in sorted order, so the model is built the same way on every run
    for current_class in np.unique(y_train):
        class_mask = y_train == current_class
        # one vectorised column sum per class instead of one np.sum call per feature
        feature_counts = x_train[class_mask][:, :feature_total].sum(axis = 0)
        class_summary = {'total_count': int(np.count_nonzero(class_mask))}
        for feature, count in zip(features, feature_counts):
            class_summary[feature] = count
        # sum only the word counts: 'total_count' is a number of documents and must not be included
        class_summary['total_words'] = feature_counts.sum()
        result[current_class] = class_summary
    result['total_data'] = len(y_train)
    result['dictionary'] = features
    return result

probability function - calculates the (log) probability of a particular class using Bayes' theorem and Laplace correction
<br><br>Bayes' theorem:<br>
probability of a particular class $\propto \frac{\text{number of training datapoints in that class}}{\text{total number of training datapoints}} \times \prod_i \frac{\text{count of the ith feature in that class}}{\text{count of all features in that class}}$, where the product runs over every word of the test datapoint
<br><br>Laplace correction:<br>
add 1 to the numerator and the number of distinct features to the denominator of every feature probability, so that a feature never seen in a class does not give a zero probability

In [10]:
# function to calculate probability of a particular class for a particular test data
def probability(clf, x, current_class):
    """Return the log of the (unnormalised) probability of ``current_class`` for datapoint ``x``.

    Parameters
    ----------
    clf : dict
        Model in the layout produced by ``fit``: per-class dicts with ``'total_count'``,
        ``'total_words'`` and one entry per feature, plus ``'total_data'`` and ``'dictionary'``.
    x : array-like of int
        Word counts of the datapoint; ``x[i]`` belongs to ``clf['dictionary'][i]``.
    current_class : hashable
        Class key to score.

    Returns
    -------
    numpy.float64
        log prior + sum over features of (count * log Laplace-smoothed likelihood).
        Logs are used because multiplying many values below 1 would underflow.
    """
    class_counts = clf[current_class]
    vocabulary = clf['dictionary']
    word_counts = np.asarray(x)
    # log prior: share of the training datapoints that belong to this class
    output = np.log(class_counts['total_count']) - np.log(clf['total_data'])
    # Laplace correction adds the vocabulary size to the denominator; take it from the
    # feature list so the two bookkeeping keys of the class dict are not counted as words
    log_denominator = np.log(class_counts['total_words'] + len(vocabulary))
    feature_totals = np.array([class_counts[vocabulary[i]] for i in range(len(word_counts))], dtype = float)
    log_likelihoods = np.log(feature_totals + 1) - log_denominator
    # a word occurring k times contributes k times its log likelihood
    return output + np.sum(word_counts * log_likelihoods)

In [11]:
# function to identify the best class which has the max probability for a particular test data
def predict_single_class(clf, x):
    """Return the class of ``clf`` that scores the highest ``probability`` for datapoint ``x``.

    The bookkeeping keys "total_data" and "dictionary" are not classes and are skipped.
    On equal scores the class met first wins; -1 is returned when ``clf`` holds no class.
    """
    winner = -1
    winner_score = None # None until the first class has been scored
    for label in clf:
        if label == "total_data" or label == "dictionary":
            continue
        score = probability(clf, x, label)
        # the first class is always accepted, later ones only when strictly better
        if winner_score is None or score > winner_score:
            winner, winner_score = label, score
    return winner

In [12]:
# predict function
def predict(clf, x_test_dataset):
    """Predict a class for every row of ``x_test_dataset`` and return them as a NumPy array.

    A live progress counter is written to the cell output after each prediction.
    """
    predictions = []
    datapoint_total = len(x_test_dataset)
    for datapoints_done, datapoint in enumerate(x_test_dataset):
        label = predict_single_class(clf, datapoint)
        clear_output(wait = True)
        print('Model Tested on', datapoints_done, 'Out of', datapoint_total, 'Testing Data')
        predictions.append(label)
    return np.array(predictions)

#### Running all the functions

In [13]:
X, Y = readDatasetFiles()

# fixed seed: without it every run draws a different train/test split,
# so the scores reported below could not be reproduced
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 42)

# features are extracted from the training split only and reused for the testing split
x_train_dataset, features = preprocessTrainingData(x_train)
y_train_dataset = np.array(y_train)
x_test_dataset = preprocessTestingData(x_test, features)
y_test_dataset = np.array(y_test)
clear_output(wait = True)
print('Training & Testing Dataset Processed and Ready')

Training & Testing Dataset Processed and Ready


In [14]:
# train the self-implemented model on the training word counts
clf = fit(x_train = x_train_dataset, y_train = y_train_dataset, features = features)
print('Training Complete')

Training Complete


In [None]:
y_pred_MultinomialNB = predict(clf, x_test_dataset)
clear_output(wait = True)

# model report
print("Classification Report")
print(classification_report(y_test_dataset, y_pred_MultinomialNB))
print("Confusion Matrix")
labels = np.unique(y_test_dataset)
cm = confusion_matrix(y_test_dataset, y_pred_MultinomialNB, labels = labels)
df_cm = pd.DataFrame(cm, index = labels, columns = labels)
sns.set(font_scale = 1.6) # set before the axes are created so the font scale applies to them
fig, ax = plt.subplots(figsize = (24, 16))
sns.heatmap(df_cm, annot = True, cmap = "Blues", annot_kws = {'size': 14}, fmt = 'g', ax = ax)
# show every row in full; use the real number of classes rather than a hard-coded 20
ax.set_ylim(len(labels), 0.0)
# confusion_matrix puts the true classes on the rows and the predicted classes on the columns
ax.set(xlabel = "Predicted class", ylabel = "True class", title = "Confusion Matrix - self-implemented Multinomial Naive Bayes")
plt.show()
print()
print("Accuracy Score:", end = " ")
print(accuracy_score(y_test_dataset, y_pred_MultinomialNB) * 100, "%", sep="")

Model Tested on 4000 Out of 5000 Testing Data


## Comparison of the above model with the sklearn built-in model

In [None]:
# train scikit-learn's multinomial naive Bayes on the same word-count matrix, for a like-for-like comparison
sklearn_clf = MultinomialNB()
sklearn_clf.fit(x_train_dataset, y_train_dataset)

In [None]:
y_pred_sklearn_MultinomialNB = sklearn_clf.predict(x_test_dataset)

# model report
print("Classification Report")
print(classification_report(y_test_dataset, y_pred_sklearn_MultinomialNB))
print("Confusion Matrix")
labels = np.unique(y_test_dataset)
cm = confusion_matrix(y_test_dataset, y_pred_sklearn_MultinomialNB, labels = labels)
df_cm = pd.DataFrame(cm, index = labels, columns = labels)
sns.set(font_scale = 1.6) # set before the axes are created so the font scale applies to them
fig, ax = plt.subplots(figsize = (24, 16))
sns.heatmap(df_cm, annot = True, cmap = "Blues", annot_kws = {'size': 14}, fmt = 'g', ax = ax)
# show every row in full; use the real number of classes rather than a hard-coded 20
ax.set_ylim(len(labels), 0.0)
# confusion_matrix puts the true classes on the rows and the predicted classes on the columns
ax.set(xlabel = "Predicted class", ylabel = "True class", title = "Confusion Matrix - sklearn MultinomialNB")
plt.show()
print()
print("Accuracy Score:", end = " ")
print(accuracy_score(y_test_dataset, y_pred_sklearn_MultinomialNB) * 100, "%", sep="")