<h3>Import Necessary Libraries

In [713]:
#Importing Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import operator
import os
#Model uses NLTK Snowball word stemmer and standard english NLTK stopwords
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

<h2>Read Data from .txt files in each directory
<h5>Categories assigned according to the category they were given on cnn.com

In [714]:
#Initializing empty dataframe
data = pd.DataFrame(columns=["Text", "Category"])

#Searching for files in directory containing sports articles
for file in os.scandir(r"data\sports"):
    #Ensuring it is in fact a file
    if file.is_file():
        #And opening as READONLY if so
        f = open(file, 'r')
        #Reading the text file and appending it as a sports article to the dataframe
        data = data.append({'Text':f.read(), 'Category':'Sports'}, ignore_index=True)

#Same process for directory containing business articles
for file in os.scandir(r"data\business"):
    if file.is_file():
        f = open(file, 'r', encoding='UTF-8')
        data = data.append({'Text':f.read(), 'Category':'Business'}, ignore_index=True)

#And Politics articles
for file in os.scandir(r"data\politics"):
    if file.is_file():
        f = open(file, 'r', encoding='UTF-8')
        data = data.append({'Text':f.read(), 'Category':'Politics'}, ignore_index=True)

data

Unnamed: 0,Text,Category
0,"In 2018, D'Ernest Johnson wasn't signed by any...",Sports
1,Tom Brady congratulated Aaron Rodgers for beco...,Sports
2,Leading Formula One drivers defended the popul...,Sports
3,Greece's Maria Sakkari will make her debut at ...,Sports
4,"Gymnast Simone Biles, who disclosed her mental...",Sports
5,The NBA is facing another incident involving C...,Sports
6,An Italian DJ claims he suffered a number of i...,Sports
7,Canadian athletes looking to compete for Team ...,Sports
8,It's not often an NBA star can quietly make hi...,Sports
9,With 100 race wins and seven world championshi...,Sports


<h3>Functions to remove stopwords/punctuation, and stem remaining words

In [715]:
#Stemming and removing stopwords
def stopwords_and_stemming(text):
    #Splitting text into a list of words
    text = text.split()
    clean_data = []
    #Iterating through word list
    for word in text:
        if word not in stop_words:
            #If it's not a stop word, stem and add to new list
            clean_data.append(stemmer.stem(word))
    return clean_data

#Removing remaining punctuation
def remove_punc(words):
    new_list = []
    #Iterating through word list
    for i in words:
        #Most punctuation is attached to a word and included in the string
        new_string = ''
        #Iterating through each character of each word, and removing if it is punctuation
        for char in i:
            if char.isalnum() and char != '.' and char != ',' and char != "'":
                new_string += char
        new_list.append(new_string)
    return new_list

<h1>Creating Text Classifier Class

In [716]:
class TextClassifier():
    def __init__(self, dataset):
        #Removing stopwords, punctuation and stemming across dataset
        dataset['Text'] = dataset['Text'].apply(stopwords_and_stemming).apply(remove_punc)
        #Splitting the given data into test and training groups
        txt_train, txt_test, cat_train, cat_test = train_test_split(dataset.Text, dataset.Category, test_size=.2)
        #Creating training set and test set for class
        self.training_set = pd.DataFrame({'Text':txt_train,'Category':cat_train})
        self.test_set = pd.DataFrame({'Text':txt_test,'Category':cat_test})
        #Preserving original dataset
        self.raw_data = dataset
        print('Model Created...')

    #Splitting every word of every article into respective categories
    def create_training_table(self):
        self.training_data_sports = []
        self.training_data_business = []
        self.training_data_politics = []

        #For every list of words (article) in the training set under this category, add it to training_data_[category] list
        for article in self.training_set[self.training_set['Category'] == 'Sports'].Text:
            self.training_data_sports.extend(article)

        for article in self.training_set[self.training_set['Category'] == 'Business'].Text:
            self.training_data_business.extend(article)

        for article in self.training_set[self.training_set['Category'] == 'Politics'].Text:
            self.training_data_politics.extend(article)

        print('Training sets created')

    #Calculate the conditional probability of a given word belonging to a given category
    #i.e. conditional probability of WORD given CATEGORY

    def conditional_prob(self, word, category):
        if category == 'Sports':
            if word in self.training_data_sports:
                #Calculate probabililty that given word is in sports category training set
                return (self.training_data_sports.count(word)) / len(self.training_data_sports)
            else:
                return 1
        elif category == 'Business':
            if word in self.training_data_business:
                #Calculate probability for business category
                return (self.training_data_business.count(word)) / len(self.training_data_business)
            else:
                return 1
        elif category == 'Politics':
            if word in self.training_data_politics:
                #For politics category
                return (self.training_data_politics.count(word)) / len(self.training_data_politics)
            else:
                return 1
        else:
            return('Invalid Category')
    
    def predict(self, lst):
        #Initializing each probability to 1
        bus_prob, sports_prob, poli_prob = 1, 1, 1
        #Multiplying conditional probability for each word in test article for each category
        for word in lst:
            bus_prob *= self.conditional_prob(word, 'Business')
            sports_prob *= self.conditional_prob(word, 'Sports')
            poli_prob *= self.conditional_prob(word, 'Politics')

        #Returning maximum probability of the free
        probabilities = {'Sports':sports_prob, 'Business':bus_prob, 'Politics':poli_prob}
        return max(probabilities.items(), key=operator.itemgetter(1))[0]
    
    def run(self):
        self.create_training_table()
        self.test_set['Predicted'] = self.test_set['Text'].apply(self.predict)
        num_correct = np.where(self.test_set['Category'] == self.test_set['Predicted'], 1, 0)
        print(f'Accuracy: {np.sum(num_correct)/len(self.test_set)}')
        print(self.test_set)

In [717]:
model = TextClassifier(data)
model.run()

Model Created...
Training sets created
Accuracy: 0.42857142857142855
                                                 Text  Category Predicted
22  [presid, joe, biden, month, circumspect, descr...  Politics    Sports
2   [lead, formula, one, driver, defend, popular, ...    Sports    Sports
5   [the, nba, face, anoth, incid, involv, china, ...    Sports    Sports
15  [inflat, could, surg, 5, earli, next, year, un...  Business    Sports
8   [it, often, nba, star, quiet, make, way, crowd...    Sports    Sports
32  [the, justic, depart, plan, call, us, secret, ...  Politics    Sports
3   [greec, maria, sakkari, make, debut, wta, fina...    Sports  Politics


<h1>Comparing to SKLearn Naive Bayes Model

In [718]:
#Initializing empty dataframe
data = pd.DataFrame(columns=["Text", "Category"])

#Searching for files in directory containing sports articles
for file in os.scandir(r"data\sports"):
    #Ensuring it is in fact a file
    if file.is_file():
        #And opening as READONLY if so
        f = open(file, 'r', encoding='UTF-8')
        #Reading the text file and appending it as a sports article to the dataframe
        data = data.append({'Text':f.read(), 'Category':'Sports'}, ignore_index=True)

#Same process for directory containing business articles
for file in os.scandir(r"data\business"):
    if file.is_file():
        f = open(file, 'r', encoding='UTF-8')
        data = data.append({'Text':f.read(), 'Category':'Business'}, ignore_index=True)

#And Politics articles
for file in os.scandir(r"data\politics"):
    if file.is_file():
        f = open(file, 'r', encoding='UTF-8')
        data = data.append({'Text':f.read(), 'Category':'Politics'}, ignore_index=True)

In [719]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

test, train = pd.DataFrame(), pd.DataFrame()
train['Text'], test['Text'], train['Category'], test['Category'] = train_test_split(data.Text, data.Category, test_size=.2)

skmodel = make_pipeline(TfidfVectorizer(), MultinomialNB())
skmodel.fit(train['Text'], train['Category'])
predicted = skmodel.predict(test['Text'])
test['Predicted'] = predicted

print(f"Accuracy: {np.sum(np.where(test['Category'] == test['Predicted'], 1, 0))/len(test)}")
print(test)

Accuracy: 0.2857142857142857
                                                 Text  Category Predicted
22  President Joe Biden has for months been circum...  Politics  Politics
5   The NBA is facing another incident involving C...    Sports  Politics
20  Supply chain bottlenecks are weighing on econo...  Business  Business
6   An Italian DJ claims he suffered a number of i...    Sports  Politics
7   Canadian athletes looking to compete for Team ...    Sports  Politics
2   Leading Formula One drivers defended the popul...    Sports  Politics
3   Greece's Maria Sakkari will make her debut at ...    Sports  Politics
