<h3>Import Libraries and Read Dataset

In [74]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import operator
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))
data = pd.read_csv('data.csv')

def stopwords_and_stemming(text):
    text = text.split()
    clean_data = []
    for word in text:
        if word not in stop_words:
            clean_data.append(stemmer.stem(word))
    return clean_data

def remove_punc(words):
    new_list = []
    for i in words:
        new_string = ''
        for char in i:
            if char.isalnum() and char != '.' and char != ',' and char != "'":
                new_string += char
        new_list.append(new_string)
    return new_list

<h1>Creating Text Classifier Class

In [75]:
class TextClassifier():
    def __init__(self, dataset):
        dataset['Text'] = dataset['Text'].apply(stopwords_and_stemming).apply(remove_punc)
        #Splitting the given data
        txt_train, txt_test, cat_train, cat_test = train_test_split(dataset.Text, dataset.Category, test_size=.2)
        #Creating training set and test set for class
        self.training_set = pd.DataFrame({'Text':txt_train,'Category':cat_train})
        self.test_set = pd.DataFrame({'Text':txt_test,'Category':cat_test})
        self.raw_data = dataset
        print('Model Created...')


    def create_training_table(self):
        for article in self.training_set[self.training_set['Category'] == 'Sports'].Text:
            sports_words = list(article)

        for article in self.training_set[self.training_set['Category'] == 'Business'].Text:
            business_words = list(article)

        for article in self.training_set[self.training_set['Category'] == 'Politics'].Text:
            poli_words = list(article)

        self.training_data_sports = sports_words
        self.training_data_business = business_words
        self.training_data_politics = poli_words
        print('Training sets created')

    def conditional_prob(self, word, category):
        if category == 'Sports':
            if word in self.training_data_sports:
                return (self.training_data_sports.count(word)) / len(self.training_data_sports)
            else:
                return 1
        elif category == 'Business':
            if word in self.training_data_business:
                return (self.training_data_business.count(word)) / len(self.training_data_business)
            else:
                return 1
        elif category == 'Politics':
            if word in self.training_data_politics:
                return (self.training_data_politics.count(word)) / len(self.training_data_politics)
            else:
                return 1
        else:
            return('Invalid Category')
    
    def predict(self, lst):
        sports_prob, bus_prob, poli_prob = 1, 1, 1
        for word in lst:
            sports_prob *= self.conditional_prob(word, 'Sports')
            bus_prob *= self.conditional_prob(word, 'Business')
            poli_prob *= self.conditional_prob(word, 'Politics')
        sports_prob /= 3
        bus_prob /= 3
        poli_prob /= 3

        probabilities = {'Sports':sports_prob, 'Business':bus_prob, 'Politics':poli_prob}
        return max(probabilities.items(), key=operator.itemgetter(1))[0]
    
    def run(self):
        self.create_training_table()
        self.test_set['Predicted'] = self.test_set['Text'].apply(self.predict)
        num_correct = np.where(self.test_set['Category'] == self.test_set['Predicted'], 1, 0)
        print(f'Accuracy: {np.sum(num_correct)/len(self.test_set)}')
        print(self.test_set)

In [76]:
model = TextClassifier(data)
model.run()

Model Created...
Training sets created
Accuracy: 0.5
                                                 Text  Category Predicted
17  [evergrand, made, crucial, payment, allow, sta...  Business    Sports
25  [neera, tanden, name, white, hous, staff, secr...  Politics    Sports
11  [price, consum, good, rise, unit, state, tangl...  Business    Sports
3   [greec, maria, sakkari, make, debut, wta, fina...    Sports    Sports
5   [the, nba, face, anoth, incid, involv, china, ...    Sports    Sports
7   [canadian, athlet, look, compet, team, canada,...    Sports    Sports


<h1>Comparing to SKLearn Naive Bayes Model

In [77]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
data=pd.read_csv('data.csv')

test, train = pd.DataFrame(), pd.DataFrame()
train['Text'], test['Text'], train['Category'], test['Category'] = train_test_split(data.Text, data.Category, test_size=.2)

skmodel = make_pipeline(TfidfVectorizer(), MultinomialNB())
skmodel.fit(train['Text'], train['Category'])
predicted = skmodel.predict(test['Text'])
test['Predicted'] = predicted

print(f"Accuracy: {np.sum(np.where(test['Category'] == test['Predicted'], 1, 0))/len(test)}")
print(test)

Accuracy: 1.0
                                                 Text  Category Predicted
3   Greece's Maria Sakkari will make her debut at ...    Sports    Sports
26  The House of Representatives voted on Thursday...  Politics  Politics
4   Gymnast Simone Biles, who disclosed her mental...    Sports    Sports
13  Inflation could surge above 5% early next year...  Business  Business
10  President Joe Biden faces a number of economic...  Business  Business
27  President Joe Biden said Thursday evening he's...  Politics  Politics
