<h3>Import Libraries and Read Dataset

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))
data = pd.read_csv('data.csv')

<h1>Stemming and Removing Stopwords

In [4]:
def stopwords_and_stemming(text):
    text = text.split()
    clean_data = []
    for word in text:
        if word not in stop_words:
            clean_data.append(stemmer.stem(word))
    return clean_data

def remove_punc(words):
    new_list = []
    for i in words:
        new_string = ''
        for char in i:
            if char.isalnum() and char != '.' and char != ',' and char != "'":
                new_string += char
        new_list.append(new_string)
    return new_list


data['Text'] = data['Text'].apply(stopwords_and_stemming).apply(remove_punc)

<h1>Creating Text Classifier Class

In [30]:
class TextClassifier():
    def __init__(self, dataset):
        #Splitting the given data
        txt_train, txt_test, cat_train, cat_test = train_test_split(dataset.Text, dataset.Category, test_size=.2)
        #Creating training set and test set for class
        self.training_set = pd.DataFrame({'Text':txt_train,'Category':cat_train})
        self.test_set = pd.DataFrame({'Text':txt_test,'Category':cat_test})
        self.raw_data = dataset
        print('Model Created...')

    def run_model(self):
        print('Formatting training data...')
        self.training_set = self.create_training_table()
        print('Making predictions...')
        self.predict()
        

    def create_training_table(self):
        for article in self.training_set[self.training_set['Category'] == 'Sports'].Text:
            sports_words = list(article)

        for article in self.training_set[self.training_set['Category'] == 'Business'].Text:
            business_words = list(article)

        for article in self.training_set[self.training_set['Category'] == 'Politics'].Text:
            poli_words = list(article)

        self.training_data_sports = sports_words
        self.training_data_business = business_words
        self.training_data_politics = poli_words
        print('Training sets created')

    def conditional_prob(self, word, category):
        if category == 'Sports':
            if word in self.training_data_sports:
                return (self.training_data_sports.count(word)) / len(self.training_data_sports)
            else:
                return 1
        elif category == 'Business':
            if word in self.training_data_business:
                return (self.training_data_business.count(word)) / len(self.training_data_business)
            else:
                return 1
        elif category == 'Politics':
            if word in self.training_data_politics:
                return (self.training_data_politics.count(word)) / len(self.training_data_politics)
            else:
                return 1
        else:
            return('Invalid Category')
    
    def predict(self, lst):
        sports_prob, bus_prob, poli_prob = 1, 1, 1
        for word in lst:
            sports_prob *= self.conditional_prob(word, 'Sports')
            bus_prob *= self.conditional_prob(word, 'Business')
            poli_prob *= self.conditional_prob(word, 'Politics')
        sports_prob /= 3
        bus_prob /= 3
        poli_prob /= 3
        if sports_prob>bus_prob and sports_prob>poli_prob:
            return('Sports')
        elif bus_prob>sports_prob and bus_prob>poli_prob:
            return('Business')
        elif poli_prob>sports_prob and poli_prob>bus_prob:
            return('Politics')
        else:
            return ('Ambig.')
    
    def run(self):
        self.test_set['Predicted'] = self.test_set['Text'].apply(self.predict)

In [31]:
model = TextClassifier(data)
model.create_training_table()

Model Created...
Training sets created


In [32]:
model.run()

In [33]:
model.test_set

Unnamed: 0,Text,Category,Predicted
13,"[inflat, could, surg, 5, earli, next, year, un...",Business,Politics
6,"[an, italian, dj, claim, suffer, number, injur...",Sports,Business
27,"[presid, joe, biden, said, thursday, even, he,...",Politics,Politics
22,"[presid, joe, biden, said, thursday, would, su...",Politics,Politics
26,"[the, hous, repres, vote, thursday, hold, stev...",Politics,Business
19,"[when, amazon, open, sprawl, warehous, communi...",Business,Politics
