In [1]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re
import csv
import math
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [2]:
class NaiveBayes:
    #we pass it the various classes 
    def __init__(self, classes):
        self.classes = classes
        
    #data processing
    def cleanstring(self, str_arg):
        clean_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
        clean_str=re.sub('(\s+)',' ',clean_str) #multiple spaces are replaced by single space
        clean_str=clean_str.lower() #converting the cleaned string to lower case
    
        return clean_str # returning the preprocessed string
    
    #This function goes through one piece of data (one comment) and ticks up each word it contains.
    #Each word only counts once even if repeated. End result if iterated over every comment is a total list of how many comments in each category contain any given word.
    def addToDict(self, item, cat_index):
        words = [] #empty list
        if isinstance(item,np.ndarray): item=item[0]
        for word in item.split():
            if word not in set(stopwords.words('english')):
                if word not in words:
                    self.dicts[cat_index][word]+=1
                    words.append(word)
                    #also add to dictionary of the entire dataset
                    self.allwords[word]+=1
                
    def train(self, data, labels):
        self.examples = data
        self.labels = labels
        if not isinstance(self.examples,np.ndarray): 
            self.examples=np.array(self.examples)
        if not isinstance(self.labels,np.ndarray): 
            self.labels=np.array(self.labels)
        self.dicts = np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])]) #this array of dictionaries will be used to count word occurances in each class
        #note these two aren't the same things: allwords is a dictionary of the entirety dataset, wordslist is a list of each word that occurs at least once int he data set
        self.allwords = defaultdict(lambda:0)
        self.wordslist = set()
        #these arrays will be used to store the probabilities values for each word in each category
        #note the lambda values of 0 : these just serve as a marker that the word never occurs.
        self.thetaone = np.array([defaultdict(lambda: 0 ) for index in range(self.classes.shape[0])]) #this array will be used to store the probability theta(j1) for each word in each category
        self.thetazero = np.array([defaultdict(lambda: 0) for index in range(self.classes.shape[0])])
        
        #for each subreddit, we go through and count the number of occurances of each word
        for index, category in enumerate(self.classes):
            category_examples = self.examples[labels == category]
            clean_examples = [self.cleanstring(ex) for ex in category_examples]
            clean_examples=pd.DataFrame(data=clean_examples)
            
            np.apply_along_axis(self.addToDict, 1, clean_examples, cat_index=index) 
            print(str(index))
            
        #now we can calculate the target probabilities
        
        self.Pc = np.empty(self.classes.shape[0])
        #default values for words not found
        self.thetaonedef = np.empty(self.classes.shape[0])
        self.thetazerodef = np.empty(self.classes.shape[0])
        #again, for each class
        for index, cat in enumerate(self.classes):            
            self.Pc[index] = np.sum(self.labels == cat) /float(self.labels.shape[0]) #probability of the class occuring
            words = set(self.dicts[index].keys()) #get each word
            self.wordslist.update(words)
            for word in words:
                self.thetaone[index][word] = (self.dicts[index][word] + 1) / float(np.sum(self.labels == cat) + 2) #number of comments in this class where the word occurs over the number of comments in this class
                self.thetazero[index][word] = (self.allwords[word] - self.dicts[index][word] +1) / float(np.sum(self.labels != cat) +2) #number of comments NOT in this class where the word occurs over number of commends not in this class
            
            #default values if the tested input includes a word that never occurs
            self.thetaonedef[index] = 1 / float(np.sum(self.labels == cat) + 2)
            self.thetazerodef[index] = 1 / float(np.sum(self.labels != cat) + 2)
            print(str(index))
    
    #returns the probability for the value to be in each class.
    def testValue(self, test_example):
        delta  = np.zeros(self.classes.shape[0]) #this will be used to store the probabilities and returned at the end
        inputwords = defaultdict(lambda:0)
        
        #first clean the input
        inputstring = self.cleanstring(test_example)
        for word in inputstring.split():
            if word not in set(stopwords.words('english')):
                inputwords[word] = 1
        
        for index, cat in enumerate(self.classes):
            delta[index] += math.log(self.Pc[index] / (1 - self.Pc[index]))
            for word in self.wordslist:
                t1 = self.thetaone[index][word]
                if t1 == 0:
                    t1 = self.thetaonedef[index]
                t0 = self.thetazero[index][word]
                if t0 == 0:
                    t0 = self.thetazerodef[index]
                if inputwords[word] == 1:
                    total = math.log(t1 / t0)
                else:
                    total = math.log((1 - t1) / (1 - t0))                
                delta[index] += total
        
        return delta
    
    def test(self, test_set):
        probabilities = []
        
        for example in test_set:
            prob = self.testValue(example)
            probabilities.append(self.classes[np.argmax(prob)])
            #print(example + " | " + self.classes[np.argmax(prob)])
        return probabilities

In [3]:
#reading in data and then dropping unnamed axis for training set 

train = 'reddit_train.csv'
df_train = pd.read_csv(train).drop(columns='id')
df_train = df_train[0:62000]
df_train_labels = df_train['subreddits']
df_train = df_train['comments']

df_classes = np.asarray(['hockey', 'nba', 'leagueoflegends',
                     'soccer', 'funny', 'movies', 'anime',
                     'Overwatch', 'trees', 'GlobalOffensive',
                     'nfl', 'AskReddit', 'gameofthrones', 'conspiracy',
                     'worldnews', 'wow', 'europe', 'canada', 'Music', 'baseball'])

In [4]:
model = NaiveBayes(df_classes)
model.train(df_train, df_train_labels)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [9]:
test_string = "And compete? No way, impossible. You cut your productivity by 20% compared to someone with the same technology working 40h. It's possible in some sector I guess, but not in manufacturing and some services. If the U.S. stay at 40h and Canada move to 32h, we're fucked. You cut your own revenue by 20% too, while most Canadian can't make any saving, removing 8h of work will bankrupt them. It's a ~20% in income tax cut too."
model.testValue(test_string)

array([-13.01414734,  -7.34375045, -13.74996761, -16.60084092,
       -17.16166047, -11.63193832, -16.77381011,  -7.13663567,
       -18.14703752, -10.42231119, -16.08997695, -11.42091286,
       -10.09190954, -19.27073508, -14.4369159 ,  -9.0074178 ,
       -16.01661146,  -0.56195619, -24.7392117 ,  -8.89370527])

In [6]:
test = 'reddit_train.csv'
df_test = pd.read_csv(train).drop(columns='id')
df_test = df_test[62001:]
df_test_labels = df_test['subreddits']
df_test = df_test['comments']

df_predict = model.test(df_test)

total = 0
df_test_labels = df_test_labels.tolist()
print(df_test_labels)
print(df_predict)
for index, value in enumerate(df_predict):
    if value == df_test_labels[index]:
        total += 1

total

['GlobalOffensive', 'trees', 'canada', 'trees', 'funny', 'worldnews', 'Overwatch', 'nba', 'hockey', 'hockey', 'worldnews', 'movies', 'nfl', 'europe', 'wow', 'Overwatch', 'hockey', 'funny', 'trees', 'Music', 'worldnews', 'conspiracy', 'baseball', 'baseball', 'soccer', 'movies', 'leagueoflegends', 'europe', 'baseball', 'nfl', 'canada', 'AskReddit', 'trees', 'Music', 'wow', 'baseball', 'worldnews', 'anime', 'trees', 'AskReddit', 'baseball', 'conspiracy', 'leagueoflegends', 'anime', 'Overwatch', 'wow', 'Overwatch', 'baseball', 'europe', 'soccer', 'trees', 'europe', 'gameofthrones', 'GlobalOffensive', 'movies', 'nfl', 'Overwatch', 'baseball', 'leagueoflegends', 'Music', 'nfl', 'canada', 'europe', 'wow', 'nfl', 'anime', 'GlobalOffensive', 'baseball', 'AskReddit', 'funny', 'europe', 'AskReddit', 'AskReddit', 'gameofthrones', 'wow', 'GlobalOffensive', 'anime', 'soccer', 'AskReddit', 'hockey', 'worldnews', 'nfl', 'baseball', 'gameofthrones', 'leagueoflegends', 'nfl', 'funny', 'hockey', 'hockey'

2461

In [None]:
#reading in data and then dropping unnamed axis for training set 

train2 = 'reddit_train.csv'
df_train2 = pd.read_csv(train).drop(columns='id')
df_train2 = df_train[0:56000]
df_train_labels2 = df_train2['subreddits']
df_train2 = df_train2['comments']

df_classes2 = np.asarray(['hockey', 'nba', 'leagueoflegends',
                     'soccer', 'funny', 'movies', 'anime',
                     'Overwatch', 'trees', 'GlobalOffensive',
                     'nfl', 'AskReddit', 'gameofthrones', 'conspiracy',
                     'worldnews', 'wow', 'europe', 'canada', 'Music', 'baseball'])
model2 = NaiveBayes(df_classes)
model2.train(df_train2, df_train_labels2)
test2 = 'reddit_train.csv'
df_test2 = pd.read_csv(train).drop(columns='id')
df_test2 = df_test2[56001:]
df_test_labels2 = df_test2['subreddits']
df_test2 = df_test2['comments']

df_predict2 = model.test(df_test2)

total = 0
df_test_labels2 = df_test_labels2.tolist()
print(df_test_labels2)
print(df_predict2)
for index, value in enumerate(df_predict2):
    if value == df_test_labels2[index]:
        total += 1

total