In [5]:
#!/usr/bin/env python3

import numpy as np
import random
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import csv
import time
import math
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
nltk.download('wordnet')


train_file = 'data_train.pkl'
test_file  = 'data_test.pkl'



def load_data():
    #load data and test sets
    train = np.load(train_file, allow_pickle=True)
    test  = np.load(test_file, allow_pickle=True)

    inputs = np.asarray(train[0])
    labels = np.asarray(train[1])

    return inputs, labels, test


class preprocessing():
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.excluded = ["!","#","$","%","&","''","'",":",";",",",".","(",")",
                "|","``","[","]","...","?","","*","--","*i","s",
                "'m","'ll","d","'re","n't", "'s","'d","'ve","-"]


    #SOURCE: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    #Get the POS tag for each word after tokenizing
    def get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    #Split words using NLTK's tokenizer
    def tokenize(self, inputs):
        self.word_tokens = [list(nltk.pos_tag(word_tokenize(inputs[i]))) for i in range(len(inputs))]

    #Word preprocessing:
    #   Some words can be excluded (eg. punctuation marks),
    #   We can also lemmatize or stem words
    def filter_words(self, inputs,
                     exclusion = False,
                     lemmatize = False,
                     stemmer = False):

        self.new_inputs = np.array([None] * len(inputs))
        self.tokenize(inputs)

        if lemmatize : lemmatizer = WordNetLemmatizer()
        if stemmer   : stemmer = SnowballStemmer('english')


        for i in range(len(inputs)):

            temp = []
            for j in range(len(self.word_tokens[i])):
                #Lemmatizer
                if lemmatize:
                    words = lemmatizer.lemmatize(self.word_tokens[i][j][0],
                                    self.get_wordnet_pos(self.word_tokens[i][j][1]))
                #SnowballStemmer
                if stemmer:
                    words = stemmer.stem(self.word_tokens[i][j][0])

                if not lemmatize and not stemmer:
                    words = self.word_tokens[i][j][0]

                #Cut stop words and excluded words
                if exclusion:
                    if words.lower() not in self.stop_words and words not in self.excluded:
                        temp.append(words.lower())
                #Cut only stop words
                else:
                    if words.lower() not in self.stop_words:
                        temp.append(words.lower())
            self.new_inputs[i] = temp

        return self.new_inputs

def data_split(train,k = None):

    #on récupère les labels unique et on compte leurs nombres d'occurrences dans counts
    unique, counts = np.unique(labels, return_counts=True)
    train_ind = np.floor(0.9 * len(inputs)).astype('int32')
    random.seed(5000)
    indices = np.arange(len(train))
    random.shuffle(indices)

    if k is None:

        train_indices = indices[:train_ind]
        test_indices = indices[train_ind:]

        train_inputs = inputs[train_indices]
        test_inputs = inputs[test_indices]
        train_labels = labels[train_indices]
        test_labels = labels[test_indices]

        x_train = train_inputs
        y_train = train_labels
        x_test  = test_inputs
        y_test  = test_labels

        return x_train, y_train, x_test, y_test


#P( word = k| subreddit = j):
class NBC():
    def __init__(self, x_train,y_train, x_test, y_test,alpha, TFIDF = False,smoothing=True ):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.alpha = alpha
        self.TFIDF = TFIDF
        self.smoothing = smoothing
        self.unique_labels, self.label_counts = np.unique(self.y_train, return_counts=True)
        self.num_sentences = len(self.y_train)
        self.len_subreddits = len(self.unique_labels)
        self.the_sentence = np.concatenate([self.x_train[i] for i in range(len(self.x_train))])
        self.words_all, self.counts_all = np.unique(self.the_sentence,return_counts=True)
        self.vocab_size = len(self.words_all)
        self.dict_labels = { self.unique_labels[i]: i for i in range(len(self.unique_labels)) }

    #voir document Jurafsky - Naive Bayes sur Studium, slide 29
    #regroupement des mots de tous les documents appartenant au meme label dans un seul array
    def concatenated_sentence_by_subreddit(self):
        #Concatenated sentence for each subreddit:
        self.mega_sentence = np.array([None] * len(self.unique_labels))
        self.n = np.array([None] * len(self.unique_labels))
        #Number of unique words and their count in mega_sentence for each subreddit:
        self.unique_words = np.array([None] * len(self.unique_labels))
        self.unique_counts = np.array([None] * len(self.unique_labels))

        for j in range(len(self.unique_labels)):
            self.mega_sentence[j] = np.concatenate([self.x_train[i] for i in range(self.num_sentences) if self.y_train[i] == self.unique_labels[j] ]).flatten()
            self.unique_words[j], self.unique_counts[j] = np.unique(self.mega_sentence[j],return_counts=True)
            self.n[j] = len(self.mega_sentence[j])

        self.k= self.vocab_size*self.alpha+self.n

        return self.mega_sentence, self.unique_words, self.unique_counts, self.k


    #P(word=k | subreddit=j)
    def compute_probabilities(self):
        self.concatenated_sentence_by_subreddit()

        if self.TFIDF == False:
            self.probabilities = { }
            for i in range(len(self.words_all)):
                self.probabilities[self.words_all[i]] = {}
                for j in range(len(self.unique_labels)):
                    ind = np.where( self.unique_words[j] == self.words_all[i] )

                    #with Laplace smoothing
                    if self.smoothing:
                        if len(ind[0]) != 0 : self.probabilities[self.words_all[i]][self.unique_labels[j]] = np.log( (self.unique_counts[j][ind] + self.alpha)/self.k[j])
                        if len(ind[0]) == 0: self.probabilities[self.words_all[i]][self.unique_labels[j]] = np.log( self.alpha/self.k[j])

                    #Without smoothing
                    else:
                        if len(ind[0]) != 0 :
                            self.probabilities[self.words_all[i]][self.unique_labels[j]] = np.log( (self.unique_counts[j][ind][0] )/ self.vocab_size+self.n[j])
                        if len(ind[0]) == 0: self.probabilities[self.words_all[i]][self.unique_labels[j]] = 0.

        if self.TFIDF :
            #Creation of the dictionary for probabilities
            self.probabilities = { }
            #for each unique word among all documents:
            for i in range(len(self.words_all)):
                #create a dictionary for each word
                self.probabilities[self.words_all[i]] = {}
                #count in how many documents the word appears
                count = 0
                for j in range(len(self.unique_labels)):
                    ind = np.where( self.unique_words[j] == self.words_all[i] )
                    if len(ind[0]) != 0: count+=1

                if count == 0: idf = np.log( self.len_subreddits )
                elif count == self.len_subreddits: idf = np.log( self.len_subreddits / (self.len_subreddits - 0.5))
                else: idf = np.log( self.len_subreddits / count )
                #write all probabilities for the word for each subreddit:
                for j in range(len(self.unique_labels)):
                    ind = np.where( self.unique_words[j] == self.words_all[i] )

                    #with Laplace smoothing
                    if self.smoothing:

                        if len(ind[0]) != 0 :
                            self.probabilities[self.words_all[i]][self.unique_labels[j]] = np.log( ((self.unique_counts[j][ind] + self.alpha)/self.k[j]) * idf )
                        if len(ind[0]) == 0: self.probabilities[self.words_all[i]][self.unique_labels[j]] = np.log( (self.alpha/self.k[j]) * idf)

                    #Without smoothing
                    else:
                        if len(ind[0]) != 0 :
                            self.probabilities[self.words_all[i]][self.unique_labels[j]] = np.log( ((self.unique_counts[j][ind][0] )/ self.vocab_size+self.n[j]) * idf )
                        if len(ind[0]) == 0: self.probabilities[self.words_all[i]][self.unique_labels[j]] = 0.


        return self.probabilities

    def compute_predictions_sentence(self,sentence):

        dicts = []
        #smoothing
        if self.smoothing:
            if_empty = {self.unique_labels[j] : np.log(self.alpha / self.k[j] ) for j in range(len(self.unique_labels))}

            for i in range(len(sentence)):

                if sentence[i] in self.probabilities:
                    dicts.append(self.probabilities[sentence[i]])
                else:
                    self.probabilities[sentence[i]] = if_empty
                    dicts.append(self.probabilities[sentence[i]])
            result = {k: sum(d[k] for d in dicts) + np.log(1./20.) for k in dicts[0].keys()}

        #no smoothing
        else:
            if_empty = {self.unique_labels[j] : 0. for j in range(len(self.unique_labels))}
            for i in range(len(sentence)):

                if sentence[i] in self.probabilities:
                    dicts.append(self.probabilities[sentence[i]])
                else:
                    self.probabilities[sentence[i]] = if_empty
                    dicts.append(self.probabilities[sentence[i]])
            result = {k: sum(d[k] for d in dicts) + np.log(1./20.) for k in dicts[0].keys()}

        result = max(result, key=result.get)
        return result

    def compute_predictions(self):
        success = []
        preds = []
        real = []
        preds_to_write=["Category"]
        all_preds = []
        ids=[]
        ids.append("Id")
        ids.extend(list(range(30000)))

        self.compute_probabilities()

        for i in range(len(self.x_test)):
            preds.append(self.compute_predictions_sentence(self.x_test[i]))
#             if(i%100==0):
#                 print(i)
            real.append(self.y_test[i])

        preds_to_write.extend(preds)
#         with open("submission_TE_0.5.csv", "w",newline='') as f:
#                 writer = csv.writer(f)
#                 writer.writerows(zip(ids,preds_to_write))

        count=0
        for i in range(len(preds)):
            if preds[i]==real[i]:
                count+=1
        print("prediction is:",100.*(count/len(preds)))
#         return preds

    def validation_error(self):

        conf_mat = self.conf_matrix()
        good_preds = np.sum(np.diag(conf_mat))
        sum_preds = np.sum(conf_mat)

        #a voir test error method
        return 1.0 - good_preds/sum_preds

    def conf_matrix(self):
        n_classes = len(self.unique_labels)
        matrix = np.zeros((n_classes,n_classes))
        predlabels=self.compute_predictions()

        for (test, pred) in zip(self.y_test, predlabels):
            matrix[int(self.dict_labels[test]-1),int(self.dict_labels[pred]-1)] += 1

        return matrix


#Test
k = None
alpha = [0.1,0.3,0.5,0.7,1]
inputs, labels, test = load_data()
model_a = preprocessing()
inputs = model_a.filter_words( inputs,
                 exclusion = True,
                 lemmatize = False,
                 stemmer = False)
test = model_a.filter_words( test,
                 exclusion = True,
                 lemmatize = False,
                 stemmer = False)
x_train, y_train, x_test, y_test= data_split(inputs, k)
for a in alpha:
    print(a)
    nbc_model_a = NBC(x_train,y_train,x_test,y_test, a, TFIDF = False, smoothing=True)
    nbc_model_a.compute_predictions()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ryans\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0.1
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
prediction is: 55.05714285714286
0.3
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
prediction is: 55.528571428571425
0.5
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
51