# The dataset
1. The dataset has three splits:
    + train
    + test
    + unsupervised
   Splits can be found in the hugging faces page of the dataset.
   Or with the function get_dataset_split_names("name_of_dataset")
2. Here are the size of datasets:
    + Size of the train dataset: 25000
    + Size of the test dataset: 25000
    + Size of the unsupervised dataset: 50000

In [None]:
from datasets import Dataset
from datasets import load_dataset
from datasets import load_dataset_builder

import math

In [None]:
dataset = load_dataset_builder("imdb")
dataset_train = load_dataset("imdb", split='train')
dataset_test = load_dataset("imdb", split='test')
dataset_unsupervised = load_dataset("imdb", split='unsupervised')

In [None]:
dataset.info.description

In [None]:
print("Size of the train dataset: " + str(len(dataset_train)))
print("Size of the test dataset: " + str(len(dataset_test)))
print("Size of the unsupervised dataset: " + str(len(dataset_unsupervised)))

# Naive Bayes classifier

## I. Preprocessing

In [None]:
punctuation_filter = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
                      ',', '.', '/', ':', ';', '<', '=', '>', '?', '@',
                      '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

def to_lower_case(row: dict) -> dict:
    """
    Lower text field in the row dict
    return: updated row
    """
    row['text'] = row['text'].lower()
    return row

def remove_punctuation(row: dict) -> dict:
    """
    Replace punctuation from punctuation_filter list to
    spaces in the text field of row dict
    return: updated row
    """
    for punctuation in punctuation_filter:
        row['text'] = row['text'].replace(punctuation, ' ')
    return row

def preprocessing(row: dict) -> dict:
    """
    Lower text field in the row dict and replace punctuation
    from punctuation_filter list to spaces in the text field
    of row dict
    return: updated row
    """
    return to_lower_case(remove_punctuation(row))

In [22]:
preprocess_train = dataset_test.map(preprocessing)
preprocess_test = dataset_train.map(preprocessing)
preprocess_unsupervised = dataset_unsupervised.map(preprocessing)

Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-8085cf80541666ee.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d02788d40065b7bf.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-440a117c0d431674.arrow


## II. Naive Bayes classifier

### Our implementation

In [93]:
def train_naive_bayes(documents: Dataset, classes: list):
    logprior = {}
    loglikelihood = {k: {} for k in classes}
    
    # Vocabulary of documents
    voc = {} # Histogram {word: count}
    class_voc = {k: {} for k in classes}
    total_count = 0
    
    def update_voc(document: Dataset) -> None:
        words = document['text'].split()
        nonlocal total_count
        total_count += len(words)
        for word in words:
            voc.update({word: voc.get(word, 0) + 1})
            c = document['label']
            class_voc[c].update({word: class_voc[c].get(word, 0) + 1})
    
    documents.map(update_voc)

    # Update total count for loglikelihood formula
    total_count += len(voc)

    for c in classes:
        num_doc = len(documents)
        c_docs = documents.filter(lambda doc: doc['label'] == c)
        num_c = len(c_docs)
        logprior[c] = math.log(num_c / num_doc)

        for word in voc.keys():
            loglikelihood[c][word] = math.log((class_voc[c].get(word, 0) + 1)/total_count)
        
    return logprior, loglikelihood, voc

In [94]:
def test_naive_bayes(test_str: str, logprior: dict, loglikelihood: dict, classes: list, voc: dict) -> int:
    sum_max = None
    c_max = None
    for c in classes:
        sum_c = logprior[c]
        for word in test_str.split():
            if word in voc:
                sum_c += loglikelihood[c][word]
        if not sum_max or sum_max < sum_c:
            sum_max = sum_c
            c_max = c
    return c_max

In [95]:
classes = [0, 1]
logprior, loglikelihood, voc = train_naive_bayes(preprocess_train, classes)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-a8cd3e9d43393ef8.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b4d91ec42a490537.arrow


In [96]:
def accuracy(preprocess_test: Dataset, logprior: dict, loglikelihood: dict, classes: list, voc: dict):
    confusion = [0, 0, 0, 0] # TP, TN, FP, FN
    accuracy=0
    def update_voc(document: Dataset) -> None:
        nonlocal accuracy
        res = test_naive_bayes(document['text'], logprior, loglikelihood, classes, voc)
        confusion[1-res + (2 * (1-document['label']))] += 1
        accuracy += res == document['label']
    preprocess_test.map(update_voc)
    accuracy = accuracy/len(preprocess_test)
    return accuracy

In [98]:
print('Our implementation accuracy:', accuracy(preprocess_test, logprior, loglikelihood, classes, voc))

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Our implementation accuracy:  0.82152


### Scikit
#### 3. Implement a naive Bayes classifier using scikit-learn.

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [81]:
vectorizer = CountVectorizer(analyzer = "word", 
                             lowercase=True, 
                             tokenizer = None, 
                             preprocessor = None, 
                             stop_words = None, 
                             max_features = 5000)
db = preprocess_train
y = db['label']
corpus = db['text']
bag_of_words = vectorizer.fit_transform(corpus)
print(bag_of_words.shape)
X = bag_of_words.toarray()

clf = MultinomialNB(force_alpha=True)
clf.fit(X, y)

(25000, 5000)


In [85]:
#vectorizer.get_feature_names_out()
#print ((clf.feature_log_prob_))
vocabulary = vectorizer.vocabulary_
#print(vocabulary)
vectorizer.transform(['love in my ass'])

<1x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [90]:
print(clf.predict(vectorizer.transform(['I love this movie'])))

[1]


In [99]:
y = preprocess_test['label']
X = vectorizer.transform(preprocess_test['text'])
score = clf.score(X, y)
print('Scikit learn accuracy:', score)

Scikit learn accuracy: 0.83076


#### 4. Report the accuracy on both training and test set, for both your implementation and the scikit-learn one.
- Our implementation accuracy: 0.82152
- Scikit learn accuracy: 0.83076

# Stemming and Lemmatization