# The dataset
1. The dataset has three splits:
    + train
    + test
    + unsupervised
   Splits can be found in the hugging faces page of the dataset.
   Or with the function get_dataset_split_names("name_of_dataset")
2. Here are the size of datasets:
    + Size of the train dataset: 25000
    + Size of the test dataset: 25000
    + Size of the unsupervised dataset: 50000

In [None]:
from datasets import Dataset
from datasets import load_dataset
from datasets import load_dataset_builder

import math

In [None]:
dataset = load_dataset_builder("imdb")
dataset_train = load_dataset("imdb", split='train')
dataset_test = load_dataset("imdb", split='test')
dataset_unsupervised = load_dataset("imdb", split='unsupervised')

In [None]:
dataset.info.description

In [None]:
print("Size of the train dataset: " + str(len(dataset_train)))
print("Size of the test dataset: " + str(len(dataset_test)))
print("Size of the unsupervised dataset: " + str(len(dataset_unsupervised)))

# Naive Bayes classifier

## I. Preprocessing

In [None]:
punctuation_filter = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
                      ',', '.', '/', ':', ';', '<', '=', '>', '?', '@',
                      '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

def to_lower_case(row: dict) -> dict:
    """
    Lower text field in the row dict
    return: updated row
    """
    row['text'] = row['text'].lower()
    return row

def remove_punctuation(row: dict) -> dict:
    """
    Replace punctuation from punctuation_filter list to
    spaces in the text field of row dict
    return: updated row
    """
    for punctuation in punctuation_filter:
        row['text'] = row['text'].replace(punctuation, ' ')
    return row

def preprocessing(row: dict) -> dict:
    """
    Lower text field in the row dict and replace punctuation
    from punctuation_filter list to spaces in the text field
    of row dict
    return: updated row
    """
    return to_lower_case(remove_punctuation(row))

In [None]:
preprocess_train = dataset_test.map(preprocessing)
preprocess_test = dataset_train.map(preprocessing)
preprocess_unsupervised = dataset_unsupervised.map(preprocessing)

## II. Naive Bayes classifier

### Our implementation

In [None]:
def train_naive_bayes(documents: Dataset, classes: list):
    logprior = {}
    loglikelihood = {k: {} for k in classes}
    
    # Vocabulary of documents
    voc = {} # Histogram {word: count}
    class_voc = {k: {} for k in classes}
    total_count = 0
    
    def update_voc(document: Dataset) -> None:
        words = document['text'].split()
        nonlocal total_count
        total_count += len(words)
        for word in words:
            voc.update({word: voc.get(word, 0) + 1})
            c = document['label']
            class_voc[c].update({word: class_voc[c].get(word, 0) + 1})
    
    documents.map(update_voc)

    # Update total count for loglikelihood formula
    total_count += len(voc)

    for c in classes:
        num_doc = len(documents)
        c_docs = documents.filter(lambda doc: doc['label'] == c)
        num_c = len(c_docs)
        logprior[c] = math.log(num_c / num_doc)

        for word in voc.keys():
            loglikelihood[c][word] = math.log((class_voc[c].get(word, 0) + 1)/total_count)
        
    return logprior, loglikelihood, voc

In [None]:
def test_naive_bayes(test_str: str, logprior: dict, loglikelihood: dict, classes: list, voc: dict) -> int:
    sum_max = None
    c_max = None
    for c in classes:
        sum_c = logprior[c]
        for word in test_str.split():
            if word in voc:
                sum_c += loglikelihood[c][word]
        if not sum_max or sum_max < sum_c:
            sum_max = sum_c
            c_max = c
    return c_max

In [None]:
classes = [0, 1]
logprior, loglikelihood, voc = train_naive_bayes(preprocess_train, classes)

In [None]:
def accuracy(preprocess_test: Dataset, logprior: dict, loglikelihood: dict, classes: list, voc: dict):
    confusion = [0, 0, 0, 0] # TP, TN, FP, FN
    accuracy=0
    def update_voc(document: Dataset) -> None:
        nonlocal accuracy
        res = test_naive_bayes(document['text'], logprior, loglikelihood, classes, voc)
        confusion[1-res + (2 * (1-document['label']))] += 1
        accuracy += res == document['label']
    preprocess_test.map(update_voc)
    accuracy = accuracy/len(preprocess_test)
    return accuracy

In [None]:
print(accuracy(preprocess_test, logprior, loglikelihood, classes, voc))

### Scikit

# Stemming and Lemmatization

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy

# loading the small English model
nlp = spacy.load("en_core_web_sm")

In [None]:
def lemmatization(row: dict) -> dict:
    """
    do the lemmatization
    """
    lemmas = [token.lemma_ for token in nlp(row['text'])]
    " ".join(lemmas)
    row['text'] = lemmas
    return row

In [None]:
lemmatization_train = preprocess_train.map(lemmatization)

In [14]:
lemmatization_train

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [15]:
lemmatization_train[0]['text']

['I',
 'love',
 'sci',
 'fi',
 'and',
 'be',
 'willing',
 'to',
 'put',
 'up',
 'with',
 'a',
 'lot',
 ' ',
 'sci',
 'fi',
 'movie',
 'tv',
 'be',
 'usually',
 'underfunded',
 ' ',
 'under',
 'appreciated',
 'and',
 'misunderstood',
 ' ',
 'I',
 'try',
 'to',
 'like',
 'this',
 ' ',
 'I',
 'really',
 'do',
 ' ',
 'but',
 'it',
 'be',
 'to',
 'good',
 'tv',
 'sci',
 'fi',
 'as',
 'babylon',
 '5',
 'be',
 'to',
 'star',
 'trek',
 ' ',
 'the',
 'original',
 '  ',
 'silly',
 'prosthetic',
 ' ',
 'cheap',
 'cardboard',
 'set',
 ' ',
 'stilte',
 'dialogue',
 ' ',
 'cg',
 'that',
 'doesn',
 't',
 'match',
 'the',
 'background',
 ' ',
 'and',
 'painfully',
 'one',
 'dimensional',
 'character',
 'can',
 'not',
 'be',
 'overcome',
 'with',
 'a',
 ' ',
 'sci',
 'fi',
 ' ',
 'set',
 '  ',
 'I',
 'm',
 'sure',
 'there',
 'be',
 'those',
 'of',
 'you',
 'out',
 'there',
 'who',
 'think',
 'babylon',
 '5',
 'be',
 'good',
 'sci',
 'fi',
 'tv',
 ' ',
 'it',
 's',
 'not',
 ' ',
 'it',
 's',
 'clichéd',