In [2]:
with open('../FILIMDB/train.texts', 'r', encoding='utf-8') as f:
    texts = f.read().split('\n')
    
with open('../FILIMDB/train.labels', 'r', encoding='utf-8') as f:
    labels = f.read().split('\n')

In [25]:
import itertools
import re
from collections import Counter


def tokenize(text):
    except_list = ".?!"
    p = re.compile(fr"[^\w'+{except_list}+']")

    tokens = p.split(text.lower())
    return list(filter(None, tokens))

    
def train(texts, labels):
    texts_tokenized = [tokenize(text) for text in texts]
    
    train_set = list(zip(texts_tokenized, labels))
    
    pos = [text for text, label in train_set if label == 'pos']
    neg = [text for text, label in train_set if label == 'neg']
    
    #pos = list(filter(lambda x: x[1] == 'pos', train_set))
    #neg = list(filter(lambda x: x[1] == 'neg', train_set))
    
    #pos = list(set(doc_words) for doc_words in pos)
    pos_words = list(itertools.chain.from_iterable(pos))
    neg_words = list(itertools.chain.from_iterable(neg))
    
    pos_words_counter = Counter(pos_words)
    neg_words_counter = Counter(neg_words)
    
    pos_words_overall_count = sum(pos_words_counter.values())
    neg_words_overall_count = sum(neg_words_counter.values())
    
    pos_docs_count = len(pos)
    neg_docs_count = len(neg)
    
    vocab_size = len(pos_words_counter.keys()) + len(neg_words_counter.keys())
    
    out_params = {'pos_words_counter': pos_words_counter,
                  'neg_words_counter': neg_words_counter,
                  'pos_words_overall_count': pos_words_overall_count,
                  'neg_words_overall_count': neg_words_overall_count,
                  'vocab_size': vocab_size,
                  'pos_docs_count': pos_docs_count,
                  'neg_docs_count': neg_docs_count}
    return out_params

In [26]:
%%time
out_params = train(texts, labels)

CPU times: total: 1.56 s
Wall time: 1.56 s


In [27]:
import math

In [29]:
def classify_single(text, params):
    text_tokenized = tokenize(text)
    
    pos_words_counter = params['pos_words_counter']
    neg_words_counter = params['neg_words_counter']
    pos_words_overall_count = params['pos_words_overall_count']
    neg_words_overall_count = params['neg_words_overall_count']
    pos_docs_count = params['pos_docs_count']
    neg_docs_count = params['neg_docs_count']
    
    vocab_size = params['vocab_size']
    
    vocab = pos_words_counter.keys() | neg_words_counter.keys()
    unique_text_words = set(text_tokenized)
    
    # calc pos-neg probability
    N = len(text_tokenized)
    log_fact_n = math.log(math.factorial(N))

    pos_probability = log_fact_n
    neg_probability = log_fact_n
    
    for word in unique_text_words:
        k_w = text_tokenized.count(word)
        log_fact_k_w = math.log(math.factorial(k_w))
        
        p_w_pos = (pos_words_counter[word] + 1) / (pos_words_overall_count + vocab_size)
        pos_probability += k_w * math.log(p_w_pos) - log_fact_k_w
            
        p_w_neg = (neg_words_counter[word] + 1) / (neg_words_overall_count + vocab_size)
        neg_probability += k_w * math.log(p_w_neg) - log_fact_k_w
        
    pos_probability += math.log(pos_docs_count / (pos_docs_count + neg_docs_count))
    neg_probability += math.log(neg_docs_count / (pos_docs_count + neg_docs_count))
    
    if pos_probability > neg_probability:
        return 'pos'
    else:
        return 'neg'

In [30]:
with open('../FILIMDB/dev.texts', 'r', encoding='utf-8') as f:
    dev_texts = f.read().split('\n')
    
with open('../FILIMDB/dev.labels', 'r', encoding='utf-8') as f:
    dev_labels = f.read().split('\n')

In [31]:
from tqdm import notebook

In [32]:
def classify(texts, params):
    pred_s = []
    for text in notebook.tqdm(texts):
        pred = classify_single(text, params)
        pred_s.append(pred)
    return pred_s
    #return [classify_single(text, params) for text in texts]

In [33]:
pred_s = classify(dev_texts, out_params)

  0%|          | 0/10001 [00:00<?, ?it/s]

In [34]:
from statistics import mean
mean([y_pred == y_true for y_pred, y_true in zip(pred_s, dev_labels) ])

0.8462153784621538

In [35]:
with open('../FILIMDB/dev-b.texts', 'r', encoding='utf-8') as f:
    dev_b_texts = f.read().split('\n')
    
with open('../FILIMDB/dev-b.labels', 'r', encoding='utf-8') as f:
    dev_b_labels = f.read().split('\n')

In [36]:
pred_s = classify(dev_b_texts, out_params)

  0%|          | 0/2001 [00:00<?, ?it/s]

In [37]:
mean([y_pred == y_true for y_pred, y_true in zip(pred_s, dev_b_labels) ])

0.7296351824087957

In [38]:
pred_s = classify(texts, out_params)

  0%|          | 0/15001 [00:00<?, ?it/s]

In [39]:
mean([y_pred == y_true for y_pred, y_true in zip(pred_s, labels) ])

0.9316712219185388

In [15]:
train_set = list(zip(texts, labels))

In [16]:
pos, neg = train(texts, labels)

In [17]:
pos

[['this',
  'needed',
  'i',
  'right',
  'movie',
  'go',
  'because',
  'god',
  'and',
  'enging!',
  'acting',
  'straight',
  'the',
  'but',
  'elements',
  'actors',
  'scary',
  "don't",
  'rent',
  'your',
  'story',
  'movie.',
  'to',
  'is',
  'be',
  'for',
  'moviestore',
  'thing',
  '10',
  'popcorn!',
  'anyone',
  'anything',
  'or',
  'now',
  'about',
  'it!',
  'rewarded',
  'a',
  'it',
  'its',
  'want',
  'will',
  'nearest',
  'ruin',
  'recommend',
  'that',
  'you',
  'tell',
  'forget',
  'my',
  'gave'],
 ['his',
  'an',
  'drama',
  'no',
  'movies',
  'bogart',
  'feel.',
  'careening',
  'washed',
  'enough',
  'iconic',
  'or',
  'highest',
  'first',
  'adversaries',
  'might',
  'head.',
  'films',
  'other',
  'dana',
  'session.',
  'wearing',
  'overall',
  'of',
  'as',
  'for',
  'prototypical',
  'memorable',
  'husband',
  'united',
  'watch',
  'its',
  'turning',
  'ups',
  'through.',
  'detective',
  'cardboard',
  'acting',
  'than',
  'ni