In [1]:
with open('../FILIMDB/train.texts', 'r', encoding='utf-8') as f:
    texts = f.read().split('\n')
    
with open('../FILIMDB/train.labels', 'r', encoding='utf-8') as f:
    labels = f.read().split('\n')

In [2]:
len(texts), len(labels)

(15001, 15001)

In [3]:
train_set = list(zip(texts, labels))

In [4]:
pos = list(filter(lambda x: x[1] == 'pos', train_set))

In [5]:
neg = list(filter(lambda x: x[1] == 'neg', train_set))

In [6]:
#min, max, mean, median
from statistics import mean, median

def calc_stats(texts_and_labels):
    len_s = [len(text) for text, label in texts_and_labels]
    return min(len_s), max(len_s), mean(len_s), median(len_s)

In [8]:
print('min={0}, max={1}, mean={2}, median={3}'.format(*calc_stats(pos)))

min=70, max=10363, mean=1360.8021276595746, median=996.5


In [10]:
print('min={0}, max={1}, mean={2}, median={3}'.format(*calc_stats(neg)))

min=52, max=8969, mean=1316.3576203208556, median=981.0


In [11]:
import re

In [12]:
def tokenize(text):
    except_list = ".?!"
    p = re.compile(fr"[^\w'+{except_list}+']")
    
    tokens = p.split(text.lower())
    return list(filter(None, tokens))

In [13]:
pos_tokenized = [tokenize(text) for text, label in pos]
neg_tokenized = [tokenize(text) for text, label in neg]

In [14]:
from collections import Counter

In [15]:
import itertools

pos_tokenized = list(itertools.chain.from_iterable(pos_tokenized))
neg_tokenized = list(itertools.chain.from_iterable(neg_tokenized))

In [16]:
pos_counter = Counter(pos_tokenized)
neg_counter = Counter(neg_tokenized)

In [17]:
pos_counter.keys() & neg_counter.keys()

{'cute.',
 'mall.',
 "bear's",
 'mail.',
 'basketball',
 'laughs',
 'film!!',
 'insp.',
 'been',
 'sumptuous',
 'solomon',
 '2002',
 'bugged',
 'perverted.',
 'cillian',
 'aborigines',
 'conniving',
 'gun!',
 'hart',
 'diplomacy',
 'claire',
 'pour',
 'vivian',
 'dim.',
 'flinty',
 'policies',
 'rochester',
 'concorde',
 'ph.d',
 'incapable',
 'laughable',
 'quota',
 'boil',
 'heirs',
 'negligent',
 'boo',
 'manager',
 'fortunate',
 'delinquent',
 'lina',
 'executives',
 'risk',
 'martinez',
 'longing',
 'duplicity',
 "lindsey's",
 'deconstruct',
 'cannonball',
 'accomplishment.',
 'walsh',
 'of...its',
 'robbie',
 'bollywood',
 'thunder',
 "armstrong's",
 'coordinators',
 'relentless.',
 'columbian',
 'betrays',
 'chong',
 'illiterate',
 'linguist',
 'orchestrating',
 'cruises',
 'rats.',
 'inclusive',
 'pranks.',
 'archives',
 'links',
 'proclamations',
 "teenager's",
 'stout',
 'imprison',
 'idol.',
 '112',
 'drilled',
 'twilight',
 'intruder',
 'zizek',
 'dennehy',
 'title.',
 "we'

In [18]:
import math

common_words = pos_counter.keys() | neg_counter.keys()

pos_n = sum(pos_counter.values())
neg_n = sum(neg_counter.values())

word_to_nb_weight = dict()

for word in common_words:
    p_w_pos = pos_counter[word] / pos_n
    p_w_neg = neg_counter[word] / neg_n
    
    #print(word)
    nb_weight = math.log((p_w_pos + 1e-7) / (p_w_neg + 1e-7) )
    word_to_nb_weight[word] = nb_weight

In [19]:
#pos_counter.keys() - neg_counter.keys()

In [20]:
weights = [(word, nb_weight, pos_counter[word], neg_counter[word]) for word, nb_weight in word_to_nb_weight.items()]

In [21]:
sorted(weights, key=lambda x: x[1])[:30]

[('thunderbirds', -5.3178282602560705, 0, 36),
 ('beowulf', -5.136486901123298, 0, 30),
 ('atrocious.', -5.1027880290634835, 0, 29),
 ('unwatchable.', -5.1027880290634835, 0, 29),
 ('ajay', -5.1027880290634835, 0, 29),
 ('deathstalker', -5.031779317208724, 0, 27),
 ('dahmer', -4.994290003346008, 0, 26),
 ('kareena', -4.955340314690489, 0, 25),
 ('welch', -4.914811847175403, 0, 24),
 ('thinking?', -4.914811847175403, 0, 24),
 ('turgid', -4.914811847175403, 0, 24),
 ('ripley', -4.872571186157663, 0, 23),
 ('sarne', -4.872571186157663, 0, 23),
 ('grendel', -4.872571186157663, 0, 23),
 ('stinker.', -4.872571186157663, 0, 23),
 ('varma', -4.782328062512033, 0, 21),
 ('palermo', -4.782328062512033, 0, 21),
 ('slater', -4.782328062512033, 0, 21),
 ('kinski', -4.782328062512033, 0, 21),
 ('kibbutz', -4.733956634425011, 0, 20),
 ('flop.', -4.733956634425011, 0, 20),
 ('maddy', -4.733956634425011, 0, 20),
 ('steaming', -4.6831259496970326, 0, 19),
 ('orca', -4.6831259496970326, 0, 19),
 ('dreck.

In [22]:
sorted(weights, key=lambda x: x[1])[::-1][:30]

[('edie', 5.992289209086858, 73, 0),
 ('antwone', 5.936081070121319, 69, 0),
 ('gundam', 5.813194909289788, 61, 0),
 ('paulie', 5.745581965978395, 57, 0),
 ('mildred', 5.634732362928491, 51, 0),
 ('corbett', 5.392767901353049, 40, 0),
 ('flavia', 5.367566736916988, 39, 0),
 ('deathtrap', 5.259886206729456, 35, 0),
 ('biko', 5.170761044682313, 32, 0),
 ('trier', 5.139195563268109, 31, 0),
 ('ossessione', 5.106601134812726, 30, 0),
 ('din', 5.106601134812726, 30, 0),
 ('yokai', 5.106601134812726, 30, 0),
 ('gunga', 5.072908410767607, 29, 0),
 ('creasy', 5.038040786464192, 28, 0),
 ('daisies', 5.001913352114944, 27, 0),
 ('mclaglen', 5.001913352114944, 27, 0),
 ('treat.', 5.001913352114944, 27, 0),
 ('brashear', 4.964431647119351, 26, 0),
 ('gino', 4.964431647119351, 26, 0),
 ('visconti', 4.964431647119351, 26, 0),
 ('tsui', 4.964431647119351, 26, 0),
 ('ultimatum', 4.925490171686578, 25, 0),
 ('offside', 4.88497059673164, 24, 0),
 ("victoria's", 4.88497059673164, 24, 0),
 ('rea', 4.88497

In [24]:
import pandas as pd

In [32]:
min_df = pd.DataFrame(sorted(weights, key=lambda x: x[1])[:30], columns=['Word', 'Naive Bayes Weight', 'Count Positive', 'Count Negative'])
min_df.head()

Unnamed: 0,Word,Naive Bayes Weight,Count Positive,Count Negative
0,thunderbirds,-5.317828,0,36
1,beowulf,-5.136487,0,30
2,atrocious.,-5.102788,0,29
3,unwatchable.,-5.102788,0,29
4,ajay,-5.102788,0,29


In [33]:
import dataframe_image as dfi

In [34]:
dfi.export(min_df.style, 'min30.png')

In [36]:
max_df = pd.DataFrame(sorted(weights, key=lambda x: x[1])[::-1][:30], columns=['Word', 'Naive Bayes Weight', 'Count Positive', 'Count Negative'])
max_df.head()

Unnamed: 0,Word,Naive Bayes Weight,Count Positive,Count Negative
0,edie,5.992289,73,0
1,antwone,5.936081,69,0
2,gundam,5.813195,61,0
3,paulie,5.745582,57,0
4,mildred,5.634732,51,0


In [37]:
dfi.export(max_df.style, 'max30.png')

In [42]:
nb_row = {'Model': 'NB', 'Training time': '3.58s', 'Predicting on test time': '798.84s', 'train acc': 95.10, 'dev acc': 85.64, 'dev-b acc': 73.75}
mnb_row = {'Model': 'MNB', 'Training time': '2.35s', 'Predicting on test time': '938.03s', 'train acc': 93.17, 'dev acc': 84.63, 'dev-b acc': 73.00}

pd.DataFrame([nb_row, mnb_row])

Unnamed: 0,Model,Training time,Predicting on test time,train acc,dev acc,dev-b acc
0,NB,3.58s,798.84s,95.1,85.64,73.75
1,MNB,2.35s,938.03s,93.17,84.63,73.0


In [167]:
sorted(word_to_nb_weight.items(), key=lambda x: x[1])[:30]   #, key=lambda x: x[1])

[('thunderbirds', -5.3178282602560705),
 ('beowulf', -5.136486901123298),
 ('ajay', -5.1027880290634835),
 ('atrocious.', -5.1027880290634835),
 ('unwatchable.', -5.1027880290634835),
 ('deathstalker', -5.031779317208724),
 ('dahmer', -4.994290003346008),
 ('kareena', -4.955340314690489),
 ('turgid', -4.914811847175403),
 ('welch', -4.914811847175403),
 ('thinking?', -4.914811847175403),
 ('sarne', -4.872571186157663),
 ('stinker.', -4.872571186157663),
 ('grendel', -4.872571186157663),
 ('ripley', -4.872571186157663),
 ('slater', -4.782328062512033),
 ('varma', -4.782328062512033),
 ('kinski', -4.782328062512033),
 ('palermo', -4.782328062512033),
 ('flop.', -4.733956634425011),
 ('kibbutz', -4.733956634425011),
 ('maddy', -4.733956634425011),
 ('dreck.', -4.6831259496970326),
 ('orca', -4.6831259496970326),
 ('steaming', -4.6831259496970326),
 ('start?', -4.6831259496970326),
 ('hackenstein', -4.62957248828758),
 ('manos', -4.62957248828758),
 ('hobgoblins', -4.62957248828758),
 ('vo

In [145]:
sorted(word_to_nb_weight.items(), key=lambda x: x[1])[-30:]   #, key=lambda x: x[1])

[('alvin', 3.037987769146273),
 ('lin', 3.0609772873709717),
 ('dev', 3.0609772873709717),
 ("sinatra's", 3.0609772873709717),
 ('sho', 3.0609772873709717),
 ("chan's", 3.1054290499418054),
 ('lindy', 3.1054290499418054),
 ('luzhin', 3.1479886643606014),
 ('cheung', 3.1479886643606014),
 ('polanski', 3.168607951563337),
 ('giovanna', 3.1888106588808562),
 ('vertigo', 3.1888106588808562),
 ('haines', 3.1888106588808562),
 ('stardust', 3.228031372034138),
 ('evans', 3.228031372034138),
 ('bourne', 3.228031372034138),
 ('anchors', 3.265771700016985),
 ('delightfully', 3.3021393441878595),
 ('anton', 3.33723066399913),
 ('chavez', 3.371132215674811),
 ("kelly's", 3.371132215674811),
 ('philo', 3.403922038497802),
 ('clara', 3.403922038497802),
 ('pixar', 3.435670736812382),
 ("tony's", 3.435670736812382),
 ('excellently', 3.435670736812382),
 ('mathieu', 3.633496480142302),
 ('adele', 3.633496480142302),
 ('kolchak', 3.6835069007169636),
 ('sox', 3.95891888057693)]

In [125]:
pos_counter["god's"], neg_counter["god's"]

(25, 39)

In [90]:
import tqdm

In [91]:
from tqdm import notebook

In [93]:
for text, label in notebook.tqdm(pos):
    tokenized_text = tokenize(text)

  0%|          | 0/7480 [00:00<?, ?it/s]

In [82]:
except_list = ".?!"
p = re.compile(fr"[^\w'+{except_list}+']")

In [83]:
#neg

In [84]:
pos[101][0]

'I thought Rachel York was fantastic as "Lucy." I have seen her in "Kiss Me, Kate" and "Victor/Victoria," as well, and in each of these performances she has developed very different, and very real, characterizations. She is a chameleon who can play (and sing) anything!<br /><br />I am very surprised at how many negative reviews appear here regarding Rachel\'s performance in "Lucy." Even some bonafide TV and entertainment critics seem to have missed the point of her portrayal. So many people have focused on the fact that Rachel doesn\'t really look like Lucy. My response to that is, "So what?" I wasn\'t looking for a superficial impersonation of Lucy. I wanted to know more about the real woman behind the clown. And Rachel certainly gave us that, in great depth. I also didn\'t want to see someone simply "doing" classic Lucy routines. Therefore I was very pleased with the decision by the producers and director to have Rachel portray Lucy in rehearsal for the most memorable of these skits 

In [87]:
list(filter(None, p.split(pos[101][0].lower())))

['i',
 'thought',
 'rachel',
 'york',
 'was',
 'fantastic',
 'as',
 'lucy.',
 'i',
 'have',
 'seen',
 'her',
 'in',
 'kiss',
 'me',
 'kate',
 'and',
 'victor',
 'victoria',
 'as',
 'well',
 'and',
 'in',
 'each',
 'of',
 'these',
 'performances',
 'she',
 'has',
 'developed',
 'very',
 'different',
 'and',
 'very',
 'real',
 'characterizations.',
 'she',
 'is',
 'a',
 'chameleon',
 'who',
 'can',
 'play',
 'and',
 'sing',
 'anything!',
 'br',
 'br',
 'i',
 'am',
 'very',
 'surprised',
 'at',
 'how',
 'many',
 'negative',
 'reviews',
 'appear',
 'here',
 'regarding',
 "rachel's",
 'performance',
 'in',
 'lucy.',
 'even',
 'some',
 'bonafide',
 'tv',
 'and',
 'entertainment',
 'critics',
 'seem',
 'to',
 'have',
 'missed',
 'the',
 'point',
 'of',
 'her',
 'portrayal.',
 'so',
 'many',
 'people',
 'have',
 'focused',
 'on',
 'the',
 'fact',
 'that',
 'rachel',
 "doesn't",
 'really',
 'look',
 'like',
 'lucy.',
 'my',
 'response',
 'to',
 'that',
 'is',
 'so',
 'what?',
 'i',
 "wasn't",
 

In [None]:
re.compile(r"/<>(){}")