In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

In [3]:
df.head(10)

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
5,I would put this at the top of my list of film...,0
6,Whoever wrote the screenplay for this movie ob...,0
7,"When I first saw a glimpse of this movie, I qu...",0
8,"Who are these ""They""- the actors? the filmmake...",0
9,This is said to be a personal film for Peter B...,0


In [4]:
df['label'].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [5]:
X = df.text
y = df.label
print(X.shape)
print(y.shape)

(25000,)
(25000,)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(20000,)
(5000,)
(20000,)
(5000,)


In [8]:
vect = CountVectorizer()

In [9]:
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [10]:
X_train_dtm = vect.fit_transform(X_train)

In [11]:
X_train_dtm

<20000x68268 sparse matrix of type '<class 'numpy.int64'>'
	with 2752639 stored elements in Compressed Sparse Row format>

In [12]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<5000x68268 sparse matrix of type '<class 'numpy.int64'>'
	with 686126 stored elements in Compressed Sparse Row format>

In [13]:
nb = MultinomialNB()

In [14]:
%time nb.fit(X_train_dtm, y_train)

CPU times: total: 15.6 ms
Wall time: 15 ms


In [15]:
y_pred_class = nb.predict(X_test_dtm)

In [16]:
metrics.accuracy_score(y_test, y_pred_class)

0.846

In [17]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[2202,  313],
       [ 457, 2028]], dtype=int64)

In [18]:
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob[:10]

array([1.49172620e-05, 1.00000000e+00, 1.33330375e-10, 9.99999817e-01,
       1.00000000e+00, 3.74762189e-07, 2.07993798e-37, 1.00000000e+00,
       3.17273536e-09, 9.72775501e-02])

In [19]:
metrics.roc_auc_score(y_test, y_pred_prob)

0.9207510670384134

In [20]:
import torch
import torch.nn as nn
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import torch.nn.functional as F
import random

In [21]:
data = df['text'].tolist()
data[:10]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [22]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [25]:
max_epochs = 5
vec_size = 20
alpha = 0.025

dv = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)

dv.build_vocab(tagged_data)

In [26]:
for epoch in range(max_epochs):
    print("iteration {0}".format(epoch))
    dv.train(tagged_data, total_examples=dv.corpus_count, epochs=dv.epochs)
    dv.alpha -= 0.0002
    dv.min_alpha = dv.alpha

dv.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
Model Saved


In [28]:
y_pred_class_dv = dv.predict(X_test_dtm)

AttributeError: 'Doc2Vec' object has no attribute 'predict'

In [22]:
df1 = df[df['label'] == 0]    
df2 = df[df['label'] == 1] 

In [23]:
len(df1)

12500

In [24]:
len(df2)

12500

In [25]:
df1 = df1[:2000]
df2 = df2[:2000]

In [26]:
df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [27]:
df['label'].value_counts()

label
0    2000
1    2000
Name: count, dtype: int64

In [28]:
len(df)

4000

In [29]:
data = df['text'].tolist()
data[:5]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [30]:
s = pd.Series(data)
s[:5]

0    I rented I AM CURIOUS-YELLOW from my video sto...
1    "I Am Curious: Yellow" is a risible and preten...
2    If only to avoid making this type of film in t...
3    This film was probably inspired by Godard's Ma...
4    Oh, brother...after hearing about this ridicul...
dtype: object

In [31]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [32]:
s_upd = s.apply(preprocess)
s_upd[:5]

0    rented curiousyellow video store controversy s...
1    curious yellow risible pretentious steaming pi...
2    avoid making type film future film interesting...
3    film probably inspired godards masculin fémini...
4    oh brotherafter hearing ridiculous film umptee...
dtype: object

In [33]:
mylist = s_upd.tolist()
mylist[2]

'avoid making type film future film interesting experiment tells cogent storybr br one might feel virtuous sitting thru touches many important issues without discernable motive viewer comes away new perspectives unless one comes one ones mind wanders invariably pointless filmbr br one might better spend ones time staring window tree growingbr br'

In [34]:
data_0 = [[el] for el in mylist]
data_0[:5]

[['rented curiousyellow video store controversy surrounded first released 1967 also heard first seized us customs ever tried enter country therefore fan films considered controversial really see myselfbr br plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married menbr br kills curiousyellow 40 years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes filmsbr br commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curiousyellow good film anyone wanting stud

In [35]:
len(data_0)

4000

In [36]:
newlist = []
for i in mylist:
    i = i.split()
    newlist.append(i)

newlist[:5]

[['rented',
  'curiousyellow',
  'video',
  'store',
  'controversy',
  'surrounded',
  'first',
  'released',
  '1967',
  'also',
  'heard',
  'first',
  'seized',
  'us',
  'customs',
  'ever',
  'tried',
  'enter',
  'country',
  'therefore',
  'fan',
  'films',
  'considered',
  'controversial',
  'really',
  'see',
  'myselfbr',
  'br',
  'plot',
  'centered',
  'around',
  'young',
  'swedish',
  'drama',
  'student',
  'named',
  'lena',
  'wants',
  'learn',
  'everything',
  'life',
  'particular',
  'wants',
  'focus',
  'attentions',
  'making',
  'sort',
  'documentary',
  'average',
  'swede',
  'thought',
  'certain',
  'political',
  'issues',
  'vietnam',
  'war',
  'race',
  'issues',
  'united',
  'states',
  'asking',
  'politicians',
  'ordinary',
  'denizens',
  'stockholm',
  'opinions',
  'politics',
  'sex',
  'drama',
  'teacher',
  'classmates',
  'married',
  'menbr',
  'br',
  'kills',
  'curiousyellow',
  '40',
  'years',
  'ago',
  'considered',
  'pornogr

In [37]:
test_lst_01 = []
for i in newlist:
    for k in i:
        test_lst_01.append(k)

# test_lst[152]

test_lst_01[:10]


['rented',
 'curiousyellow',
 'video',
 'store',
 'controversy',
 'surrounded',
 'first',
 'released',
 '1967',
 'also']

In [38]:
len(test_lst_01)

496978

In [39]:
count = Counter(test_lst_01)
count

Counter({'br': 9209,
         'movie': 6546,
         'film': 5737,
         'one': 4113,
         'like': 3089,
         'good': 2336,
         'story': 1935,
         'even': 1903,
         'would': 1884,
         'really': 1812,
         'time': 1802,
         'see': 1750,
         'much': 1509,
         'well': 1498,
         'great': 1486,
         'get': 1448,
         'first': 1433,
         'also': 1428,
         'bad': 1364,
         'people': 1339,
         'dont': 1317,
         'made': 1283,
         'could': 1258,
         'movies': 1254,
         'films': 1241,
         'way': 1208,
         'character': 1171,
         'make': 1166,
         'think': 1142,
         'characters': 1121,
         'love': 1085,
         'watch': 1076,
         'seen': 1071,
         'many': 1069,
         'two': 1051,
         'best': 1030,
         'life': 1028,
         'never': 1003,
         'show': 983,
         'little': 974,
         'acting': 937,
         'plot': 934,
         'ever'

In [40]:
mydict = dict(count)
mydict

{'rented': 62,
 'curiousyellow': 3,
 'video': 233,
 'store': 79,
 'controversy': 10,
 'surrounded': 26,
 'first': 1433,
 'released': 149,
 '1967': 9,
 'also': 1428,
 'heard': 163,
 'seized': 3,
 'us': 651,
 'customs': 8,
 'ever': 919,
 'tried': 119,
 'enter': 28,
 'country': 126,
 'therefore': 50,
 'fan': 288,
 'films': 1241,
 'considered': 74,
 'controversial': 24,
 'really': 1812,
 'see': 1750,
 'myselfbr': 1,
 'br': 9209,
 'plot': 934,
 'centered': 14,
 'around': 516,
 'young': 626,
 'swedish': 26,
 'drama': 238,
 'student': 65,
 'named': 126,
 'lena': 54,
 'wants': 193,
 'learn': 119,
 'everything': 358,
 'life': 1028,
 'particular': 124,
 'focus': 79,
 'attentions': 7,
 'making': 421,
 'sort': 233,
 'documentary': 137,
 'average': 121,
 'swede': 1,
 'thought': 514,
 'certain': 119,
 'political': 113,
 'issues': 58,
 'vietnam': 12,
 'war': 283,
 'race': 72,
 'united': 23,
 'states': 45,
 'asking': 32,
 'politicians': 8,
 'ordinary': 38,
 'denizens': 3,
 'stockholm': 4,
 'opinions':

In [41]:
count['br']

9209

In [42]:
sort_dict = sorted(mydict.items(), key=lambda item: item[1], reverse=True)
sort_dict

[('br', 9209),
 ('movie', 6546),
 ('film', 5737),
 ('one', 4113),
 ('like', 3089),
 ('good', 2336),
 ('story', 1935),
 ('even', 1903),
 ('would', 1884),
 ('really', 1812),
 ('time', 1802),
 ('see', 1750),
 ('much', 1509),
 ('well', 1498),
 ('great', 1486),
 ('get', 1448),
 ('first', 1433),
 ('also', 1428),
 ('bad', 1364),
 ('people', 1339),
 ('dont', 1317),
 ('made', 1283),
 ('could', 1258),
 ('movies', 1254),
 ('films', 1241),
 ('way', 1208),
 ('character', 1171),
 ('make', 1166),
 ('think', 1142),
 ('characters', 1121),
 ('love', 1085),
 ('watch', 1076),
 ('seen', 1071),
 ('many', 1069),
 ('two', 1051),
 ('best', 1030),
 ('life', 1028),
 ('never', 1003),
 ('show', 983),
 ('little', 974),
 ('acting', 937),
 ('plot', 934),
 ('ever', 919),
 ('know', 918),
 ('end', 860),
 ('man', 858),
 ('scene', 848),
 ('better', 828),
 ('still', 816),
 ('say', 780),
 ('real', 774),
 ('scenes', 770),
 ('go', 754),
 ('something', 752),
 ('years', 750),
 ('back', 742),
 ('actors', 740),
 ('new', 719),
 ('

In [43]:
top_words = []
for i in sort_dict:
    top_words.append(i[0])

top_words = top_words[:4000]
top_words

['br',
 'movie',
 'film',
 'one',
 'like',
 'good',
 'story',
 'even',
 'would',
 'really',
 'time',
 'see',
 'much',
 'well',
 'great',
 'get',
 'first',
 'also',
 'bad',
 'people',
 'dont',
 'made',
 'could',
 'movies',
 'films',
 'way',
 'character',
 'make',
 'think',
 'characters',
 'love',
 'watch',
 'seen',
 'many',
 'two',
 'best',
 'life',
 'never',
 'show',
 'little',
 'acting',
 'plot',
 'ever',
 'know',
 'end',
 'man',
 'scene',
 'better',
 'still',
 'say',
 'real',
 'scenes',
 'go',
 'something',
 'years',
 'back',
 'actors',
 'new',
 'another',
 'watching',
 'funny',
 'im',
 'didnt',
 'old',
 'thing',
 'every',
 'doesnt',
 'nothing',
 'makes',
 'work',
 'find',
 'us',
 'though',
 'director',
 'young',
 'cast',
 'got',
 'lot',
 'part',
 'actually',
 'going',
 'quite',
 'cant',
 'look',
 'want',
 'fact',
 'thats',
 'seems',
 'however',
 'big',
 'take',
 'long',
 'give',
 'comedy',
 'always',
 'things',
 'series',
 'enough',
 'role',
 'bit',
 'world',
 'pretty',
 'horror',
 

In [44]:
myset = set(top_words)
myset

{'paper',
 'dying',
 'fail',
 'girl',
 '2',
 'band',
 'current',
 'plots',
 'documentaries',
 'quickly',
 'torture',
 'thinking',
 'sets',
 'satisfying',
 'hundreds',
 'love',
 'especially',
 'total',
 'viewing',
 'sick',
 'remarkably',
 'taken',
 'talent',
 'murder',
 'subtitles',
 'fitting',
 'evil',
 'inspired',
 'union',
 'fits',
 'daily',
 'produced',
 'title',
 'genuinely',
 'cinema',
 'remarkable',
 'sweet',
 'lady',
 'telling',
 'han',
 'joy',
 'attached',
 'section',
 'boredom',
 'lessons',
 'mistake',
 'montana',
 'johnny',
 'utter',
 'wonderful',
 'praise',
 'outcome',
 'mob',
 'thrills',
 'insomnia',
 'villains',
 'likes',
 'profound',
 'funeral',
 'finally',
 'albert',
 'bound',
 'suffering',
 'empire',
 'fears',
 'friendship',
 'farce',
 'eventually',
 'ignored',
 'result',
 'refuses',
 'theyve',
 'slave',
 'spy',
 'interactions',
 'arrive',
 'believe',
 'bar',
 'regular',
 'worth',
 'gary',
 'yokai',
 'raj',
 'pleasure',
 'witty',
 'class',
 'rest',
 'implausible',
 'gen

In [45]:
len(myset)

4000

In [46]:
targ_lst = list(myset)
targ_lst

['paper',
 'dying',
 'fail',
 'girl',
 '2',
 'band',
 'current',
 'plots',
 'documentaries',
 'quickly',
 'torture',
 'thinking',
 'sets',
 'satisfying',
 'hundreds',
 'love',
 'especially',
 'total',
 'viewing',
 'sick',
 'remarkably',
 'taken',
 'talent',
 'murder',
 'subtitles',
 'fitting',
 'evil',
 'inspired',
 'union',
 'fits',
 'daily',
 'produced',
 'title',
 'genuinely',
 'cinema',
 'remarkable',
 'sweet',
 'lady',
 'telling',
 'han',
 'joy',
 'attached',
 'section',
 'boredom',
 'lessons',
 'mistake',
 'montana',
 'johnny',
 'utter',
 'wonderful',
 'praise',
 'outcome',
 'mob',
 'thrills',
 'insomnia',
 'villains',
 'likes',
 'profound',
 'funeral',
 'finally',
 'albert',
 'bound',
 'suffering',
 'empire',
 'fears',
 'friendship',
 'farce',
 'eventually',
 'ignored',
 'result',
 'refuses',
 'theyve',
 'slave',
 'spy',
 'interactions',
 'arrive',
 'believe',
 'bar',
 'regular',
 'worth',
 'gary',
 'yokai',
 'raj',
 'pleasure',
 'witty',
 'class',
 'rest',
 'implausible',
 'gen

In [40]:
# test_lst_02 = list(myset)
# test_lst_02[:10]

['valentinocharms',
 'spoilerbr',
 'symbolise',
 'afterall',
 'performancehe',
 'objectified',
 'deviated',
 'antifascism',
 'disappointedas',
 'barroom']

In [41]:
# len(test_lst_02)

71864

In [42]:
# targ_lst = test_lst_02[:10000]
# len(targ_lst)

10000

In [38]:
# add_lst = []
# for i in newlist:
#     add_lst.append(random.choice(i))

# add_lst[:5]

['shock', 'insides', 'motive', 'subject', 'lots']

In [47]:
c = 0
while c < len(targ_lst):
    for i in data_0:
        i.append(targ_lst[c])
        c += 1
     
data_0[:5]

[['rented curiousyellow video store controversy surrounded first released 1967 also heard first seized us customs ever tried enter country therefore fan films considered controversial really see myselfbr br plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married menbr br kills curiousyellow 40 years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes filmsbr br commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curiousyellow good film anyone wanting stud

In [62]:
# c = 0
# while c < len(add_lst):
#     for i in data_0:
#         i.append(add_lst[c])
#         c += 1
     
# data_0[:5]

In [48]:
res_lst = []
for l in data_0:
    res_lst.append(tuple(l))

res_lst[:5]

[('rented curiousyellow video store controversy surrounded first released 1967 also heard first seized us customs ever tried enter country therefore fan films considered controversial really see myselfbr br plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married menbr br kills curiousyellow 40 years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes filmsbr br commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curiousyellow good film anyone wanting stud

In [55]:
# new_df = pd.DataFrame(res_lst, columns=['text', 'target'])
# new_df.head()

In [59]:
# new_df['text'][2].split()

In [60]:
# test_df = pd.DataFrame(columns=['target'])
# test_df

In [49]:

# c = 0
targ_upd_arr = []
# while c < len(data_0):
for i in data_0:
    if i[1] in i[0].split():
        value = 1
        # targ_upd_arr.append(list(value))
        # test_df['target'][c] = 1
    else:
        # data_0[c][1] not in data_0[c][0].split()
        value = 0
        # test_df['target'][c] = 0
    targ_upd_arr.append([value])
        # c += 1
        #     else
        # for k in new_df['text']:
        #     for j in k.split():
        #         if i == j:
        #             test_df['target'][c] = 1
        #         else:
        #             test_df['target'][c] = 0
        #             # targ_upd_lst.append(i)
        #             c += 1

targ_upd_arr[:5]

[[0], [0], [0], [0], [0]]

In [50]:
len(targ_upd_arr)

4000

In [51]:
my_arr = np.array(targ_upd_arr)
my_arr[:10]

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [52]:
test_df = pd.DataFrame(my_arr, columns=['target'])
test_df

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
3995,0
3996,0
3997,0
3998,0


In [53]:
test_df['target'].value_counts()

target
0    3916
1      84
Name: count, dtype: int64

In [56]:
new_df = pd.DataFrame(res_lst, columns=['text', 'target'])
new_df.head()

Unnamed: 0,text,target
0,rented curiousyellow video store controversy s...,paper
1,curious yellow risible pretentious steaming pi...,dying
2,avoid making type film future film interesting...,fail
3,film probably inspired godards masculin fémini...,girl
4,oh brotherafter hearing ridiculous film umptee...,2


In [57]:
new_df['target'] = test_df['target']
new_df.head()

Unnamed: 0,text,target
0,rented curiousyellow video store controversy s...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godards masculin fémini...,0
4,oh brotherafter hearing ridiculous film umptee...,0


In [58]:
new_df['target'].value_counts()

target
0    3916
1      84
Name: count, dtype: int64

In [54]:
# c = 0
# # targ_upd_lst = []
# while c < len(new_df['target']):
#     for i in new_df['target']:
#         for k in new_df['text']:
#             for j in k.split():
#                 if i == j:
#                     test_df['target'][c] = 1
#                 else:
#                     test_df['target'][c] = 0
#                     # targ_upd_lst.append(i)
#                     c += 1

# # targ_upd_lst[:5]

In [61]:
vocab = set()
for context, target in res_lst:
    vocab.update(context.split())
    vocab.update([target])
word_to_ix = {word: i for i, word in enumerate(vocab)}

items = list(word_to_ix.items())
items[:5]

[('padget', 0),
 ('rim', 1),
 ('seventiesthe', 2),
 ('underused', 3),
 ('villainous', 4)]

In [63]:
training_data = [(context.split(), target) for context, target in res_lst]
training_data[:3]

[(['rented',
   'curiousyellow',
   'video',
   'store',
   'controversy',
   'surrounded',
   'first',
   'released',
   '1967',
   'also',
   'heard',
   'first',
   'seized',
   'us',
   'customs',
   'ever',
   'tried',
   'enter',
   'country',
   'therefore',
   'fan',
   'films',
   'considered',
   'controversial',
   'really',
   'see',
   'myselfbr',
   'br',
   'plot',
   'centered',
   'around',
   'young',
   'swedish',
   'drama',
   'student',
   'named',
   'lena',
   'wants',
   'learn',
   'everything',
   'life',
   'particular',
   'wants',
   'focus',
   'attentions',
   'making',
   'sort',
   'documentary',
   'average',
   'swede',
   'thought',
   'certain',
   'political',
   'issues',
   'vietnam',
   'war',
   'race',
   'issues',
   'united',
   'states',
   'asking',
   'politicians',
   'ordinary',
   'denizens',
   'stockholm',
   'opinions',
   'politics',
   'sex',
   'drama',
   'teacher',
   'classmates',
   'married',
   'menbr',
   'br',
   'kills'

In [64]:
vocab_size = len(vocab)
embedding_dim = 7

In [65]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size) 
        self.context = nn.Linear(embed_size, vocab_size) 
        
    def forward(self, inputs):                   
        embeds = self.embedding(inputs)          
        embeds_mean = torch.mean(embeds, dim=0)  
        out = self.context(embeds_mean)          
        log_probs = F.log_softmax(out, dim=0)    
        return log_probs

In [66]:
model = CBOW(len(vocab), embedding_dim)
word_emb = model.embedding(torch.tensor(word_to_ix["paper"], dtype=torch.long))

print(word_emb.shape)

print(word_emb)
print(word_emb.mean())

x_1d = word_emb.unsqueeze(0).unsqueeze(0)
print(x_1d.shape)

cnn1d_1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2, bias=False)
print(cnn1d_1.weight)
print(cnn1d_1.bias)

y1 = cnn1d_1(x_1d)
print(y1)

torch.Size([7])
tensor([ 0.2207, -0.0586, -0.9471, -0.7424,  0.7032, -0.2656, -1.1339],
       grad_fn=<EmbeddingBackward0>)
tensor(-0.3177, grad_fn=<MeanBackward0>)
torch.Size([1, 1, 7])
Parameter containing:
tensor([[[-0.1419, -0.5872]]], requires_grad=True)
None
tensor([[[ 0.0031,  0.5644,  0.5703, -0.3075,  0.0562,  0.7034]]],
       grad_fn=<ConvolutionBackward0>)


In [67]:
LEARNING_RATE = 0.09
EPOCHS = 5

torch.manual_seed(42)
model = CBOW(len(vocab), embedding_dim)

loss_function = nn.NLLLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in training_data:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)
        
        model.zero_grad()
        
        log_probs = model(context_idxs)
        
        loss = loss_function(log_probs.view(1, -1), target_idx)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss}')

Epoch 1, Loss: 61760.08382034302
Epoch 2, Loss: 51966.8141169548
Epoch 3, Loss: 43908.714472293854
Epoch 4, Loss: 46930.76711463928
Epoch 5, Loss: 46402.40115451813


In [47]:
new_data = pd.DataFrame(s_upd, columns=['text'])
new_data[:5]

Unnamed: 0,text
0,rented curiousyellow video store controversy s...
1,curious yellow risible pretentious steaming pi...
2,avoid making type film future film interesting...
3,film probably inspired godards masculin fémini...
4,oh brotherafter hearing ridiculous film umptee...


In [48]:
new_data['label'] = df['label']
new_data[:5]

Unnamed: 0,text,label
0,rented curiousyellow video store controversy s...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godards masculin fémini...,0
4,oh brotherafter hearing ridiculous film umptee...,0


In [49]:
len(new_data)

4000

In [50]:
train = new_data.text
test = new_data.label
print(train.shape)
print(test.shape)

(4000,)
(4000,)


In [51]:
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3200,)
(800,)
(3200,)
(800,)


In [52]:
new_vect = CountVectorizer()

In [53]:
new_vect.fit(X_train)
X_train_dtm_upd = new_vect.transform(X_train)

In [54]:
X_train_dtm_upd = new_vect.fit_transform(X_train)

In [55]:
X_train_dtm_upd

<3200x37705 sparse matrix of type '<class 'numpy.int64'>'
	with 323525 stored elements in Compressed Sparse Row format>

In [56]:
X_test_dtm_upd = new_vect.transform(X_test)
X_test_dtm_upd

<800x37705 sparse matrix of type '<class 'numpy.int64'>'
	with 76098 stored elements in Compressed Sparse Row format>

In [57]:
new_nb = MultinomialNB()

In [58]:
%time new_nb.fit(X_train_dtm_upd, y_train)

CPU times: total: 0 ns
Wall time: 4 ms


In [59]:
y_pred_class_new = new_nb.predict(X_test_dtm_upd)

In [60]:
metrics.accuracy_score(y_test, y_pred_class_new)

0.88125

In [61]:
metrics.confusion_matrix(y_test, y_pred_class_new)

array([[379,  43],
       [ 52, 326]], dtype=int64)

In [62]:
y_pred_prob_new = new_nb.predict_proba(X_test_dtm_upd)[:, 1]
y_pred_prob_new[:10]

array([1.28659440e-09, 1.00000000e+00, 5.81676743e-10, 1.24655543e-02,
       1.00000000e+00, 7.12552883e-03, 4.44752989e-05, 1.65171215e-02,
       1.30677982e-03, 2.19626591e-08])

In [63]:
metrics.roc_auc_score(y_test, y_pred_prob_new)

0.9384042980014544