In [30]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import collections
import random
from tqdm import tqdm

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts, get_tmpfile
from gensim.utils import simple_preprocess
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import utils
from sklearn.model_selection import train_test_split

import csv

import multiprocessing

2020-12-22 20:01:21,249 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-12-22 20:01:21,252 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [5]:
from collections import namedtuple

class TaggedDocumentNew(namedtuple('TaggedDocument', 'words tags rating id')):
    """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.
    A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
    Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient)
    is for the tags list to include a unique integer id as the only tag.
    Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`.
    """
    def __str__(self):
        """Human readable representation of the object's state, used for debugging.
        Returns
        -------
        str
           Human readable representation of the object's state (words and tags).
        """
        return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags, self.rating, self.id)

#### DATA

In [6]:
from os import listdir
from os.path import isfile, join

path_pos = "./Data/raw/aclImdb_v1/aclImdb/train/pos/"
pos_rev_files = ["pos/"+f for f in listdir(path_pos) if isfile(join(path_pos, f))]

path_neg = "./Data/raw/aclImdb_v1/aclImdb/train/neg/"
neg_rev_files = ["neg/"+f for f in listdir(path_neg) if isfile(join(path_neg, f))]

path_unsup = "./Data/raw/aclImdb_v1/aclImdb/train/unsup/"
unsup_rev_files = ["unsup/"+f for f in listdir(path_unsup) if isfile(join(path_unsup, f))]

In [7]:
def generate_review_df(files, url_file):
    base_path = "./Data/raw/aclImdb_v1/aclImdb/train/"
    
    df = pd.DataFrame(columns=["id", "rating", "review", "title"])
    with open(url_file, "r") as file:
        urls = file.readlines()

    for file in tqdm(files):
        _id = file.split("_")[0].split("/")[-1]
        _rating = file.split("_")[-1].split(".")[0]
        with open (base_path+file, "r", encoding="utf8") as f:
            _review = f.readline()
            
        _title = urls[int(_id)].split("/")[-2]
        temp = pd.DataFrame(data=[[_id, _rating, _review, _title]], columns=["id", "rating", "review", "title"])
        df = pd.concat([df, temp], axis=0)
        
    return df

In [8]:
df_pos = generate_review_df(pos_rev_files, "./Data/raw/aclImdb_v1/aclImdb/train/urls_pos.txt")

100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [00:15<00:00, 811.18it/s]


In [9]:
df_neg = generate_review_df(neg_rev_files, "./Data/raw/aclImdb_v1/aclImdb/train/urls_neg.txt")

100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [00:14<00:00, 845.56it/s]


In [10]:
df = pd.concat([df_pos, df_neg], axis=0).reset_index().drop(["index"], axis=1)

In [11]:
df = df.sample(frac=1).reset_index(drop=True)
df.shape

(25000, 4)

In [12]:
X = df.loc[:, ["review", "id"]]
y = df.loc[:, ["rating"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [13]:
df.to_csv("./Data/transformed/data.csv", index=False, header=False, sep="\t")

In [14]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [16]:
train.to_csv("./Data/transformed/data_train.csv", index=False, header=False, sep="\t")

In [17]:
test.to_csv("./Data/transformed/data_test.csv", index=False, header=False, sep="\t")

In [43]:
import smart_open

def preprocess(line):
    contents = line.split("\t")
    text = contents[0].strip()
    tokens = word_tokenize(text)
    tokens = [t.lower() for t in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english') + ["br"])
    words = [w for w in words if w not in stop_words and len(w) > 2]
    return contents[1].strip(), contents[2].strip(), words

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in tqdm(enumerate(f)):
            _id, rating, tokens = preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield TaggedDocumentNew(tokens, [i], [rating], [_id])

train_corpus = list(read_corpus('./Data/transformed/data_train.csv'))
test_corpus = list(read_corpus('./Data/transformed/data_test.csv'))

16750it [00:30, 552.34it/s]
8250it [00:14, 562.40it/s]


In [44]:
train_corpus[3]

TaggedDocumentNew(words=['one', 'thing', 'recommend', 'film', 'intriguing', 'premise', 'certainly', 'draws', 'audience', 'mystery', 'throughout', 'film', 'hints', 'something', 'dark', 'lurking', 'however', 'much', 'tension', 'williams', 'mild', 'mannered', 'portrayal', 'much', 'makes', 'relate', 'obsession', 'boy', 'collete', 'fares', 'much', 'better', 'woman', 'whose', 'true', 'nature', 'intentions', 'clear', 'production', 'felt', 'rushed', 'holes', 'apparent', 'certainly', 'feels', 'like', 'preview', 'much', 'complete', 'better', 'effort', 'book', 'probably', 'better', 'one', 'thing', 'certain', 'taupin', 'must', 'written', 'something', 'truly', 'good', 'inspired', 'least', 'one', 'commendable', 'effort'], tags=[3], rating=['8'], id=['10016'])

In [45]:
cores = multiprocessing.cpu_count()

model = Doc2Vec(vector_size=300, min_count=2, epochs=50, workers=cores)
model.build_vocab([x for x in tqdm(train_corpus)])

100%|███████████████████████████████████████████████████████████████████████| 16750/16750 [00:00<00:00, 2417570.27it/s]
2020-12-22 20:25:09,438 : INFO : collecting all words and their counts
2020-12-22 20:25:09,438 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-12-22 20:25:09,652 : INFO : PROGRESS: at example #10000, processed 1151758 words (5419560/s), 58631 word types, 10000 tags
2020-12-22 20:25:09,799 : INFO : collected 75711 word types and 16750 unique tags from a corpus of 16750 examples and 1951575 words
2020-12-22 20:25:09,800 : INFO : Loading a fresh vocabulary
2020-12-22 20:25:09,866 : INFO : effective_min_count=2 retains 41041 unique words (54% of original 75711, drops 34670)
2020-12-22 20:25:09,867 : INFO : effective_min_count=2 leaves 1916905 word corpus (98% of original 1951575, drops 34670)
2020-12-22 20:25:09,980 : INFO : deleting the raw counts dictionary of 75711 items
2020-12-22 20:25:09,982 : INFO : sample=0.001 downsamples 24 m

In [46]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2020-12-22 20:27:08,649 : INFO : training model with 12 workers on 41041 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-12-22 20:27:09,681 : INFO : EPOCH 1 - PROGRESS: at 9.86% examples, 185027 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:10,688 : INFO : EPOCH 1 - PROGRESS: at 20.23% examples, 185919 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:11,693 : INFO : EPOCH 1 - PROGRESS: at 42.42% examples, 258442 words/s, in_qsize 24, out_qsize 0
2020-12-22 20:27:12,706 : INFO : EPOCH 1 - PROGRESS: at 66.10% examples, 300995 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:13,709 : INFO : EPOCH 1 - PROGRESS: at 87.91% examples, 323276 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:13,986 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 20:27:14,000 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 20:27:14,028 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-

2020-12-22 20:27:33,264 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:27:33,269 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 20:27:33,274 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 20:27:33,276 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 20:27:33,288 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:27:33,290 : INFO : EPOCH - 5 : training on 1951575 raw words (1854381 effective words) took 4.4s, 419821 effective words/s
2020-12-22 20:27:34,356 : INFO : EPOCH 6 - PROGRESS: at 18.72% examples, 328849 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:35,420 : INFO : EPOCH 6 - PROGRESS: at 43.44% examples, 377990 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:36,423 : INFO : EPOCH 6 - PROGRESS: at 67.64% examples, 398547 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:27:37,446 : INFO : EPOCH 6 - PROGRESS: at 88.39% exa

2020-12-22 20:27:58,978 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 20:27:58,994 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 20:27:59,017 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 20:27:59,036 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 20:27:59,052 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 20:27:59,122 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:27:59,129 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 20:27:59,135 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 20:27:59,138 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 20:27:59,154 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:27:59,156 : INFO : EPOCH - 10 : training on 1951575 raw words (1854956 effec

2020-12-22 20:28:23,133 : INFO : EPOCH 15 - PROGRESS: at 62.15% examples, 377438 words/s, in_qsize 24, out_qsize 0
2020-12-22 20:28:24,170 : INFO : EPOCH 15 - PROGRESS: at 84.92% examples, 387705 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:28:24,692 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 20:28:24,694 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 20:28:24,698 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 20:28:24,718 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 20:28:24,724 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 20:28:24,742 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 20:28:24,758 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 20:28:24,784 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:28:24,786 : INFO : worker 

2020-12-22 20:28:45,678 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:28:45,680 : INFO : EPOCH - 19 : training on 1951575 raw words (1854466 effective words) took 4.8s, 388722 effective words/s
2020-12-22 20:28:46,736 : INFO : EPOCH 20 - PROGRESS: at 18.72% examples, 332361 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:28:47,752 : INFO : EPOCH 20 - PROGRESS: at 40.35% examples, 361080 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:28:48,778 : INFO : EPOCH 20 - PROGRESS: at 63.13% examples, 375563 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:28:49,787 : INFO : EPOCH 20 - PROGRESS: at 85.45% examples, 386466 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:28:50,266 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 20:28:50,269 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 20:28:50,276 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 20:28:50,285 : INFO : wor

2020-12-22 20:29:10,559 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 20:29:10,574 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:29:10,579 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 20:29:10,580 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 20:29:10,583 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 20:29:10,594 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:29:10,595 : INFO : EPOCH - 24 : training on 1951575 raw words (1854699 effective words) took 4.9s, 379033 effective words/s
2020-12-22 20:29:11,631 : INFO : EPOCH 25 - PROGRESS: at 20.23% examples, 366274 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:29:12,645 : INFO : EPOCH 25 - PROGRESS: at 43.44% examples, 392573 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:29:13,661 : INFO : EPOCH 25 - PROGRESS: at 67.64% examples, 407153 words

2020-12-22 20:29:33,958 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 20:29:33,971 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 20:29:33,975 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 20:29:33,982 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 20:29:33,991 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 20:29:34,006 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:29:34,015 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 20:29:34,017 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 20:29:34,022 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 20:29:34,037 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:29:34,037 : INFO : EPOCH - 29 : training on 1951575 raw words (1854479 effec

2020-12-22 20:29:57,536 : INFO : EPOCH 34 - PROGRESS: at 67.64% examples, 402369 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:29:58,542 : INFO : EPOCH 34 - PROGRESS: at 90.88% examples, 411602 words/s, in_qsize 18, out_qsize 0
2020-12-22 20:29:58,766 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 20:29:58,771 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 20:29:58,793 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 20:29:58,801 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 20:29:58,811 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 20:29:58,825 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 20:29:58,842 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 20:29:58,855 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:29:58,860 : INFO : worker 

2020-12-22 20:30:18,490 : INFO : EPOCH - 38 : training on 1951575 raw words (1854759 effective words) took 4.5s, 414166 effective words/s
2020-12-22 20:30:19,529 : INFO : EPOCH 39 - PROGRESS: at 18.72% examples, 337681 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:30:20,559 : INFO : EPOCH 39 - PROGRESS: at 43.44% examples, 389241 words/s, in_qsize 24, out_qsize 0
2020-12-22 20:30:21,577 : INFO : EPOCH 39 - PROGRESS: at 63.13% examples, 376851 words/s, in_qsize 24, out_qsize 0
2020-12-22 20:30:22,615 : INFO : EPOCH 39 - PROGRESS: at 81.42% examples, 366411 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:30:23,219 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 20:30:23,243 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 20:30:23,253 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 20:30:23,256 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 20:30:23,274 : INFO : wor

2020-12-22 20:30:42,605 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 20:30:42,607 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 20:30:42,613 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 20:30:42,622 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:30:42,623 : INFO : EPOCH - 43 : training on 1951575 raw words (1854783 effective words) took 4.5s, 416664 effective words/s
2020-12-22 20:30:43,672 : INFO : EPOCH 44 - PROGRESS: at 18.72% examples, 334655 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:30:44,734 : INFO : EPOCH 44 - PROGRESS: at 43.44% examples, 381380 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:30:45,756 : INFO : EPOCH 44 - PROGRESS: at 67.16% examples, 395484 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:30:46,759 : INFO : EPOCH 44 - PROGRESS: at 90.48% examples, 406668 words/s, in_qsize 19, out_qsize 0
2020-12-22 20:30:47,003 : INFO : worke

2020-12-22 20:31:05,933 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 20:31:05,941 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 20:31:05,951 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 20:31:05,985 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 20:31:05,987 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 20:31:05,989 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 20:31:05,990 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 20:31:06,002 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 20:31:06,003 : INFO : EPOCH - 48 : training on 1951575 raw words (1854537 effective words) took 4.6s, 403559 effective words/s
2020-12-22 20:31:07,067 : INFO : EPOCH 49 - PROGRESS: at 18.72% examples, 329879 words/s, in_qsize 23, out_qsize 0
2020-12-22 20:31:08

In [9]:
ranks = []
second_ranks = []
for doc_id in tqdm(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

  0%|                                                                                        | 0/25000 [00:00<?, ?it/s]2020-12-22 13:28:24,619 : INFO : precomputing L2-norms of doc weight vectors
100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [28:45<00:00, 14.49it/s]


In [65]:
similar_ratings = {}
for doc_id in tqdm(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    for i in range(10):
        if train_corpus[sims[doc_id][0]].rating[0] in similar_ratings:
            similar_ratings[train_corpus[sims[doc_id][0]].rating[0]] = [train_corpus[sims[i][0]].rating[0]]
        else:
            similar_ratings[train_corpus[sims[doc_id][0]].rating[0]] += [train_corpus[sims[i][0]].rating[0]]

  0%|                                                                                        | 0/16750 [00:00<?, ?it/s]


KeyError: '1'

In [63]:
train_corpus[s[4][0]].rating[0]

'1'

In [10]:
counter = collections.Counter(ranks)
print(counter)

Counter({0: 24888, 1: 106, 2: 6})


In [61]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)
iv = model.infer_vector(train_corpus[doc_id].words)
s = model.docvecs.most_similar([iv], topn=len(model.docvecs))
train_corpus[s[12345][0]].rating[0]

'8'

In [49]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)
iv = model.infer_vector(train_corpus[doc_id].words)
s = model.docvecs.most_similar([iv], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print(f'Test Document {(doc_id, " ".join(train_corpus[doc_id].rating))}: «{" ".join(train_corpus[doc_id].words)}»\n')
# print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# for label, index in [('MOST', 1), ('MEDIAN', len(s)//2), ('LEAST', len(s) - 1)]:
#     print(f'{label} : {(" ".join(train_corpus[s[index][0]].rating))} : {s[index][1]} - {(" ".join(train_corpus[s[index][0]].words))}\n')
for i in len(s):
    print(train_corpus)

Test Document (6667, '8'): «magnum opus swedish king crap matshelge olsson seldom movie magnitude made sweden truly stands one amazing achievements swedish film date pays things russian nuclear scientist markov wants defect sweden plans ruined russian military kidnap tell arrived sweden trickery way seeing markov continues work nuclear physics revolutionize energy supply whole planet cia however bent getting markov west send ninja liberate practice swedish actors speaking english something matshelge perfected later works cheap bmovie feeling creates probably unmatched performance besides well action standard directtovideo style machine guns firing huge clouds smoke thousands russians dying people running around black ninja suits trying hide snow really stands though insanely poor way fighting scenes choreographed say ninja title expect martial arts expect close combat maybe two three scenes actual martial arts movie hilarious bad lack words describe ninjas moved slower hearts would sto

TypeError: 'int' object is not iterable

In [79]:
def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tqdm(sents)])
    return targets, feature_vectors

y_train, X_train = vector_for_learning(model_dbow, train_documents)
y_test, X_test = vector_for_learning(model_dbow, test_documents)

100%|██████████████████████████████████████████████████████████████████████████████| 2999/2999 [00:46<00:00, 63.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2001/2001 [00:29<00:00, 67.45it/s]


In [80]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
print('Testing accuracy for movie plots%s' % accuracy_score(y_test, y_pred))
print('Testing F1 score for movie plots: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy for movie plots0.3103448275862069
Testing F1 score for movie plots: 0.3135941911201832
