In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import collections
import random
from tqdm import tqdm

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [13]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts, get_tmpfile
from gensim.utils import simple_preprocess
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import utils
from sklearn.model_selection import train_test_split

import csv

import multiprocessing

In [5]:
from collections import namedtuple

class TaggedDocumentNew(namedtuple('TaggedDocument', 'words tags rating id')):
    """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.
    A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
    Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient)
    is for the tags list to include a unique integer id as the only tag.
    Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`.
    """
    def __str__(self):
        """Human readable representation of the object's state, used for debugging.
        Returns
        -------
        str
           Human readable representation of the object's state (words and tags).
        """
        return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags, self.rating, self.id)

#### DATA

In [6]:
from os import listdir
from os.path import isfile, join

path_pos = "./Data/raw/aclImdb_v1/aclImdb/train/pos/"
pos_rev_files = ["pos/"+f for f in listdir(path_pos) if isfile(join(path_pos, f))]

path_neg = "./Data/raw/aclImdb_v1/aclImdb/train/neg/"
neg_rev_files = ["neg/"+f for f in listdir(path_neg) if isfile(join(path_neg, f))]

path_unsup = "./Data/raw/aclImdb_v1/aclImdb/train/unsup/"
unsup_rev_files = ["unsup/"+f for f in listdir(path_unsup) if isfile(join(path_unsup, f))]

In [7]:
def generate_review_df(files, url_file):
    base_path = "./Data/raw/aclImdb_v1/aclImdb/train/"
    
    df = pd.DataFrame(columns=["id", "rating", "review", "title"])
    with open(url_file, "r") as file:
        urls = file.readlines()

    for file in tqdm(files):
        _id = file.split("_")[0].split("/")[-1]
        _rating = file.split("_")[-1].split(".")[0]
        with open (base_path+file, "r", encoding="utf8") as f:
            _review = f.readline()
            
        _title = urls[int(_id)].split("/")[-2]
        temp = pd.DataFrame(data=[[_id, _rating, _review, _title]], columns=["id", "rating", "review", "title"])
        df = pd.concat([df, temp], axis=0)
        
    return df

In [8]:
df_pos = generate_review_df(pos_rev_files, "./Data/raw/aclImdb_v1/aclImdb/train/urls_pos.txt")

100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [00:15<00:00, 794.78it/s]


In [9]:
df_neg = generate_review_df(neg_rev_files, "./Data/raw/aclImdb_v1/aclImdb/train/urls_neg.txt")

100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [00:16<00:00, 775.66it/s]


In [10]:
# df_unsup = generate_review_df(unsup_rev_files, "./Data/raw/aclImdb_v1/aclImdb/train/urls_unsup.txt")

In [11]:
df = pd.concat([df_pos, df_neg], axis=0).reset_index().drop(["index"], axis=1)

In [25]:
df

Unnamed: 0,id,rating,review,title
0,0,9,Bromwell High is a cartoon comedy. It ran at t...,tt0453418
1,10000,8,Homelessness (or Houselessness as George Carli...,tt0102303
2,10001,10,Brilliant over-acting by Lesley Ann Warren. Be...,tt0102303
3,10002,7,This is easily the most underrated film inn th...,tt0102303
4,10003,8,This is not the typical Mel Brooks film. It wa...,tt0102303
...,...,...,...,...
24995,9998,4,"Towards the end of the movie, I felt it was to...",tt0075648
24996,9999,3,This is the kind of movie that my enemies cont...,tt0075648
24997,999,3,I saw 'Descent' last night at the Stockholm Fi...,tt0463027
24998,99,1,Some films that you pick up for a pound turn o...,tt0272950


In [30]:
X = df.loc[:, ["review", "id"]]
y = df.loc[:, ["rating"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [31]:
X_train

Unnamed: 0,review,id
19372,I wasted my time and gave this show a chance. ...,4936
19864,"Released on DVD in the UK as Axe, The Choke is...",5379
19075,It's interesting that a novel with no plot has...,4669
3598,Oh it's so cool to watch a Silent Classic once...,198
10665,Dark comedy? Gallows humor? How does one make ...,834
...,...,...
13620,I bought this at tower records after seeing th...,11008
3624,If you have never viewed this film and like ol...,2011
13036,I saw this movie on a westbound American Airli...,10483
22671,Munchies starts in deepest darkest Peru (looks...,7905


In [32]:
y_train

Unnamed: 0,rating
19372,1
19864,2
19075,4
3598,8
10665,8
...,...
13620,1
3624,7
13036,4
22671,1


In [20]:
df.to_csv("./Data/transformed/data.csv", index=False, header=False, sep="\t")

In [21]:
df.loc[:20000].to_csv("./Data/transformed/data_train.csv", index=False, header=False, sep="\t")

In [23]:
df.loc[20000:].to_csv("./Data/transformed/data_test.csv", index=False, header=False, sep="\t")

In [26]:
# import smart_open

# def preprocess(line):
#     contents = line.split("\t")
#     text = contents[2].strip()
#     tokens = word_tokenize(text)
#     tokens = [t.lower() for t in tokens]
#     table = str.maketrans('', '', string.punctuation)
#     stripped = [w.translate(table) for w in tokens]
#     words = [word for word in stripped if word.isalpha()]
#     stop_words = set(stopwords.words('english'))
#     stop_words.add("br")
#     words = [w for w in words if not w in stop_words]
    
#     return contents[0], contents[1], words

# def read_corpus(fname, tokens_only=False):
#     with smart_open.open(fname, encoding="iso-8859-1") as f:
#         for i, line in tqdm(enumerate(f)):
#             _id, rating, tokens = preprocess(line)
#             if tokens_only:
#                 yield tokens
#             else:
#                 # For training data, add tags
#                 yield TaggedDocumentNew(tokens, [i], [rating], [_id])
            

# train_corpus = list(read_corpus('./Data/transformed/data_train.csv'))
# test_corpus = list(read_corpus('./Data/transformed/data_test.csv'))

In [48]:
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [t.lower() for t in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    stop_words.add("br")
    words = [w for w in words if not w in stop_words]
    
    return words

def process_corpus(X, y):
    for i in tqdm(X.index):
        line = X.loc[i, ["review"]].values[0]
        _id = X.loc[i, ["id"]].values[0]
        tokens = preprocess(line)
        rating = y.loc[i, "rating"]
        yield TaggedDocumentNew(tokens, [i], [rating], [_id])


train_corpus = list(process_corpus(X_train, y_train))
test_corpus = list(process_corpus(X_test, y_test))

100%|███████████████████████████████████████████████████████████████████████████| 16750/16750 [00:44<00:00, 379.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 8250/8250 [00:22<00:00, 372.51it/s]


In [49]:
train_corpus[-1]

TaggedDocumentNew(words=['reading', 'web', 'sites', 'bette', 'davis', 'one', 'find', 'instances', 'authors', 'claim', 'nothing', 'special', 'acting', 'even', 'found', 'site', 'claimed', 'bette', 'davis', 'success', 'probably', 'due', 'luck', 'ms', 'davis', 'films', 'tell', 'quite', 'opposite', 'evident', 'example', 'two', 'films', 'weeks', 'apart', 'fog', 'frisco', 'human', 'bondage', 'characters', 'played', 'movies', 'though', 'negative', 'quite', 'different', 'arlene', 'former', 'beautiful', 'glamorous', 'frivolous', 'heiress', 'much', 'likable', 'character', 'mildred', 'latter', 'pale', 'uneducated', 'impudent', 'cockney', 'waitress', 'needless', 'say', 'ms', 'davis', 'played', 'characters', 'authentic', 'enthusiasm', 'even', 'point', 'former', 'role', 'would', 'wished', 'actresses', 'day', 'one', 'forced', 'play', 'latter', 'role', 'seemed', 'actresses', 'undesirable', 'career', 'destroying', 'role', 'one', 'fought', 'ferociously', 'months', 'latter', 'role', 'launched', 'among', '

In [50]:
cores = multiprocessing.cpu_count()

model = Doc2Vec(vector_size=300, min_count=2, epochs=50, workers=cores)
model.build_vocab([x for x in tqdm(train_corpus)])

100%|███████████████████████████████████████████████████████████████████████| 16750/16750 [00:00<00:00, 2385878.96it/s]
2020-12-22 17:12:01,408 : INFO : collecting all words and their counts
2020-12-22 17:12:01,409 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-12-22 17:12:01,659 : INFO : PROGRESS: at example #10000, processed 1189128 words (4790841/s), 59149 word types, 24999 tags
2020-12-22 17:12:01,825 : INFO : collected 75738 word types and 25000 unique tags from a corpus of 16750 examples and 1995011 words
2020-12-22 17:12:01,826 : INFO : Loading a fresh vocabulary
2020-12-22 17:12:01,897 : INFO : effective_min_count=2 retains 41424 unique words (54% of original 75738, drops 34314)
2020-12-22 17:12:01,899 : INFO : effective_min_count=2 leaves 1960697 word corpus (98% of original 1995011, drops 34314)
2020-12-22 17:12:02,005 : INFO : deleting the raw counts dictionary of 75738 items
2020-12-22 17:12:02,007 : INFO : sample=0.001 downsamples 24 m

In [51]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) 

2020-12-22 17:12:14,709 : INFO : training model with 12 workers on 41424 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-12-22 17:12:15,723 : INFO : EPOCH 1 - PROGRESS: at 19.92% examples, 375040 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:16,742 : INFO : EPOCH 1 - PROGRESS: at 42.43% examples, 394801 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:17,752 : INFO : EPOCH 1 - PROGRESS: at 64.23% examples, 399336 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:18,790 : INFO : EPOCH 1 - PROGRESS: at 85.55% examples, 396285 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:19,137 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:12:19,171 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:12:19,216 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:12:19,223 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 17:12:19,244 : I

2020-12-22 17:12:40,039 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 17:12:40,055 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 17:12:40,058 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 17:12:40,064 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 17:12:40,072 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 17:12:40,074 : INFO : EPOCH - 5 : training on 1995011 raw words (1886960 effective words) took 5.4s, 351223 effective words/s
2020-12-22 17:12:41,091 : INFO : EPOCH 6 - PROGRESS: at 13.26% examples, 251761 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:42,229 : INFO : EPOCH 6 - PROGRESS: at 36.53% examples, 319426 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:43,263 : INFO : EPOCH 6 - PROGRESS: at 59.25% examples, 351268 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:12:44,263 : INFO : EPOCH 6 - PROGRESS: at 81.13% exa

2020-12-22 17:13:07,089 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:13:07,104 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:13:07,113 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 17:13:07,129 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 17:13:07,149 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 17:13:07,153 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 17:13:07,162 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 17:13:07,168 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 17:13:07,178 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 17:13:07,185 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 17:13:07,187 : INFO : worker thread finished; awaiting finish of 0 more thread

2020-12-22 17:13:29,693 : INFO : EPOCH 15 - PROGRESS: at 66.53% examples, 415185 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:13:30,714 : INFO : EPOCH 15 - PROGRESS: at 89.44% examples, 416809 words/s, in_qsize 22, out_qsize 0
2020-12-22 17:13:30,919 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:13:30,985 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:13:30,997 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:13:31,009 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 17:13:31,016 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 17:13:31,019 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 17:13:31,041 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 17:13:31,045 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 17:13:31,055 : INFO : worker 

2020-12-22 17:13:50,003 : INFO : EPOCH 20 - PROGRESS: at 18.94% examples, 349145 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:13:51,008 : INFO : EPOCH 20 - PROGRESS: at 40.51% examples, 374786 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:13:52,028 : INFO : EPOCH 20 - PROGRESS: at 61.28% examples, 378669 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:13:53,044 : INFO : EPOCH 20 - PROGRESS: at 83.73% examples, 387719 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:13:53,533 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:13:53,599 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:13:53,607 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:13:53,612 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 17:13:53,615 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 17:13:53,639 : INFO : worker thread finished; awaiting finish of 6 more 

2020-12-22 17:14:12,117 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 17:14:12,118 : INFO : EPOCH - 24 : training on 1995011 raw words (1886786 effective words) took 4.9s, 387091 effective words/s
2020-12-22 17:14:13,147 : INFO : EPOCH 25 - PROGRESS: at 18.45% examples, 340219 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:14:14,174 : INFO : EPOCH 25 - PROGRESS: at 34.64% examples, 316094 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:14:15,182 : INFO : EPOCH 25 - PROGRESS: at 56.30% examples, 346893 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:14:16,184 : INFO : EPOCH 25 - PROGRESS: at 80.11% examples, 372119 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:14:16,786 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:14:16,849 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:14:16,871 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:14:16,881 : INFO : wor

2020-12-22 17:14:35,334 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 17:14:35,340 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 17:14:35,341 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 17:14:35,342 : INFO : EPOCH - 29 : training on 1995011 raw words (1886419 effective words) took 4.6s, 406291 effective words/s
2020-12-22 17:14:36,445 : INFO : EPOCH 30 - PROGRESS: at 18.39% examples, 317565 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:14:37,445 : INFO : EPOCH 30 - PROGRESS: at 42.00% examples, 376271 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:14:38,452 : INFO : EPOCH 30 - PROGRESS: at 62.62% examples, 381229 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:14:39,456 : INFO : EPOCH 30 - PROGRESS: at 85.10% examples, 390612 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:14:39,887 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:14:39,950 : INFO : work

2020-12-22 17:14:59,313 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 17:14:59,329 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 17:14:59,335 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 17:14:59,343 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 17:14:59,345 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 17:14:59,349 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 17:14:59,351 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 17:14:59,351 : INFO : EPOCH - 34 : training on 1995011 raw words (1886638 effective words) took 4.7s, 399968 effective words/s
2020-12-22 17:15:00,377 : INFO : EPOCH 35 - PROGRESS: at 18.88% examples, 350611 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:15:01,400 : INFO : EPOCH 35 - PROGRESS: at 42.89% examples, 395469 words/s, in_qsize 23, out_qsi

2020-12-22 17:15:23,177 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:15:23,179 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:15:23,196 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 17:15:23,203 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 17:15:23,209 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 17:15:23,218 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 17:15:23,235 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 17:15:23,237 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 17:15:23,242 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-12-22 17:15:23,247 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-12-22 17:15:23,248 : INFO : worker thread finished; awaiting finish of 0 more thread

2020-12-22 17:15:45,678 : INFO : EPOCH 44 - PROGRESS: at 89.44% examples, 416168 words/s, in_qsize 22, out_qsize 0
2020-12-22 17:15:45,888 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:15:45,947 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:15:45,971 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:15:45,975 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-12-22 17:15:45,977 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-12-22 17:15:45,989 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-12-22 17:15:46,006 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-12-22 17:15:46,011 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-12-22 17:15:46,025 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-12-22 17:15:46,026 : INFO : worker thread finished; awaitin

2020-12-22 17:16:07,872 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-12-22 17:16:07,873 : INFO : EPOCH - 48 : training on 1995011 raw words (1886657 effective words) took 5.3s, 357848 effective words/s
2020-12-22 17:16:08,888 : INFO : EPOCH 49 - PROGRESS: at 19.84% examples, 372703 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:16:09,916 : INFO : EPOCH 49 - PROGRESS: at 36.53% examples, 336314 words/s, in_qsize 24, out_qsize 0
2020-12-22 17:16:10,929 : INFO : EPOCH 49 - PROGRESS: at 56.30% examples, 347581 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:16:11,948 : INFO : EPOCH 49 - PROGRESS: at 79.57% examples, 368849 words/s, in_qsize 23, out_qsize 0
2020-12-22 17:16:12,544 : INFO : worker thread finished; awaiting finish of 11 more threads
2020-12-22 17:16:12,604 : INFO : worker thread finished; awaiting finish of 10 more threads
2020-12-22 17:16:12,625 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-12-22 17:16:12,647 : INFO : wor

In [9]:
ranks = []
second_ranks = []
for doc_id in tqdm(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

  0%|                                                                                        | 0/25000 [00:00<?, ?it/s]2020-12-22 13:28:24,619 : INFO : precomputing L2-norms of doc weight vectors
100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [28:45<00:00, 14.49it/s]


In [10]:
counter = collections.Counter(ranks)
print(counter)

Counter({0: 24888, 1: 106, 2: 6})


In [58]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)
iv = model.infer_vector(train_corpus[doc_id].words)
s = model.docvecs.most_similar([iv], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print(f'Test Document {(doc_id, " ".join(train_corpus[doc_id].rating))}: «{" ".join(train_corpus[doc_id].words)}»\n')
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 1), ('MEDIAN', len(s)//2), ('LEAST', len(s) - 1)]:
    print(f'{label} : {(" ".join(train_corpus[s[index][0]].rating))} : {s[index][1]} - {(" ".join(train_corpus[s[index][0]].words))}\n')

Test Document (2343, '10'): «evidently many people seen movie one posting comments movie missed george peabody award well humanitas award paul winfield award awesome performance movie eugene logan cowriter made tv movie also part another movie humanity loss technical adviser truman capote movie glass house movie available dvd anyone interested post another letter telling eugene logan came technical adviser movie amazing person truman capote thanks reading hope find way view two movies»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w5,mc2,s0.001,t12):

MOST : 9 : 0.4363114833831787 - bought year ago yep left shelf ages watching dvds holed cold haunting movie brilliant performances involved especially boy smiles get movie plot reminds perhaps favourite movie ever grosse pointe blank obviously lighthearted heavy hearted psychologist clinical forensic psychologist shiver went spine identity new contract revealed scary stuff brilliant work round pete

MEDIAN : 2 : 0.11312077939510

IndexError: list index out of range

In [79]:
def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tqdm(sents)])
    return targets, feature_vectors

y_train, X_train = vector_for_learning(model_dbow, train_documents)
y_test, X_test = vector_for_learning(model_dbow, test_documents)

100%|██████████████████████████████████████████████████████████████████████████████| 2999/2999 [00:46<00:00, 63.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2001/2001 [00:29<00:00, 67.45it/s]


In [80]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
print('Testing accuracy for movie plots%s' % accuracy_score(y_test, y_pred))
print('Testing F1 score for movie plots: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy for movie plots0.3103448275862069
Testing F1 score for movie plots: 0.3135941911201832
