# doc2vec for sentiment classification

In [None]:
from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText, BoWFeatureType
from Extensions import SVMDoc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import os

In [None]:
# Automatically reload changes made in other modules without having to restart the kernel
%load_ext autoreload
%autoreload 2

## Step 1: Get training data for doc2vec

The training data is a collection of documents: each document is a list of word tokens wrapped in a gensim `TaggedDocument` together with a unique integer tag.

In [None]:
def clean_text(text, *, lowercase=False, pad_apostrophes=False):
    """Prepare raw review text for whitespace tokenisation.

    HTML line breaks (``<br />``) are replaced by a single space, and each of
    the punctuation marks ``. " , ( ) ! ? ; :`` is padded with a space on both
    sides so that a later ``str.split()`` yields them as separate tokens.
    Punctuation is kept, not removed, and case is preserved unless
    ``lowercase`` is set.

    Args:
        text: Raw document text.
        lowercase: If true, lower-case the text before any other processing.
        pad_apostrophes: If true, insert a space before every apostrophe so
            that clitics such as ``'t`` split off as their own token.

    Returns:
        The cleaned text as one string (not yet split into tokens).
    """
    cleaned = text.lower() if lowercase else text
    cleaned = cleaned.replace('<br />', ' ')
    # One pass over the string instead of nine chained replace() calls; the
    # result is identical because each mark only gains surrounding spaces.
    padding = str.maketrans({mark: f' {mark} ' for mark in '."(),!?;:'})
    cleaned = cleaned.translate(padding)
    if pad_apostrophes:
        cleaned = cleaned.replace("'", " '")
    return cleaned

In [None]:
## Use this for testing the clean_text function
# dataset = "train"
# label = "neg"
# f = os.listdir(f"data/aclImdb/{dataset}/{label}")[0]
# text = open(f"data/aclImdb/{dataset}/{label}/{f}").read()
# clean_text(text)#.split()

In [None]:
def get_documents_for_doc2vec(data_dir="data/aclImdb"):
    """Load the review files under ``data_dir`` as doc2vec training documents.

    Every file in ``train/{neg,pos,unsup}`` and ``test/{neg,pos}`` is read,
    passed through ``clean_text`` and split on whitespace. Each document is
    tagged with its running integer index, starting at 0.

    Args:
        data_dir: Root directory of the corpus, containing the ``train`` and
            ``test`` sub-directories.

    Returns:
        A list of ``TaggedDocument`` objects, one per file.

    Raises:
        FileNotFoundError: If one of the expected sub-directories is missing.
    """
    documents = []
    for dataset in ["train", "test"]:
        for label in ["neg", "pos", "unsup"]:
            # There is no unlabelled portion to read for the test split.
            if dataset == "test" and label == "unsup":
                continue
            folder = os.path.join(data_dir, dataset, label)
            # os.listdir() returns entries in arbitrary order; sorting keeps
            # the document tags reproducible between runs and machines.
            for filename in sorted(os.listdir(folder)):
                # Explicit encoding avoids locale-dependent decoding, and the
                # context manager closes each of the many file handles.
                with open(os.path.join(folder, filename), encoding="utf-8") as handle:
                    text = clean_text(handle.read())
                documents.append(TaggedDocument(text.split(), [len(documents)]))
    return documents

In [None]:
# Build the doc2vec training set and report its size.
documents = get_documents_for_doc2vec()
print(f"We have {len(documents)} documents for training doc2vec models")

# Sanity check: locate the document with the fewest tokens and display it.
shortest_doc = np.argmin([len(words) for words, _tags in documents])
print(f"The shortest document looks like:\n{documents[shortest_doc]}")

## Step 2: Train various doc2vec models

In [None]:
# common_kwargs = dict(vector_size=100, epochs=20, min_count=2, sample=0, workers=8, negative=5, hs=0)
# Train a doc2vec model on the tagged documents.
# dm=1 selects PV-DM (distributed memory); dm=0 selects PV-DBOW (distributed bag of words).
# NOTE: dm_concat does not combine the two algorithms. Per the gensim documentation,
# dm_concat=1 makes PV-DM concatenate the context vectors instead of summing/averaging them.
model = Doc2Vec(documents, vector_size=100, window=10, dm=1, workers=8)

## Step 3: Compare SVM using doc2vec-features with Naive Bayes 

## Step 4: Analyse the doc2vec approach

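One possible analysis: compare the embeddings inferred for "This film sucks." and "The movie is terrible!" with those for "Great movie!" and "This film is brilliant." Sentences with the same sentiment should lie closer together than sentences with opposite sentiment.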