# Using Custom Doc2Vec for Movie Plot Similarity

## Building the Dataset

Import the dataset and clean the text from any punctuation, numbers, and make lowercase.
As this happens, build the test and train split

In [40]:
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import string
import csv
from sklearn.model_selection import train_test_split

In [36]:
# Get dataset

# Taken from ntlk.corpus.stopwords("english") without installing the package
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

def clean_text(text):
    """
    Clean a document's text by removing punctuation, numbers, and making lower
    case
    
    :param text: The contents of one document
    """
    words = text.split()
    cleaned = []
    for word in words:
        word = word.lower()
        temp_word = []
        for letter in word:
            if letter not in string.punctuation and letter not in string.digits:
                temp_word.append(letter)
        temp_word = "".join(temp_word)
        if len(temp_word) > 0 and not temp_word in stop_words:
            cleaned.append(temp_word)
        
    return cleaned

In [105]:
class Doc:
    """
    Class represents a singular document and contains the relevant information
    """

    def __init__(self, text: list, name:str, label: str):
        self.text = text
        self.label = label
        self.name = name
        self.index = 0

    def __str__(self):
        text = " ".join(self.text)
        return "[" + text + "]," + "[" + label + "]"

In [106]:
# possible_tags = ["sci-fi", "animation", "action", "comedy", "fantasy", "romance"]

# Keep track of the original format of docs for visual output later?
corpus = []
docs = []
data = {"title": [], "plot": [], "genre": []}

# Restrict the number of docs, since my computer crashes with too many
allowed = 1000
with open("movie_plots.csv") as file:
    reader = csv.reader(file, delimiter = ",", quotechar = '"')
    for row in reader:
        if allowed <= 0:
            break
        cleaned = clean_text(row[2])
        data["title"].append(row[1])
        data["plot"].append(cleaned)
        data["genre"].append(row[3])
        docs.append(Doc(text = cleaned, name = row[1], label = row[3]))
        allowed -= 1
        
df = pd.DataFrame(data)
df

Unnamed: 0,title,plot,genre
0,Toy Story (1995),"[little, boy, named, andy, loves, room, playin...",animation
1,Jumanji (1995),"[two, kids, find, play, magical, board, game, ...",fantasy
2,Grumpier Old Men (1995),"[things, dont, seem, change, much, wabasha, co...",comedy
3,Heat (1995),"[hunters, preyneil, professional, criminal, cr...",action
4,Sabrina (1995),"[ugly, duckling, undergone, remarkable, change...",romance
...,...,...,...
995,Harold and Kumar Go to White Castle (2004),"[asianamerican, office, worker, indian, stoner...",comedy
996,"Princess Diaries 2: Royal Engagement, The (2004)","[princess, mia, turned, supposed, succeed, gra...",romance
997,AVP: Alien vs. Predator (2004),"[archaeological, expedition, bouvetya, island,...",action
998,Yu-Gi-Oh! (2004),"[underneath, sands, egypt, anubis, ancient, ev...",animation


In [107]:
# Get some of the features from the dataset
unique_words = []
for doc in docs:
    for word in doc.text:
        if word not in unique_words:
            unique_words.append(word)
            
vocab_size = len(unique_words)
num_docs = len(docs)

print("Num words:", vocab_size)
print("Num docs:", num_docs)

Num words: 10714
Num docs: 1000


In [108]:
# Make vector encodings for all words and docs
words_copy = unique_words.copy()
docs_copy = docs.copy()
word_enc_mapping = {}
doc_enc_mapping = {}

for i in range(num_docs):
    doc_enc_mapping[docs[i].name] = i
    docs_copy[i] = doc_enc_mapping[docs[i].name]
    
for i in range(vocab_size):
    word_enc_mapping[unique_words[i]] = i
    words_copy[i] = word_enc_mapping[unique_words[i]]

doc_vecs = tf.keras.utils.to_categorical(docs_copy)
word_vecs = tf.keras.utils.to_categorical(words_copy)

## Building the Model

In [97]:
# Params for model building
batch_size = 64
embed_size = 128
negative_samples = 8
learning_rate = 0.5

In [98]:
# Inputs
text_input = keras.Input(shape = [batch_size, 1], dtype = tf.int32)
# label_input = keras.Input(shape = [batch_size, 1], dtype = tf.int32)
doc_vector = keras.Input(shape = [batch_size, 1], dtype = tf.int32)

# Embeddings
init_embed = keras.initializers.RandomUniform(minval=-1.0, maxval=1.0)

embed_for_word = keras.layers.Embedding(vocab_size, embed_size, embeddings_initializer = init_embed, input_length = batch_size)
word_embed = embed_for_word(text_input)

embed_for_docs = keras.layers.Embedding(num_docs, embed_size, embeddings_initializer = init_embed, input_length = batch_size)
doc_embed = embed_for_docs(doc_vector)

# Combine embeddings and flatten
combined = keras.layers.Concatenate(axis = 1)([word_embed, doc_embed])
flattened = keras.layers.Flatten()(combined)

# Softmax activation
softmax = keras.layers.Dense(vocab_size, activation = "softmax")
activated = softmax(flattened)

output = keras.layers.Dense(1, activation = "sigmoid")(activated)

model = keras.Model(inputs = [text_input, doc_vector], outputs = output)

# SGD Optimizer and CCE Loss
sgd = keras.optimizers.SGD(learning_rate)

model.compile(loss = "categorical_crossentropy", optimizer = sgd, metrics = ["accuracy"])

## Data Generation
Create functions that generate batch data since dimensions don't match

## Model Training
Time to see how it works

In [142]:
epochs = 1000
doc_index = 0
for epoch in range(epochs):
    # Build a batch for the next epoch
    # x is combined doc_id and word_ids
    batch = []
    batch_frame = pd.DataFrame()
    for i in range(batch_size):
        id_batch = []
        doc_batch = []
        label_batch = []
        for j in range(window_size):
            docs[doc_index].index = (docs[doc_index].index + 1) % len(docs[doc_index].text)
            doc_batch.append(word_encs[0][word_enc_mapping[docs[doc_index].text[docs[doc_index].index]]])
            id_batch.append(doc_encs[0][doc_enc_mapping[docs[doc_index].name]])
            label_batch.append(docs[doc_index].label)
        doc_index = (doc_index + 1) % num_docs
        
        id_batch = np.array(id_batch)
        id_batch_tf = tf.convert_to_tensor(id_batch)
        print(id_batch_tf)
        doc_batch = np.array(doc_batch)
        label_batch = np.array(label_batch)
        
        temp_frame = pd.DataFrame([id_batch, doc_batch, label_batch])
        batch_frame = pd.concat([batch_frame, temp_frame], axis = 1)
    batch_frame = batch_frame.transpose()
    x, y = np.split(batch_frame, [2], axis = 1)
    model.fit(x, y)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.float32).