# Use Gensim to train word embeddings

# Import necessary libraries

In [8]:
import swifter

# Import the dataset

In [1]:
import pandas as pd
import numpy as np
# Read the movie-plots CSV, then preview its first rows and its dimensions.
df = pd.read_csv(
    './datasets/wiki_movie_plots_deduped.csv',
    encoding='utf-8',
)
display(df.head())
n_rows, n_cols = df.shape
print((n_rows, n_cols))

Unnamed: 0,Release Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki Page,Plot,lemmatized_text
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","bartender work saloon , serve drink customer ...."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","moon , paint smile face hang park night . youn..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","film , minute long , compose shot . , girl sit..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"Lasting 61 second consist shot , shoot set woo..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"early know adaptation classic fairytale , film..."


(25533, 9)


# Use a sentence detector to create a list of sentences

In [33]:
from spacy.lang.en import English

# Rule-based pipeline used only for sentence splitting. It is bound to its own
# name because a later cell rebinds `nlp` to a model loaded without a parser or
# sentencizer; if `sentence_tokens` looked up `nlp` at call time it would stop
# working as soon as that cell had run.
sentence_nlp = English()
sentence_nlp.add_pipe(sentence_nlp.create_pipe("sentencizer"))
# Keep the original module-level name available for cells that still use it.
nlp = sentence_nlp


def sentence_tokens(paragraph: str) -> list:
    """Split a paragraph into sentences.

    Args:
        paragraph: Raw text to segment.

    Returns:
        A list with one spaCy span per detected sentence, in document order.
    """
    return list(sentence_nlp(paragraph).sents)

In [34]:
# Sanity check: abbreviations such as "Mr." and "I.S.B" should not end a sentence.
sample_paragraph = 'Mr. India is a very good film. I.S.B is a good university.'
sentence_tokens(sample_paragraph)

[Mr. India is a very good film., I.S.B is a good university.]

In [9]:
# Split every plot into sentences (through swifter's apply) and preview the new column.
df['list_of_sentences'] = df['Plot'].swifter.apply(sentence_tokens)
df.head(5)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=25533.0, style=ProgressStyle(descripti…




Unnamed: 0,Release Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki Page,Plot,lemmatized_text,list_of_sentences
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","bartender work saloon , serve drink customer ....","[(A, bartender, is, working, at, a, saloon, ,,..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","moon , paint smile face hang park night . youn...","[(The, moon, ,, painted, with, a, smiling, fac..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","film , minute long , compose shot . , girl sit...","[(The, film, ,, just, over, a, minute, long, ,..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"Lasting 61 second consist shot , shoot set woo...","[(Lasting, just, 61, seconds, and, consisting,..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"early know adaptation classic fairytale , film...","[(The, earliest, known, adaptation, of, the, c..."


In [11]:
# Sentences detected for the first movie plot.
df.loc[0, 'list_of_sentences']

[A bartender is working at a saloon, serving drinks to customers.,
 After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside.,
 They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head.,
 The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register.,
 The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]]

# Lemmatize each sentence within the list of sentences

In [20]:
import spacy
# Medium English model, loaded without the tagger, parser and NER components.
nlp = spacy.load('en_core_web_md', disable=["tagger", "parser", "ner"])


def standardize_texts(text:str):
    """Return the lemmas of every non-stop-word token in ``text``.

    ``text`` is converted with ``str`` before processing, so objects with a
    textual representation (such as the sentence spans stored in the frame)
    are accepted too. Punctuation tokens are kept in the result.
    """
    return [token.lemma_ for token in nlp(str(text)) if not token.is_stop]

In [21]:
# Lemmatize the first sentence of the first plot as a quick check.
standardize_texts(df.loc[0, 'list_of_sentences'][0])

['bartender', 'work', 'saloon', ',', 'serve', 'drink', 'customer', '.']

In [14]:
def create_list_of_sentences_lemmatized(list_of_sentences: list) -> list:
    """Lemmatize every sentence of one paragraph.

    Args:
        list_of_sentences: The sentences of a single paragraph.

    Returns:
        One list of lemmas per input sentence, in the same order, as produced
        by ``standardize_texts``.
    """
    return [standardize_texts(sentence) for sentence in list_of_sentences]

In [23]:
# Lemmatize all sentences of the first plot as a quick check.
first_plot_sentences = df.loc[0, 'list_of_sentences']
print(create_list_of_sentences_lemmatized(first_plot_sentences))

[['bartender', 'work', 'saloon', ',', 'serve', 'drink', 'customer', '.'], ['fill', 'stereotypically', 'Irish', 'man', 'bucket', 'beer', ',', 'Carrie', 'Nation', 'follower', 'burst', 'inside', '.'], ['assault', 'Irish', 'man', ',', 'pull', 'hat', 'eye', 'dump', 'beer', 'head', '.'], ['group', 'begin', 'wreck', 'bar', ',', 'smash', 'fixture', ',', 'mirror', ',', 'break', 'cash', 'register', '.'], ['bartender', 'spray', 'seltzer', 'water', 'Nation', 'face', 'group', 'policeman', 'appear', 'order', 'everybody', 'leave.[1', ']']]


In [None]:
# Lemmatize the sentences of every plot (through swifter's apply) and show the new column.
lemmatized_column = df['list_of_sentences'].swifter.apply(create_list_of_sentences_lemmatized)
df['list_of_sentences_lemmatized'] = lemmatized_column
display(df['list_of_sentences_lemmatized'])

# Export the dataframe to keep a backup

In [35]:
import os

# Pickle the frame without the 'list_of_sentences' column.
# The output directory is created first so the write does not fail with
# FileNotFoundError when './binary_files' is missing (e.g. on a fresh checkout).
BACKUP_PATH = './binary_files/wiki_movie_plots_deduped.pkl'
os.makedirs(os.path.dirname(BACKUP_PATH), exist_ok=True)
df.drop(columns=['list_of_sentences']).to_pickle(BACKUP_PATH)

# Train Gensim

## Create corpus

In [53]:
import numpy as np

# Flatten the per-plot lists into a single list of tokenized sentences.
sentences = [
    sentence
    for paragraph in df.list_of_sentences_lemmatized
    for sentence in paragraph
]

# The sentences have different lengths, so np.array(sentences) would have to
# build a ragged array: deprecated since NumPy 1.19 and a ValueError from 1.24.
# (If every sentence happened to have the same length it would instead produce
# a 2-D string array.) Filling a 1-D object array element by element always
# yields one token list per entry.
corpus = np.empty(len(sentences), dtype=object)
for position, sentence in enumerate(sentences):
    corpus[position] = sentence
print(corpus.shape)

(485374,)


## Train model

In [54]:
import gensim
from gensim.test.utils import common_texts
# Word2Vec hyperparameters, collected in one place so they are easy to tune.
word2vec_params = {
    'size': 100,      # dimensionality of the word vectors
    'window': 5,      # context window size
    'min_count': 1,   # keep every token, including those seen only once
    'workers': 4,     # number of training threads
}
model = gensim.models.Word2Vec(sentences=corpus, **word2vec_params)

## Export the gensim word vectors

In [59]:
import os

# Save the trained word vectors in word2vec format.
# The output directory is created first so the write does not fail with
# FileNotFoundError when './binary_files' is missing (e.g. on a fresh checkout).
WORD_VECTORS_PATH = './binary_files/wiki_movie_plots_deduped_wv.txt'
os.makedirs(os.path.dirname(WORD_VECTORS_PATH), exist_ok=True)
model.wv.save_word2vec_format(WORD_VECTORS_PATH)

In [63]:
# Inspect the ten nearest neighbours of a query word as a qualitative check.
query_word = 'woman'
model.wv.most_similar(positive=[query_word], topn=10)

[('lady', 0.7534633278846741),
 ('girl', 0.7341021299362183),
 ('couple', 0.71397465467453),
 ('prostitute', 0.6636070013046265),
 ('Cormie', 0.6256614327430725),
 ('pourer', 0.6244224309921265),
 ('photocamera', 0.619834303855896),
 ('Soumya(Indrasish', 0.6054298281669617),
 ('lamentable', 0.6028733253479004),
 ('Meshal', 0.6019753813743591)]