In [1]:
import pandas as pd
import numpy as mp
import re
from tqdm import tqdm
from time import time

import spacy
from gensim.models import FastText as ft
from gensim.models.phrases import Phrases, Phraser
import multiprocessing

import nltk
from nltk.tokenize import TweetTokenizer



In [2]:
## Load the `en_core_web_lg` spaCy model package.
## The resulting `nlp` pipeline is the global that nlp_preprocess() calls via nlp.pipe().
import en_core_web_lg
nlp = en_core_web_lg.load()

In [3]:
def nlp_preprocess(df,col):
    """
    Clean and lemmatize a text column, dropping rows whose text is too short.

    The input dataframe is left untouched; all work happens on a copy.

    Parameters:
        df: dataframe, dataframe with sentence column
        col: string, column name which is to be converted
    Output:
        dataframe, independent copy of `df` with two extra columns:
            'lower_<col>'      - lower-cased text in which every run of characters
                                 other than letters and apostrophes is replaced by
                                 a single space
            'Lemmatized_<col>' - space-joined lemmas of the cleaned text
        Rows that yield fewer than three lemmas are removed.
    """
    out = df.copy()
    lower_col = 'lower_' + col
    lemma_col = 'Lemmatized_' + col

    ## Lower case the column and blank out everything except letters and apostrophes.
    ## str(x) means missing values are processed as their string form (e.g. 'nan').
    out[lower_col] = out[col].apply(lambda x: re.sub("[^A-Za-z']+",' ',str(x).lower()))
    txt = out[lower_col].tolist()

    lemmatized = []
    keep = []
    ## NER and the dependency parser are switched off; only lemmas are read below.
    ## total= lets tqdm show a percentage/ETA instead of a bare counter.
    for doc in tqdm(nlp.pipe(txt,disable=['ner','parser']), total=len(txt)):
        ## NOTE(review): '-PRON-' is the placeholder lemma spaCy 2.x assigns to
        ## pronouns; newer spaCy versions may never emit it - confirm against the
        ## installed version.
        temp = [tok.lemma_ for tok in doc if tok.lemma_!='-PRON-']
        lemmatized.append(' '.join(temp))
        ## Keep only sentences where we have enough words
        keep.append(len(temp)>2)

    out[lemma_col] = lemmatized
    ## Remove rows where sentences were very short; a boolean mask is used instead of
    ## writing a numeric sentinel into a text column. copy() hands the caller an
    ## independent frame rather than a slice.
    return out.loc[keep].copy()

In [4]:
## Read the movie-plot CSV; the path is relative to the notebook's working directory
df = pd.read_csv(r'wiki_movie_plots_deduped.csv')

In [5]:
## Peek at the first two rows to check the columns that were loaded
df.head(2)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."


In [6]:
## (rows, columns) of the raw frame
df.shape

(34886, 8)

In [7]:
## Work on a copy so the raw frame `df` stays as loaded
test = df.copy()

In [8]:
## Clean + lemmatize the 'Plot' column and report how long the pass took
started = time()
test = nlp_preprocess(test, 'Plot')
elapsed_mins = round((time() - started) / 60, 2)

print('Time to clean up everything: {} mins'.format(elapsed_mins))

34886it [07:57, 73.08it/s] 

Time to clean up everything: 8.03 mins





In [9]:
## Inspect the two columns added by nlp_preprocess (lower_Plot, Lemmatized_Plot)
test.head(2)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,lower_Plot,Lemmatized_Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",a bartender is working at a saloon serving dri...,a bartender be work at a saloon serve drink to...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",the moon painted with a smiling face hangs ove...,the moon paint with a smile face hang over a p...


In [10]:
## Number of CPU cores on this machine; used further down to size the training thread pool
cores = multiprocessing.cpu_count()
print(cores)

## Tokenise each lemmatized plot on whitespace.
## Only real strings are split, so non-text placeholders (None, NaN, 0) can never
## reach .split() and raise.
sent = [i.split() for i in test['Lemmatized_Plot'] if isinstance(i, str)]

8


In [11]:
## Fit the phrase (bigram) detector on the tokenised plots and report wall-clock time
started = time()
phrases = Phrases(
    sent,
    min_count=15,
    threshold=0.5,
    scoring='npmi',
    progress_per=10000,
)
elapsed_mins = round((time() - started) / 60, 2)
print('Time to run Phraser: {} mins'.format(elapsed_mins))

Time to run Phraser: 0.28 mins


In [12]:
## Export the trained detector into its frozen Phraser form for applying to text
bigram = Phraser(phrases)

## Apply the phraser once and keep the result in memory.
## `bigram[sent]` on a whole corpus yields a lazy stream that re-runs phrase
## detection on every pass, and the model cells below iterate over the corpus
## once for build_vocab plus once per training epoch.
sentences = [bigram[doc] for doc in sent]

## Save bigram phraser
bigram.save(r'../Pretrained Models/moviePlotBigrams.phrases')

In [13]:
## FastText model - set up parameters
## min_count - ignore words with frequency lower than this
## window - maximum distance between current and predicted word in a sentence
## vector_size - dimensionality of feature vector
## sg - use skip-gram
## alpha - initial learning rate
## min_alpha - learning rate will drop to min_alpha as training progresses
## negative - number of noise words drawn for negative sampling
## workers - training threads; leave one core free but never drop below one,
##           otherwise a single-core machine would request zero workers
fasttext_model = ft(min_count=20, window=2, vector_size=300, sg=1, alpha=0.03, min_alpha=0.0007, negative=20, workers=max(1, cores-1))

In [14]:
## Scan the phrase-merged corpus once to build the vocabulary and initialise the
## model, then report how long the scan took
started = time()
fasttext_model.build_vocab(sentences, progress_per=10000)
elapsed_mins = round((time() - started) / 60, 2)
print('Time to run build vocab: {} mins'.format(elapsed_mins))

Time to run build vocab: 0.31 mins


In [15]:
## Fit the FastText model for 30 epochs over the corpus counted by build_vocab,
## then report the wall-clock training time
started = time()
fasttext_model.train(
    sentences,
    total_examples=fasttext_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_mins = round((time() - started) / 60, 2)
print('Time to run train model: {} mins'.format(elapsed_mins))

Time to run train model: 44.93 mins


In [16]:
## Sanity check: nearest neighbours of 'fight' in the trained vector space,
## returned as (word, similarity) pairs
fasttext_model.wv.most_similar('fight')

[('battle', 0.6300743818283081),
 ('fighting', 0.5872423648834229),
 ('defeat', 0.5312903523445129),
 ('confrontation', 0.5278085470199585),
 ('gunfight', 0.5182511210441589),
 ('shootout', 0.5081945657730103),
 ('fistfight', 0.5036159157752991),
 ('brawl', 0.49870991706848145),
 ('showdown', 0.4659406840801239),
 ('attack', 0.45887643098831177)]

In [None]:
## Save fasttext model
## NOTE(review): .save() writes gensim's own serialisation format even though the
## file is named .bin - presumably it has to be reloaded with gensim's FastText.load
## rather than a Facebook-binary loader; confirm before sharing the file.
fasttext_model.save(r'../Pretrained Models/moviePlotFasttextModel.bin')