# Setup

In [1]:
import pandas as pd

# Show full cell contents (no truncation) so long plot summaries are readable.
pd.options.display.max_colwidth = None

# Preprocessing  <a id='preprocess'></a>

The preprocessing below has already been run and its result saved to `data/plot_summaries_aug.pkl` (a full run takes about 45 minutes). To skip it, jump straight to the [LDA section](#lda).

In [None]:
# Read the raw plot summaries: tab-separated, no header row, two columns.
data = pd.read_csv(
    'data/plot_summaries.txt',
    sep="\t",
    header=None,
    names=["WikiMovieID", "Plot"],
)
display(data.head(3))

## Cleaning

In [None]:
!pip install langdetect

In [None]:
from langdetect import DetectorFactory, detect
from tqdm import tqdm_notebook  # not used in the visible cells; kept so existing references keep working
from tqdm.auto import tqdm

# langdetect is non-deterministic by default; a fixed seed makes the detected
# language of each plot (and thus the English filter below) reproducible.
DetectorFactory.seed = 0

# Registers `progress_map` / `progress_apply` on pandas objects. Without this
# call, every `.progress_map(...)` below raises AttributeError on a fresh kernel.
tqdm.pandas()

In [None]:
# Detect the language of every plot summary (with a progress bar).
data['lang'] = data['Plot'].progress_map(detect)

In [None]:
# Number of plots per detected language.
data['lang'].value_counts()

In [None]:
# Keep only the plots detected as English. The explicit .copy() makes `data`
# an independent frame, so the columns added in the following cells do not
# trigger pandas' SettingWithCopyWarning. The original index labels are kept.
data = data.loc[data.lang == 'en'].copy()

## Tokenization

In [None]:
from nltk.tokenize import sent_tokenize

# Split each plot into a list of sentences.
data['sentences'] = data['Plot'].progress_map(sent_tokenize)

In [None]:
from nltk.tokenize import word_tokenize


def _tokenize_sentences(sentences):
    """Return one list of word tokens per sentence in `sentences`."""
    return [word_tokenize(sentence) for sentence in sentences]


# One list of tokens per sentence, per plot.
data['tokens_sentences'] = data['sentences'].progress_map(_tokenize_sentences)

## Lemmatization

In [None]:
import nltk
# Fetch the model used by `nltk.pos_tag` in the next cell.
nltk.download('averaged_perceptron_tagger')

In [None]:
from nltk import pos_tag


def _tag_sentences(tokens_sentences):
    """Return, for each tokenized sentence, its list of (token, POS tag) pairs."""
    return [pos_tag(tokens) for tokens in tokens_sentences]


# POS-tag every sentence of every plot.
data['POS_tokens'] = data['tokens_sentences'].progress_map(_tag_sentences)

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields the empty string, which callers use as
    "no WordNet equivalent".
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return ''

from nltk.stem.wordnet import WordNetLemmatizer
# Single shared lemmatizer instance, used by the lemmatization cell below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Fetch the WordNet data (and the Open Multilingual WordNet package) before the
# lemmatizer is first used.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Lemmatizing each word with its POS tag, in each sentence
data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

## Regrouping tokens and removing stop words

In [None]:
from nltk.corpus import stopwords
from itertools import chain 

#stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can']
#stopwords_other = ['one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also', 'copyright', 'something']
#my_stopwords = stopwords.words('English') + stopwords_verbs + stopwords_other

# Merge the per-sentence token lists of each plot into one flat token list.
data['tokens'] = data['tokens_sentences_lemmatized'].progress_map(
    lambda sentences: [token for sentence in sentences for token in sentence]
)

In [None]:
# Load the stop-word list once, into a set. The original re-read the list from
# disk and scanned it linearly for every single token, which dominated the
# runtime of this cell. The corpus file id is the lowercase 'english'; the
# capitalised spelling only resolves on case-insensitive filesystems.
english_stopwords = frozenset(stopwords.words('english'))

# Keep lowercase, purely alphabetic tokens longer than one character that are
# not stop words.
data['tokens'] = data['tokens'].progress_map(
    lambda tokens: [
        token.lower()
        for token in tokens
        if token.isalpha() and len(token) > 1 and token.lower() not in english_stopwords
    ]
)

## Saving our results

In [None]:
import pickle
import os

# Persist the augmented frame so the LDA part can start from it directly.
pickle_path = 'data/plot_summaries_aug.pkl'
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
data.to_pickle(pickle_path)

# LDA <a id='lda'></a>
Load the preprocessed data (needed when the [preprocessing](#preprocess) section above was skipped).

In [2]:
# Reload the frame written at the end of the preprocessing section.
# NOTE(review): unpickling executes arbitrary code — only load a file produced by this notebook.
data = pd.read_pickle('data/plot_summaries_aug.pkl')

## Data preparation
### Prepare bi-grams and tri-grams

In [4]:
from gensim.models import Phrases

In [5]:
# One token list per plot, in row order.
unigram_docs = data['tokens'].tolist()

# First pass learns bi-grams; the second pass runs on the bi-grammed documents
# so that it can merge them further into tri-grams.
bigram_model = Phrases(sentences=unigram_docs)
trigram_model = Phrases(sentences=bigram_model[unigram_docs], min_count=1)

# Final documents: tokens with detected bi-/tri-grams merged.
tokens = list(trigram_model[bigram_model[unigram_docs]])

### Prepare objects for LDA gensim implementation

In [6]:
from gensim import corpora

In [7]:
# Vocabulary over all documents, then drop tokens appearing in fewer than 3 documents.
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)

# Bag-of-words representation of every document, in the same order as `tokens`.
corpus = list(map(dictionary_LDA.doc2bow, tokens))

## Implementation

In [8]:
from gensim import models
import numpy as np

In [66]:
np.random.seed(123456)
num_topics = 20
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=2, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

CPU times: user 1min 9s, sys: 467 ms, total: 1min 9s
Wall time: 1min


# Results

In [67]:
# Map each topic id to its formatted "weight*word + ..." description (top 20 words).
topics = {
    topic_id: description
    for topic_id, description in lda_model.show_topics(
        formatted=True, num_topics=num_topics, num_words=20
    )
}

## Example

In [68]:
# Positional lookup: `corpus` was built from `data['tokens'].tolist()`, so
# corpus[5] belongs to the 6th remaining row. `data` kept its original index
# labels after the English filter, so a label lookup (`data.Plot[5]`) could
# return a different plot or raise KeyError.
data.Plot.iloc[5]

"The president is on his way to give a speech. While he is traveling there a man shows up with a camera. A reporter tries to ask a member of the secret service a question. When the president enters he is shot by the man with the camera. The president's main bodyguard, Alex Thomas , is grazed by the bullet that hits the president. The shooter is gunned down by Alex and other secret service agents. The president dies at the hospital. Kate Crawford , an investigative journalist, starts asking questions about the assassination. Anyone she questions is killed. She goes to Alex Thomas's house to tell him what is happening. As they head to his boat, Thomas sees some men hiding in the bushes. He throws Kate into the water and dives in. Thomas jumps out of the water to kill two of the hitmen while a third hitman drives off to inform his boss what happened. They are able to link the hitmen to a man called Jack Baldwin . Agent Thomas and other Secret Service members attack the location of Jack Ba

In [69]:
# Topic distribution inferred for the document at position 5 of `corpus`,
# displayed as (topic id, weight) pairs.
lda_model[corpus[5]]

[(0, 0.02579133),
 (2, 0.08803686),
 (3, 0.03808829),
 (5, 0.45693526),
 (6, 0.24133606),
 (11, 0.06318434),
 (12, 0.08576671)]

In [70]:
# Top words of topic 5, the highest-weighted topic in the saved output for the example document.
topics[5]

'0.047*"michael" + 0.032*"thomas" + 0.025*"ray" + 0.019*"donald" + 0.019*"jesse" + 0.018*"elizabeth" + 0.016*"mac" + 0.015*"allison" + 0.013*"bond" + 0.012*"victor" + 0.011*"roger" + 0.010*"jonathan" + 0.010*"alan" + 0.010*"margaret" + 0.008*"evelyn" + 0.008*"pierre" + 0.008*"harold" + 0.007*"louise" + 0.007*"find" + 0.007*"diamond"'

## Visualization

In [71]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(
