# Setup

In [1]:
!pip install langdetect
!pip install names-dataset



In [2]:
import pickle
import os 
import numpy as np
import pandas as pd

# Preprocessing
from langdetect import detect
from tqdm import tqdm_notebook
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from names_dataset import NameDataset
from nltk.corpus import stopwords
from itertools import chain 

# LDA
from gensim.models import Phrases
from gensim import corpora
from gensim import models


In [3]:
from nltk import download
download('averaged_perceptron_tagger')
download('wordnet')
download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nearchospotamitis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nearchospotamitis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nearchospotamitis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
pd.set_option('display.max_colwidth', None)
tqdm_notebook().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook().pandas()


0it [00:00, ?it/s]

# Preprocessing  <a id='preprocess'></a>

Already done and saved in `data/plot_summaries_aug.pkl` (takes about 45 minutes). Can be skipped by moving to the next [part](#lda)

In [None]:
data = pd.read_csv('data/plot_summaries.txt', sep="\t", header=None,names=["WikiMovieID", "Plot"] )
display(data.head(3))

## Cleaning

In [None]:
data['lang'] = data.Plot.progress_map(detect)

In [None]:
data.lang.value_counts()

In [None]:
data = data.loc[data.lang=='en']

## Tokenization

In [None]:
data['sentences'] = data.Plot.progress_map(sent_tokenize)

In [None]:
data['tokens_sentences'] = data['sentences'].progress_map(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

## Lemmatization

In [None]:
data['POS_tokens'] = data['tokens_sentences'].progress_map(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])

In [None]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
    
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatizing each word with its POS tag, in each sentence
data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

In [None]:
os.makedirs('data', exist_ok=True)  
data.to_pickle('data/plot_summaries_mid.pkl')

## Regrouping tokens and removing stop words

In [5]:
data = pd.read_pickle('data/plot_summaries_mid.pkl')

---
**Names**

In [6]:
nd = NameDataset()
names = nd.get_top_names(n=2000, country_alpha2="US")
names = names["US"]["M"] + names["US"]["F"]
names = [name.lower() for name in names]

---

In [7]:
#stops_1 = ['use', 'take', 'one', 'find', 'kill', 'two', 'leave', 'however', 'life', 'also', 'make', 'play', 'go', 'get', 'tell', 'work', 'friend', 'woman', 'wife', 'husband', 'meet']
#stops_2 = ['return', 'family', 'love', 'see', 'ask', 'house', 'try', 'back', 'escape', 'run']
#stops_3 = ['discover', 'man', 'help', 'attack', 'give', 'show', 'want', 'end', 'become', 'child', 'come']
#stops_4 = ['police', 'decide', 'fight', 'father', 'son', 'fall', 'know', 'daughter', 'mother', 'home', 'live', 'say', 'start', 'group']
#stops = stops_1 + stops_2 + stops_3 + names

stops = ["one", "two", "also", "see", "take", "get", "find", "try", "however", "go", "come", "leave", "become", "make", "back", "run"]
my_stopwords = stopwords.words('English') + names + stops

data['tokens'] = data['tokens_sentences_lemmatized'].progress_map(lambda sentences: list(chain.from_iterable(sentences)))

  0%|          | 0/42276 [00:00<?, ?it/s]

In [8]:
data['tokens'] = data['tokens'].progress_map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])

  0%|          | 0/42276 [00:00<?, ?it/s]

## Saving our results

# LDA <a id='lda'></a>
Loading the preprocessed data (in case [preprocessing](#preprocess) part is not run)

## Data preparation
### Prepare bi-grams and tri-grams

In [9]:
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

### Prepare objects for LDA gensim implementation

In [10]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

## Implementation

In [11]:
np.random.seed(123456)
num_topics = 20
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

CPU times: user 1min 48s, sys: 382 ms, total: 1min 49s
Wall time: 1min 36s


# Assigning topics to plots

In [12]:
topics = dict()
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    topics[i]=topic

## Example

In [13]:
data.Plot[5]

"The president is on his way to give a speech. While he is traveling there a man shows up with a camera. A reporter tries to ask a member of the secret service a question. When the president enters he is shot by the man with the camera. The president's main bodyguard, Alex Thomas , is grazed by the bullet that hits the president. The shooter is gunned down by Alex and other secret service agents. The president dies at the hospital. Kate Crawford , an investigative journalist, starts asking questions about the assassination. Anyone she questions is killed. She goes to Alex Thomas's house to tell him what is happening. As they head to his boat, Thomas sees some men hiding in the bushes. He throws Kate into the water and dives in. Thomas jumps out of the water to kill two of the hitmen while a third hitman drives off to inform his boss what happened. They are able to link the hitmen to a man called Jack Baldwin . Agent Thomas and other Secret Service members attack the location of Jack Ba

In [14]:
corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]

In [15]:
def takeSecond(elem):
    return elem[1]

def most_probable_topic(possible_topics):
    possible_topics.sort(reverse=True, key = takeSecond)
    return possible_topics[0][0]

chosen_topics = list()
for plot in tqdm(corpus):
    possible_topics = lda_model[plot]
    chosen_topic = most_probable_topic(possible_topics)
    chosen_topics.append(chosen_topic)

chosen_topics = np.array(chosen_topics)

100%|███████████████████████████████████| 42276/42276 [00:18<00:00, 2344.77it/s]


In [16]:
topics_df = pd.DataFrame(chosen_topics, columns = ["Topic"])

topics_df["Index"] = topics_df.index

display(topics_df.head())

Unnamed: 0,Topic,Index
0,16,0
1,6,1
2,16,2
3,1,3
4,16,4


In [17]:
data.drop(columns=["lang", "sentences", "tokens_sentences", "POS_tokens","tokens_sentences_lemmatized", "tokens"], inplace=True)

data["Index"] = data.index

data = data.merge(topics_df, left_on="Index", right_on="Index").drop(columns=["Index"])

display(data.head(2))

Unnamed: 0,WikiMovieID,Plot,Topic
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",16
1,31186339,"The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the ""Career"" tributes who train intensively at special academies and almost always win. During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss. She is outraged, believing it to be a ploy to gain audience support, as ""sponsors"" may provide in-Games gifts of food, medicine, and tools. However, she discovers Peeta meant what he said. The televised Games begin with half of the tributes killed in the first few minutes; Katniss barely survives ignoring Haymitch's advice to run away from the melee over the tempting supplies and weapons strewn in front of a structure called the Cornucopia. Peeta forms an uneasy alliance with the four Careers. They later find Katniss and corner her up a tree. Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch. Katniss drops it on her sleeping besiegers. They all scatter, except for Glimmer, who is killed by the insects. Hallucinating due to tracker jacker venom, Katniss is warned to run away by Peeta. Rue cares for Katniss for a couple of days until she recovers. Meanwhile, the alliance has gathered all the supplies into a pile. Katniss has Rue draw them off, then destroys the stockpile by setting off the mines planted around it. Furious, Cato kills the boy assigned to guard it. As Katniss runs from the scene, she hears Rue calling her name. She finds Rue trapped and releases her. Marvel, a tribute from District 1, throws a spear at Katniss, but she dodges the spear, causing it to stab Rue in the stomach instead. Katniss shoots him dead with an arrow. She then comforts the dying Rue with a song. Afterward, she gathers and arranges flowers around Rue's body. When this is televised, it sparks a riot in Rue's District 11. President Snow summons Seneca Crane, the Gamemaker, to express his displeasure at the way the Games are turning out. Since Katniss and Peeta have been presented to the public as ""star-crossed lovers"", Haymitch is able to convince Crane to make a rule change to avoid inciting further riots. It is announced that tributes from the same district can win as a pair. Upon hearing this, Katniss searches for Peeta and finds him with an infected sword wound in the leg. She portrays herself as deeply in love with him and gains a sponsor's gift of soup. An announcer proclaims a feast, where the thing each survivor needs most will be provided. Peeta begs her not to risk getting him medicine. Katniss promises not to go, but after he falls asleep, she heads to the feast. Clove ambushes her and pins her down. As Clove gloats, Thresh, the other District 11 tribute, kills Clove after overhearing her tormenting Katniss about killing Rue. He spares Katniss ""just this time...for Rue"". The medicine works, keeping Peeta mobile. Foxface, the girl from District 5, dies from eating nightlock berries she stole from Peeta; neither knew they are highly poisonous. Crane changes the time of day in the arena to late at night and unleashes a pack of hound-like creatures to speed things up. They kill Thresh and force Katniss and Peeta to flee to the roof of the Cornucopia, where they encounter Cato. After a battle, Katniss wounds Cato with an arrow and Peeta hurls him to the creatures below. Katniss shoots Cato to spare him a prolonged death. With Peeta and Katniss apparently victorious, the rule change allowing two winners is suddenly revoked. Peeta tells Katniss to shoot him. Instead, she gives him half of the nightlock. However, before they can commit suicide, they are hastily proclaimed the victors of the 74th Hunger Games. Haymitch warns Katniss that she has made powerful enemies after her display of defiance. She and Peeta return to District 12, while Crane is locked in a room with a bowl of nightlock berries, and President Snow considers the situation.",6


In [18]:
data.Topic.value_counts()

2     11285
3      6411
16     4616
6      4453
19     3948
0      3047
11     1729
15     1373
7      1255
13     1064
17     1012
1       936
5       466
8       175
18      141
9       134
12       83
10       71
14       36
4        14
Name: Topic, dtype: int64

---

# Results

In [19]:
topics_ = dict()
for i,topic in lda_model.show_topics(formatted=False, num_topics=num_topics, num_words=20):
    topics_[i]=topic

In [20]:
"""
Counts for each word the number of topics that include it
"""
def count_words(topics_):
    counts = dict()
    for i in topics_:
        for word in topics_[i]:
            if word[0] in counts:
                counts[word[0]]+=1
            else:
                counts[word[0]]=1
    return counts

In [21]:
counts = count_words(topics_)
repeated_words = [(k,v) for k,v in counts.items() if v>=4]
repeated_words.sort(reverse=True, key = takeSecond)
repeated_words

[('meet', 4),
 ('return', 4),
 ('time', 4),
 ('end', 4),
 ('kill', 4),
 ('escape', 4),
 ('fight', 4)]

In [22]:
for i in topics_:
    message = "Topic {} : ".format(i) 
    words = str([k for (k,v) in topics_[i]])
    print(message+words)
    print()

Topic 0 : ['family', 'father', 'child', 'daughter', 'brother', 'marry', 'mother', 'fall', 'girl', 'sister', 'house', 'live', 'village', 'wife', 'uncle', 'know', 'help', 'meet', 'home', 'story']

Topic 1 : ['boy', 'money', 'car', 'kid', 'race', 'gang', 'pay', 'town', 'steal', 'work', 'sell', 'give', 'big', 'horse', 'owner', 'new', 'bank', 'decide', 'buy', 'old']

Topic 2 : ['mother', 'father', 'life', 'friend', 'meet', 'work', 'family', 'decide', 'relationship', 'home', 'wife', 'live', 'parent', 'return', 'husband', 'woman', 'marry', 'time', 'new', 'end']

Topic 3 : ['kill', 'police', 'escape', 'murder', 'shoot', 'men', 'tell', 'help', 'plan', 'meet', 'reveal', 'arrest', 'fight', 'gang', 'brother', 'name', 'death', 'gun', 'later', 'arrive']

Topic 4 : ['camp', 'japanese', 'bowen', 'expand_section', 'japan', 'circus', 'lion', 'tokyo', 'mace', 'sullivan', 'whitey', 'toshio', 'africa', 'preacher', 'tarzan', 'finch', 'pa', 'godfrey', 'oz', 'mississippi']

Topic 5 : ['woman', 'room', 'sex', 

## Top 10 most frequent topics

In [34]:
topic_counts = data.Topic.value_counts()
for i in topic_counts.index[:10]:
    message = "Topic {} : ".format(i) 
    words = str([k for (k,v) in topics_[i]])
    print(message+words + "({})".format(topic_counts[i]))
    print()

Topic 2 : ['mother', 'father', 'life', 'friend', 'meet', 'work', 'family', 'decide', 'relationship', 'home', 'wife', 'live', 'parent', 'return', 'husband', 'woman', 'marry', 'time', 'new', 'end'](11285)

Topic 3 : ['kill', 'police', 'escape', 'murder', 'shoot', 'men', 'tell', 'help', 'plan', 'meet', 'reveal', 'arrest', 'fight', 'gang', 'brother', 'name', 'death', 'gun', 'later', 'arrive'](6411)

Topic 16 : ['life', 'wife', 'people', 'story', 'woman', 'work', 'death', 'lead', 'time', 'case', 'involve', 'film', 'family', 'doctor', 'even', 'know', 'state', 'place', 'whose', 'city'](4616)

Topic 6 : ['tell', 'say', 'ask', 'house', 'call', 'give', 'home', 'want', 'show', 'next', 'later', 'night', 'friend', 'talk', 'look', 'time', 'know', 'arrive', 'start', 'think'](4453)

Topic 19 : ['use', 'kill', 'destroy', 'world', 'attack', 'fight', 'escape', 'human', 'earth', 'discover', 'save', 'return', 'power', 'time', 'monster', 'force', 'reveal', 'battle', 'capture', 'group'](3948)

Topic 0 : ['fa

Top 10 topics include a few random topics (16,13,7)

## Visualization

In [None]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)