In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd

In [35]:
df_popular=pd.read_csv("TopMovies.csv")
df_popular.head(5)

Unnamed: 0,country,year,movie
0,Germany,2014,The Hobbit: The Battle of the Five Armies
1,Germany,2014,Honig im Kopf
2,Germany,2014,Transformers: Age of Extinction
3,Germany,2014,The Hunger Games: Mockingjay - Part 1
4,Germany,2014,Qu'est-ce qu'on a fait au Bon Dieu?


In [36]:
#subsetting dataframe for 5 countries
df_us = df_popular[df_popular.country=='United States']
df_ge = df_popular[df_popular.country=='Germany']
df_uk = df_popular[df_popular.country=='United Kingdom']
df_jp = df_popular[df_popular.country=='Japan']
df_hk = df_popular[df_popular.country=='Hong Kong']

In [37]:
df_us.shape, df_ge.shape, df_uk.shape, df_jp.shape, df_hk.shape

((1300, 3), (1300, 3), (1300, 3), (1300, 3), (1300, 3))

In [38]:
#load wikipedia plots
years = range(2002, 2015)

wiki = pd.DataFrame()
for year in years:
    path ='wikipedia_plots_%d.csv' % year
    frame = pd.read_table(path)

    frame['year'] = year
    wiki = wiki.append(frame, ignore_index=True)

In [39]:
wiki.head(5)

Unnamed: 0.1,Unnamed: 0,year,title,languages,countries,released,gross,comment,abstract,dbpediaLink,wikipediaLink,wiki_plot
0,0,2002,Thulluvadho Ilamai,Tamil,India,2002-05-10,,Thulluvadho Ilamai (English: Exuberance of You...,Thulluvadho Ilamai (English: Exuberance of You...,http://dbpedia.org/resource/Thulluvadho_Ilamai,http://en.wikipedia.org/wiki/Thulluvadho_Ilamai,Mahesh (Dhanush) is the son of a poor fisherma...
1,1,2002,The Anarchist Cookbook,English,United States,,14369.0,The Anarchist Cookbook is a 2002 American roma...,The Anarchist Cookbook is a 2002 American roma...,http://dbpedia.org/resource/The_Anarchist_Cook...,http://en.wikipedia.org/wiki/The_Anarchist_Coo...,Puck's introduction[edit]The story is narrated...
2,2,2002,Histoire de Pen,French,Canada,,,Histoire de Pen (also released internationally...,Histoire de Pen (also released internationally...,http://dbpedia.org/resource/Histoire_de_Pen,http://en.wikipedia.org/wiki/Histoire_de_Pen,
3,3,2002,War and Peace,"English, Hindi",India,2002,,War and Peace (Jang Aur Aman) is a 2002 Indian...,War and Peace (Jang Aur Aman) is a 2002 Indian...,http://dbpedia.org/resource/War_and_Peace_(200...,http://en.wikipedia.org/wiki/War_and_Peace_(20...,The documentary begins with the assassination ...
4,4,2002,"Que sera, sera",Portuguese,Brazil,,,"Que sera, sera (Portuguese: Seja o que Deus Qu...","Que sera, sera (Portuguese: Seja o que Deus Qu...","http://dbpedia.org/resource/Que_sera,_sera_(film)","http://en.wikipedia.org/wiki/Que_sera,_sera_(f...",


In [53]:
wiki.drop(wiki.columns[[0,3,4,5,6,7,8,9,10]], axis=1, inplace=True)

In [54]:
wiki.head(5)

Unnamed: 0,year,title,wiki_plot
0,2002,Thulluvadho Ilamai,Mahesh (Dhanush) is the son of a poor fisherma...
1,2002,The Anarchist Cookbook,Puck's introduction[edit]The story is narrated...
2,2002,Histoire de Pen,
3,2002,War and Peace,The documentary begins with the assassination ...
4,2002,"Que sera, sera",


In [57]:
#get plots for US top movies
us_top_movies = df_us['movie'].tolist()
df_us_wiki = wiki[wiki['title'].isin(us_top_movies)]
df_us_wiki.head(10)

Unnamed: 0,year,title,wiki_plot
14,2002,Harry Potter and the Chamber of Secrets,Further information: Plot of the novelHarry Po...
20,2002,The Sweetest Thing,"In an opening scene, a group of men are interv..."
60,2002,Jonah: A VeggieTales Movie,Bob the Tomato is driving Dad Asparagus and so...
86,2002,Windtalkers,"During World War II, USMC Cpl. Joseph F. 'Joe'..."
91,2002,The Master of Disguise,\r\n\r\n\r\n\r\n\r\nThis article's plot summar...
123,2002,Showtime,"The film centers on two cops, Det. Mitch Prest..."
164,2002,Enough,The film begins in a Los Angeles diner where a...
171,2002,The Count of Monte Cristo,\r\n\r\n\r\n\r\n\r\nThis article's plot summar...
174,2002,Crossroads,"Lucy, Kit, and Mimi are three friends who live..."
175,2002,Lilo & Stitch,Dr. Jumba Jookiba is arrested and put on trial...


In [56]:
df_us_wiki.shape #136 wiki plots are not available

(1164, 3)

In [61]:
#check missing plots with NaN
missing = df_us_wiki[pd.isnull(df_us_wiki).any(axis=1)]
missing.shape #total 136+27 = 163 wiki plots are not available

(27, 3)

In [66]:
#remove NaN rows
df_us_wiki= df_us_wiki[pd.notnull(df_us_wiki['wiki_plot'])]

In [68]:
df_us_wiki.shape

(1137, 3)

In [70]:
#make raw text as list 
documents = df_us_wiki['wiki_plot'].tolist()

In [71]:
#tokenization
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
raw_text = []
for doc in documents:
    tokens = tokenizer.tokenize(doc.lower())
    raw_text.append(tokens)

In [74]:
#remove stopwords
en_stop = get_stop_words('en')
stopped_tokens = []
for token in raw_text:
    cleaned_token = [i for i in token if not i in en_stop]
    stopped_tokens.append(cleaned_token)

In [77]:
len(raw_text[0]), len(stopped_tokens[0])

(770, 468)

In [80]:
#Stemming words
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
texts = []
for token in stopped_tokens:
    texts.append([p_stemmer.stem(i) for i in token])
        

In [83]:
#Construct a document-term matrix
from gensim import corpora, models
dictionary = corpora.Dictionary(texts)

In [116]:
print(dictionary)

Dictionary(22165 unique tokens: [u'fawn', u'circuitri', u'fawk', u'woodi', u'sonja']...)


In [112]:
#print(dictionary.token2id)

In [85]:
#Convert the dictionary to a bag-of-words
corpus = [dictionary.doc2bow(text) for text in texts] #(term id, term frequency)

In [89]:
#Applying LDA model
lda =  models.LdaModel(corpus,id2word=dictionary, num_topics=20, passes=20)

In [117]:
lda.print_topics(num_topics=20, num_words=5)

[(0, u'0.020*s + 0.007*find + 0.006*ben + 0.005*max + 0.005*hous'),
 (1, u'0.017*s + 0.007*kill + 0.005*find + 0.005*dale + 0.004*salt'),
 (2, u'0.020*s + 0.005*jim + 0.005*tell + 0.004*find + 0.004*john'),
 (3, u'0.026*s + 0.008*dave + 0.005*kill + 0.005*find + 0.004*reveal'),
 (4, u'0.023*s + 0.005*tom + 0.004*make + 0.004*get + 0.004*find'),
 (5, u'0.026*s + 0.005*will + 0.005*find + 0.004*father + 0.004*take'),
 (6, u'0.022*s + 0.005*team + 0.005*kill + 0.004*one + 0.004*ben'),
 (7, u'0.024*s + 0.005*kill + 0.005*nick + 0.004*tell + 0.004*find'),
 (8, u'0.029*jack + 0.023*s + 0.007*greg + 0.006*carter + 0.004*ethan'),
 (9, u'0.020*s + 0.010*kill + 0.005*ryan + 0.005*find + 0.004*vampir'),
 (10, u'0.017*s + 0.007*stu + 0.005*dastan + 0.005*kill + 0.005*rodney'),
 (11, u'0.022*s + 0.006*kill + 0.005*alex + 0.005*find + 0.005*will'),
 (12, u'0.026*s + 0.005*kill + 0.004*find + 0.004*take + 0.004*emili'),
 (13, u'0.021*s + 0.009*frank + 0.009*andi + 0.004*call + 0.004*reveal'),
 (14, u

In [126]:
#remove generic words
generics = [u's', u'z', u'john', u'ethan', u'will', u'emili', u'get', u'find', u'use', u'make', u'take', u'tom', u'dave', 
            u'jane', u'frank', u'tell', u'ben', u'plot', u'without', u'thing']

In [148]:
#remove proper-noun(names of characters)
f = open("2000names.txt")
lines = f.readlines()
lines = [x.strip() for x in lines]
names=[]
for w in lines:
    names.append(w.lower())
excludes = names + generics

In [153]:
new_texts = []
for plot in texts:
    new_words = [w for w in plot if w not in excludes]
    new_texts.append(new_words)
len(new_texts)

1137

In [154]:
new_dictionary = corpora.Dictionary(new_texts)

In [155]:
print(new_dictionary)

Dictionary(21196 unique tokens: [u'fawn', u'circuitri', u'fawk', u'chieko', u'woodi']...)


In [162]:
new_corpus = [new_dictionary.doc2bow(text) for text in new_texts] #(term id, term frequency)

In [163]:
len(new_corpus)

1137

In [165]:
lda2=models.LdaModel(new_corpus,id2word=new_dictionary, num_topics=20, passes=20)

In [166]:
lda2.print_topics(num_topics=20, num_words=5)

[(0, u'0.003*meet + 0.003*fire + 0.003*becom + 0.003*help + 0.003*can'),
 (1, u'0.006*time + 0.006*friend + 0.005*one + 0.005*go + 0.005*love'),
 (2, u'0.007*bourn + 0.004*gru + 0.004*vlad + 0.004*kill + 0.004*walt'),
 (3, u'0.014*kill + 0.007*agent + 0.007*bond + 0.007*escap + 0.005*reveal'),
 (4, u'0.005*billi + 0.004*team + 0.004*return + 0.004*kill + 0.004*j'),
 (5, u'0.007*kill + 0.006*sid + 0.005*manni + 0.004*leav + 0.004*attack'),
 (6, u'0.006*maggi + 0.006*car + 0.005*race + 0.005*kill + 0.004*hanna'),
 (7, u'0.011*kill + 0.006*alic + 0.004*one + 0.004*attack + 0.004*group'),
 (8, u'0.006*kill + 0.004*kenai + 0.004*bear + 0.004*return + 0.003*escap'),
 (9, u'0.004*marcu + 0.004*back + 0.004*later + 0.003*kill + 0.003*leav'),
 (10,
  u'0.010*harri + 0.007*larri + 0.006*game + 0.005*return + 0.005*exhibit'),
 (11, u'0.010*charli + 0.005*kill + 0.004*shrek + 0.004*codi + 0.004*man'),
 (12, u'0.006*kill + 0.005*dastan + 0.004*henri + 0.004*help + 0.003*order'),
 (13, u'0.008*kill 

In [167]:
lda3 = models.LdaModel(new_corpus,id2word=new_dictionary, num_topics=10, passes=40)

In [168]:
lda3.print_topics(num_topics=10, num_words=5)

[(0, u'0.005*one + 0.005*school + 0.004*friend + 0.004*new + 0.004*back'),
 (1, u'0.006*leav + 0.005*father + 0.005*love + 0.005*back + 0.004*home'),
 (2, u'0.007*kill + 0.004*return + 0.004*leav + 0.004*one + 0.003*two'),
 (3, u'0.012*kill + 0.005*polic + 0.004*leav + 0.004*shoot + 0.004*car'),
 (4, u'0.005*kill + 0.004*andi + 0.004*vampir + 0.004*one + 0.004*harri'),
 (5, u'0.008*kill + 0.005*reveal + 0.004*discov + 0.004*one + 0.003*escap'),
 (6, u'0.008*kill + 0.006*ship + 0.005*escap + 0.004*alien + 0.004*earth'),
 (7, u'0.009*kill + 0.004*time + 0.003*one + 0.003*attack + 0.003*return'),
 (8, u'0.004*kill + 0.004*man + 0.003*meet + 0.003*team + 0.003*one'),
 (9, u'0.007*alic + 0.006*kill + 0.004*larri + 0.004*escap + 0.004*slim')]

In [171]:
print(new_dictionary.token2id)

{u'fawn': 10538, u'circuitri': 4241, u'fawk': 1, u'chieko': 12926, u'woodi': 10507, u'osiri': 7576, u'yellow': 1483, u'elvi': 1555, u'askew': 12446, u'darlington': 15994, u'jihad': 18981, u'kadam': 20798, u'suzann': 6940, u'shibhu': 17688, u'gabriella': 10872, u'mp': 16604, u'penhal': 13188, u'pekkala': 14762, u'payoff': 11734, u'grenier': 12818, u'haqq': 11127, u'authoris': 11375, u'scold': 18129, u'salesgirl': 20165, u'emptiv': 7457, u'outwit': 19096, u'tnt': 19846, u'lore': 5223, u'lord': 26, u'tecumseh': 21048, u'undersecretari': 12069, u'saipan': 660, u'u2': 15458, u'digit': 4445, u'microbot': 21035, u'deli': 10420, u'rapson': 10351, u'linz': 20810, u'colossu': 21163, u'dell': 9251, u'triceratop': 21061, u'differenti': 18874, u'zaratho': 18877, u'alamut': 17721, u'lumin': 8180, u'dubmd': 10933, u'rearguard': 7879, u'foul': 11694, u'taj': 14480, u'delv': 14899, u'politician': 1758, u'cairncross': 20721, u'donkey': 9259, u'four': 2422, u'disturb': 6634, u'bittleman': 4658, u'prize':

In [172]:
#remove extreme words
new_dictionary.filter_extremes(no_below=1, no_above=0.8)

In [173]:
corpus_2=[new_dictionary.doc2bow(text) for text in new_texts]

In [174]:
%time
lda4 = models.LdaModel(corpus_2, num_topics=10, id2word=new_dictionary, passes=20)

Wall time: 0 ns


In [175]:
lda4.print_topics(num_topics=10)

[(0,
  u'0.009*kill + 0.005*alic + 0.004*escap + 0.004*memnon + 0.004*mathayu + 0.004*leav + 0.003*attack + 0.003*return + 0.003*one + 0.003*forc'),
 (1,
  u'0.008*kill + 0.004*one + 0.004*vampir + 0.003*reveal + 0.003*jimmi + 0.003*human + 0.003*time + 0.003*can + 0.003*escap + 0.003*return'),
 (2,
  u'0.008*kill + 0.004*hous + 0.004*one + 0.004*reveal + 0.004*back + 0.003*return + 0.003*leav + 0.003*discov + 0.003*escap + 0.003*becom'),
 (3,
  u'0.008*kill + 0.005*escap + 0.004*leav + 0.004*arriv + 0.004*one + 0.003*reveal + 0.003*return + 0.003*agent + 0.003*slim + 0.003*help'),
 (4,
  u'0.007*kill + 0.005*hous + 0.004*leav + 0.004*vampir + 0.003*reveal + 0.003*back + 0.003*return + 0.003*escap + 0.003*arriv + 0.003*charli'),
 (5,
  u'0.008*kill + 0.006*bond + 0.004*escap + 0.004*sid + 0.003*meet + 0.003*help + 0.003*leav + 0.003*reveal + 0.003*attack + 0.003*fight'),
 (6,
  u'0.006*kill + 0.003*return + 0.003*help + 0.003*discov + 0.003*reveal + 0.003*arriv + 0.003*tri + 0.003*leav

In [178]:
lda5 = models.LdaModel(corpus_2, num_topics=10, id2word=new_dictionary, chunksize=2000, passes=100)

In [179]:
lda5.print_topics(num_topics=10)

[(0,
  u'0.009*alic + 0.005*kill + 0.004*man + 0.004*white + 0.004*reveal + 0.004*king + 0.003*luci + 0.003*spider + 0.003*jesu + 0.003*shrek'),
 (1,
  u'0.006*kill + 0.005*marcu + 0.004*smurf + 0.004*sid + 0.004*human + 0.004*back + 0.003*save + 0.003*bear + 0.003*kenai + 0.003*tri'),
 (2,
  u'0.005*team + 0.004*reveal + 0.004*kill + 0.004*one + 0.004*game + 0.003*harri + 0.003*billi + 0.003*back + 0.003*leav + 0.003*two'),
 (3,
  u'0.006*ship + 0.006*kill + 0.006*alien + 0.006*earth + 0.005*destroy + 0.005*human + 0.004*blu + 0.004*attack + 0.004*discov + 0.004*escap'),
 (4,
  u'0.012*kill + 0.006*polic + 0.006*escap + 0.005*agent + 0.005*car + 0.005*leav + 0.004*shoot + 0.004*one + 0.003*help + 0.003*team'),
 (5,
  u'0.006*kill + 0.004*back + 0.004*escap + 0.003*leav + 0.003*return + 0.003*group + 0.003*boy + 0.003*help + 0.003*see + 0.003*attack'),
 (6,
  u'0.005*team + 0.004*kill + 0.004*wolf + 0.004*back + 0.004*father + 0.003*help + 0.003*return + 0.003*leav + 0.003*two + 0.003*

In [180]:
lda5 = models.LdaModel(corpus_2, num_topics=20, id2word=new_dictionary, chunksize=2000, passes=100)

In [181]:
lda5.print_topics()

[(13,
  u'0.024*charli + 0.006*shrek + 0.005*kill + 0.004*fiona + 0.004*return + 0.004*spirit + 0.004*new + 0.003*anim + 0.003*zuwani + 0.003*bolt'),
 (4,
  u'0.010*kill + 0.004*reveal + 0.004*harri + 0.003*white + 0.003*one + 0.003*selen + 0.003*viktor + 0.003*vampir + 0.003*help + 0.003*return'),
 (3,
  u'0.008*kill + 0.008*hanna + 0.007*witch + 0.007*salt + 0.005*alli + 0.005*kira + 0.005*english + 0.005*amsterdam + 0.004*cut + 0.004*dog'),
 (0,
  u'0.010*ship + 0.005*kill + 0.004*crew + 0.004*attack + 0.004*escap + 0.004*earth + 0.004*forc + 0.004*order + 0.003*command + 0.003*one'),
 (15,
  u'0.008*kill + 0.005*man + 0.004*reveal + 0.003*one + 0.003*return + 0.003*bear + 0.003*leav + 0.003*two + 0.003*attempt + 0.003*attack'),
 (2,
  u'0.007*father + 0.007*home + 0.007*leav + 0.007*love + 0.006*hous + 0.006*friend + 0.006*see + 0.006*one + 0.006*mother + 0.006*ask'),
 (8,
  u'0.010*wolf + 0.007*penguin + 0.005*emperor + 0.004*back + 0.004*hoover + 0.004*luci + 0.004*return + 0.004