Here, we'll prepare the data for visualization of popular topic distributions across some countries.

In [1]:
import itertools
import os

import numpy as np
import pandas as pd
from gensim import corpora, models, similarities, matutils
from unidecode import unidecode

In [2]:
# Data directory. Defaults to the original author's Dropbox folder, but can be
# overridden with the MOVIEMETA_DIR environment variable so the notebook also
# runs on other machines. os.path.join(..., "") guarantees the trailing
# separator that the `dropbox + 'file'` concatenations below rely on.
dropbox = os.path.join(os.environ.get("MOVIEMETA_DIR", "/Users/WSKIM/Dropbox/moviemeta/"), "")

We can construct a topic matrix from imdb data, as follows:

In [636]:
# Load the trained LDA model together with the corpus, dictionary and movie
# metadata stored next to it.
imdb_lda = models.LdaModel.load(dropbox + 'lda_imdb.model')
imdb_corpus = corpora.MmCorpus(dropbox + 'lda_imdb.corpus')
imdb_dict = corpora.Dictionary.load(dropbox + 'lda_imdb.dict')
imdb_meta_df = pd.read_csv(dropbox + 'imdb_meta_df.csv')


def _clean_title(title):
    """Transliterate a title to ASCII and cut it at the first '('.

    NOTE(review): a title that itself begins with '(' is reduced to an empty
    string by this rule -- confirm whether such titles matter for the matching.
    """
    return unidecode(title).split('(')[0].strip()


# Dense topic matrix with one row per topic (num_terms=30) and one column per
# document; passing num_docs matches the second load further down and lets
# gensim pre-allocate the array instead of growing it document by document.
imdb_topic_matrix = matutils.corpus2dense(imdb_lda[imdb_corpus], num_terms=30,
                                          num_docs=len(imdb_corpus))
# Transpose so that each row is a movie and columns 0~29 are the topics,
# then attach the metadata columns side by side.
imdb_topic_df = pd.DataFrame(imdb_topic_matrix.T)
imdb_topic_df = pd.concat([imdb_topic_df, imdb_meta_df], axis=1)
# Same title normalisation for both frames, defined once.
imdb_topic_df['title'] = imdb_topic_df['title'].apply(_clean_title)
imdb_meta_df['title'] = imdb_meta_df['title'].apply(_clean_title)

In [640]:
imdb_meta_df.head()

Unnamed: 0.1,Unnamed: 0,title,year
0,0,#1 Cheerleader Camp,2010
1,1,#1 Serial Killer,2013
2,2,#1 at the Apocalypse Box Office,2015
3,3,#137,2011
4,4,#29,2012


In [637]:
imdb_topic_df.shape

(259028, 33)

Now we have a dataframe holding the topic matrix for all movies in IMDb: columns 0–29 represent the 30 topics, and the value in each column indicates the relative prevalence of that topic in each movie.

In [638]:
imdb_topic_df.head()

Unnamed: 0.1,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,Unnamed: 0,title,year
0,0,0,0,0,0.0,0.098301,0,0.0,0,0,...,0.0,0.658098,0.0,0.0,0.0,0,0,0,#1 Cheerleader Camp,2010
1,0,0,0,0,0.037337,0.0,0,0.800634,0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,#1 Serial Killer,2013
2,0,0,0,0,0.0,0.0,0,0.0,0,0,...,0.0,0.226889,0.0,0.0,0.069645,0,0,2,#1 at the Apocalypse Box Office,2015
3,0,0,0,0,0.0,0.291623,0,0.290505,0,0,...,0.0,0.0,0.035633,0.030289,0.0,0,0,3,#137,2011
4,0,0,0,0,0.0,0.0,0,0.516595,0,0,...,0.223817,0.0,0.107112,0.0,0.0,0,0,4,#29,2012


To create dataframes of top movies for each country, let's check the available number of top movie titles and plots data first. 

In [6]:
#Top movies for 8 countries (US, GE, UK, South Korea, Japan, Hong Kong, Russia, Mexico) 
topmovies_df=pd.read_csv(dropbox + "TopMovies_final.csv")

In [639]:
#Top movies for each country
def _rows_for(country_name):
    """Return the rows of topmovies_df whose country equals country_name."""
    return topmovies_df[topmovies_df['country'] == country_name]

kr_df = _rows_for("South Korea")      #available since 2007
jp_df = _rows_for("Japan")            #available since 2002
hk_df = _rows_for("Hong Kong")        #available since 2002
uk_df = _rows_for("United Kingdom")   #available since 2002
ge_df = _rows_for("Germany")          #available since 2002
us_df = _rows_for("United States")    #available since 2001
ru_df = _rows_for("Russia")
mx_df = _rows_for("Mexico")
# Show (rows, columns) of every per-country frame as one tuple.
tuple(frame.shape for frame in (kr_df, jp_df, hk_df, uk_df, ge_df, us_df, ru_df, mx_df))

((2553, 3),
 (1300, 3),
 (1300, 3),
 (4781, 3),
 (4139, 3),
 (6835, 3),
 (1599, 3),
 (1599, 3))

In [641]:
#collect movies data from imdb_topic_df : check the minimum number of available data, since 2007
years = range(2007, 2015)
# (label, per-country top-movie frame) in the order the results are reported.
country_frames = [
    ("South Korea", kr_df),
    ("Japan", jp_df),
    ("Hong Kong", hk_df),
    ("United Kingdom", uk_df),
    ("Germany", ge_df),
    ("United States", us_df),
    ("Russia", ru_df),
    ("Mexico", mx_df),
]
available = []
for label, country_df in country_frames:
    shapes = []
    for year in years:
        wanted_titles = country_df[country_df['year'] == year].movie.tolist()
        # Both conditions go into ONE mask over imdb_meta_df. Chaining two
        # [] filters would apply a full-length mask to an already filtered
        # frame, which pandas has to re-align (and warns about).
        matched = imdb_meta_df[(imdb_meta_df['year'] == year)
                               & imdb_meta_df['title'].isin(wanted_titles)]
        shapes.append(matched.shape)
    available.append(shapes)
    # Single pre-formatted string: prints identically as a Python 2 print
    # statement and as a Python 3 print() call.
    print("Available movies for %s :  %s" % (label, shapes))

# Keep the per-country result names available to later cells.
(available_kr, available_jp, available_hk, available_uk,
 available_ge, available_us, available_ru, available_mx) = available

Available movies for South Korea :  [(74, 3), (68, 3), (63, 3), (65, 3), (65, 3), (51, 3), (50, 3), (51, 3)]
Available movies for Japan :  [(29, 3), (31, 3), (23, 3), (30, 3), (23, 3), (24, 3), (23, 3), (27, 3)]
Available movies for Hong Kong :  [(45, 3), (43, 3), (45, 3), (41, 3), (43, 3), (41, 3), (42, 3), (57, 3)]
Available movies for United Kingdom :  [(189, 3), (192, 3), (142, 3), (132, 3), (192, 3), (198, 3), (169, 3), (169, 3)]
Available movies for Germany :  [(124, 3), (105, 3), (104, 3), (97, 3), (110, 3), (117, 3), (107, 3), (123, 3)]
Available movies for United States :  [(226, 3), (220, 3), (208, 3), (200, 3), (197, 3), (191, 3), (179, 3), (174, 3)]
Available movies for Russia :  [(99, 3), (93, 3), (92, 3), (95, 3), (99, 3), (94, 3), (84, 3), (84, 3)]
Available movies for Mexico :  [(93, 3), (84, 3), (76, 3), (80, 3), (87, 3), (85, 3), (75, 3), (75, 3)]


As the results above show, there is not enough data for some of the Asian countries (Japan and Hong Kong). It is therefore better to limit our analysis of topic distributions to Russia, South Korea, the United States, Germany, the United Kingdom, and Mexico. We use 50 movies per year since 2007 for each country, because the number of movies must be the same for all countries if we want to compare similarities and differences in their topic distributions. That is a very small number of movies, but let's explore the data to see if there are differences across those countries.

In [642]:
def get(df, year, country=None):
    """Return the rows of df for one year, optionally narrowed to one country.

    df      -- dataframe with a 'year' column (and a 'country' column when a
               country is given)
    year    -- value the 'year' column must equal
    country -- when truthy, the 'country' column must equal it as well
    """
    keep = df['year'] == year
    if country:
        keep = keep & (df['country'] == country)
    return df[keep]

# For every (year, country) pair, keep the topic rows of the movies that appear
# in that country's top list for that year, tagged with the country.
merged_frames = []
for year in range(2001, 2015):
    # The IMDb rows of a year do not depend on the country: compute them once.
    topic_year = get(imdb_topic_df, year)
    #remove date from imdb title
    titles = topic_year['title'].apply(lambda x : x.split('(')[0].strip())
    for country in ['Germany','United States', 'United Kingdom', 'South Korea', 'Russia', 'Mexico']:
        top = get(topmovies_df, year, country)
        # .assign returns a new frame, so the country label is never written
        # into a slice of imdb_topic_df (no SettingWithCopyWarning).
        topic = topic_year[titles.isin(top.movie.tolist())].assign(country=country)
        merged_frames.append(topic)
# One concat at the end instead of re-copying the growing frame every iteration.
imdb_merged_df = pd.concat(merged_frames)
imdb_merged_df.head()

Unnamed: 0.1,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,Unnamed: 0,title,year,country
5579,0.0,0,0.0,0.237901,0.0,0.0,0.037411,0.044432,0.0,0.056706,...,0.0,0.075269,0.084401,0.074583,0.0,0.035468,5579,A Knight's Tale,2001,Germany
13945,0.0,0,0.029535,0.0,0.0,0.151276,0.0,0.070639,0.0,0.0,...,0.14933,0.0,0.0,0.0,0.046453,0.0,13945,Along Came a Spider,2001,Germany
14601,0.0,0,0.027998,0.0,0.090975,0.0,0.130455,0.0,0.027856,0.0,...,0.070356,0.0,0.015746,0.061708,0.0,0.173776,14601,America's Sweethearts,2001,Germany
14900,0.0,0,0.0,0.0,0.0,0.107154,0.0,0.663064,0.0,0.0,...,0.0,0.062905,0.038708,0.0,0.0,0.097305,14900,American Pie 2,2001,Germany
16626,0.029772,0,0.0,0.0,0.0,0.0,0.0,0.030694,0.0,0.0,...,0.0,0.0,0.0,0.0,0.387213,0.0,16626,Angel Eyes,2001,Germany


In [643]:
imdb_merged_df.shape

(8237, 34)

In [644]:
#filtering all movies available in imdb for each country
def _merged_rows_for(country_name):
    """Return the rows of imdb_merged_df tagged with country_name."""
    return imdb_merged_df[imdb_merged_df['country'] == country_name]

imdb_df_us_all = _merged_rows_for('United States')
imdb_df_ge_all = _merged_rows_for('Germany')
imdb_df_uk_all = _merged_rows_for('United Kingdom')
imdb_df_kr_all = _merged_rows_for('South Korea')
imdb_df_ru_all = _merged_rows_for('Russia')
imdb_df_mx_all = _merged_rows_for('Mexico')
# Show (rows, columns) of every per-country frame as one tuple.
tuple(frame.shape for frame in (imdb_df_us_all, imdb_df_ge_all, imdb_df_uk_all,
                                imdb_df_kr_all, imdb_df_ru_all, imdb_df_mx_all))

((2859, 34), (1451, 34), (2045, 34), (487, 34), (740, 34), (655, 34))

Because plots are not available in IMDb for every one of the top 50 movies (rank 1 to rank 50), we have to collect 50 movies with available plots for each year and each country while preserving the ranking (the first 50 available movies counted from the top, not a random sample). We find them by scanning, for each country and year, how many of the top-ranked titles have a plot in IMDb, and extending the cut-off until 50 are available.

In [648]:
#check if the rank order is maintained in the dataframe for each country
us_title= us_df['movie'].tolist()
#us_title[:5] 
#'American Sniper','The Hunger Games: Mockingjay - Part 1','Guardians of the Galaxy','Captain America: The Winter Soldier', 'The LEGO Movie'

In [710]:
#selecting 50 movies for each year in each country, for those imdb data are available
# The table below records, per country and year, the head(N) cut-off that was
# found with this cell (it was edited and re-run for each combination).
#South Korea - 2014 : head(167), 2013: 185, 2012 : 168, 2011 : 155, 2010 : 150, 2009: 160, 2008 : 135, 2007 : 142
#United States - 2014 : head(71), 2013:76 , 2012 :61 , 2011 :66 , 2010 : 64, 2009: 59 , 2008 : 59 , 2007 : 57 
#United Kingdom - 2014 : head(72), 2013: 84, 2012 :74 , 2011 :74 , 2010 :74 , 2009:82, 2008 :71, 2007 :67 
#Germany - 2014 : head(82), 2013:103, 2012 :92 , 2011 :91 , 2010 :90 , 2009:101, 2008 :89, 2007 :89
#Russia - 2014 : head(102), 2013:88 , 2012:90, 2011:79, 2010:81,2009:85, 2008:84, 2007:82
#Mexico - 2014 : head(87), 2013:89 , 2012:92, 2011:75, 2010:82, 2009:85, 2008:78, 2007:67
probe_year, probe_head = 2014, 87
yearly_tops = mx_df[mx_df['year'] == probe_year].head(probe_head)
mx_tops_title = yearly_tops['movie'].tolist()
# Rows of the Mexican IMDb subset whose title is among the probed top titles.
title_hit = imdb_df_mx_all['title'].isin(mx_tops_title)
imdb_df_mx_tops = imdb_df_mx_all[title_hit]
# Number of matched rows; the cut-off is right when this reaches 50.
len(imdb_df_mx_tops)

50

In [712]:
#list of titles of available movies for each country, 2007~2014
# Number of top-ranked rows to take per country and year, as recorded in the
# notes of the probing cell above (chosen there so that 50 of the taken titles
# are found in IMDb).
HEAD_CUTOFFS = {
    'South Korea':    {2014: 167, 2013: 185, 2012: 168, 2011: 155, 2010: 150, 2009: 160, 2008: 135, 2007: 142},
    'United States':  {2014: 71, 2013: 76, 2012: 61, 2011: 66, 2010: 64, 2009: 59, 2008: 59, 2007: 57},
    'United Kingdom': {2014: 72, 2013: 84, 2012: 74, 2011: 74, 2010: 74, 2009: 82, 2008: 71, 2007: 67},
    'Germany':        {2014: 82, 2013: 103, 2012: 92, 2011: 91, 2010: 90, 2009: 101, 2008: 89, 2007: 89},
    'Russia':         {2014: 102, 2013: 88, 2012: 90, 2011: 79, 2010: 81, 2009: 85, 2008: 84, 2007: 82},
    'Mexico':         {2014: 87, 2013: 89, 2012: 92, 2011: 75, 2010: 82, 2009: 85, 2008: 78, 2007: 67},
}


def top_titles(country_df, cutoffs):
    """Concatenate the first cutoffs[year] movie titles of every year.

    country_df -- top-movie rows of one country ('year' and 'movie' columns),
                  assumed to be stored in rank order
    cutoffs    -- dict mapping year -> number of leading rows to take
    Years are visited from the newest to the oldest, matching the order in
    which the list was originally assembled by hand.
    """
    titles = []
    for year in sorted(cutoffs, reverse=True):
        yearly_tops = country_df[country_df['year'] == year].head(cutoffs[year])
        titles.extend(yearly_tops['movie'].tolist())
    return titles


# Build the title list of every country in one pass, so that no list depends
# on editing and re-running this cell by hand.
kr_tops = top_titles(kr_df, HEAD_CUTOFFS['South Korea'])
us_tops = top_titles(us_df, HEAD_CUTOFFS['United States'])
uk_tops = top_titles(uk_df, HEAD_CUTOFFS['United Kingdom'])
ge_tops = top_titles(ge_df, HEAD_CUTOFFS['Germany'])
ru_tops = top_titles(ru_df, HEAD_CUTOFFS['Russia'])
mx_tops = top_titles(mx_df, HEAD_CUTOFFS['Mexico'])
len(mx_tops)

655

Finally, we obtain a dataframe containing about 400 movie plots (50 per year × 8 years) for each country; as the shapes below show, Russia ends up with 401 rows.

In [713]:
#filtering imdb data for each country
# All six assignments must be active: the shape summary on the last line and
# the cells that follow use every one of these frames. Each `<cc>_tops` name is
# the per-country list of selected titles produced by the selection step above.
# NOTE(review): rows are matched on title only, so a same-titled movie from
# another year can slip in (Russia shows 401 rows instead of 400) -- confirm
# whether matching on (title, year) is wanted.
imdb_kr_df_tops = imdb_df_kr_all[imdb_df_kr_all['title'].isin(kr_tops)]
imdb_us_df_tops = imdb_df_us_all[imdb_df_us_all['title'].isin(us_tops)]
imdb_uk_df_tops = imdb_df_uk_all[imdb_df_uk_all['title'].isin(uk_tops)]
imdb_ge_df_tops = imdb_df_ge_all[imdb_df_ge_all['title'].isin(ge_tops)]
imdb_ru_df_tops = imdb_df_ru_all[imdb_df_ru_all['title'].isin(ru_tops)]
imdb_mx_df_tops = imdb_df_mx_all[imdb_df_mx_all['title'].isin(mx_tops)]
imdb_kr_df_tops.shape, imdb_us_df_tops.shape, imdb_uk_df_tops.shape, imdb_ge_df_tops.shape, imdb_ru_df_tops.shape,  imdb_mx_df_tops.shape, 

((400, 34), (400, 34), (400, 34), (400, 34), (401, 34), (400, 34))

In [714]:
#save 
# One CSV per country, written to the working directory without the index.
for frame, csv_name in [
    (imdb_kr_df_tops, "imdb_kr_df_tops.csv"),
    (imdb_us_df_tops, "imdb_us_df_tops.csv"),
    (imdb_uk_df_tops, "imdb_uk_df_tops.csv"),
    (imdb_ge_df_tops, "imdb_ge_df_tops.csv"),
    (imdb_ru_df_tops, "imdb_ru_df_tops.csv"),
    (imdb_mx_df_tops, "imdb_mx_df_tops.csv"),
]:
    frame.to_csv(csv_name, index=False)

In [715]:
imdb_kr_df_tops.head()

Unnamed: 0.1,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,Unnamed: 0,title,year,country
971,0.0,0.037582,0.0,0.0,0,0,0.0,0,0,0,...,0.477381,0.034577,0.0,0.0,0.0,0.240574,971,1408,2007,South Korea
14125,0.052627,0.072762,0.027182,0.129195,0,0,0.037987,0,0,0,...,0.0,0.0,0.091679,0.0,0.0,0.0,14125,Alvin and the Chipmunks,2007,South Korea
14758,0.0,0.0,0.0,0.156749,0,0,0.040243,0,0,0,...,0.0,0.030968,0.157437,0.051071,0.32099,0.0,14758,American Gangster,2007,South Korea
20642,0.015923,0.108853,0.0,0.0,0,0,0.07007,0,0,0,...,0.0,0.0,0.0,0.068218,0.086696,0.0,20642,August Rush,2007,South Korea
24826,0.0,0.0,0.0,0.103353,0,0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.107148,0.122594,24826,Because I Said So,2007,South Korea


In [716]:
#topic scores for top movies for each country
def _topic_columns(frame):
    """Return the first 30 columns of frame (score for each topic is in column 0~29)."""
    return frame[frame.columns[:30]]

# For every country: the topic sub-frame, then its column-wise total.
kr_topic = _topic_columns(imdb_kr_df_tops)
kr_topic_score = kr_topic.sum(axis=0)
us_topic = _topic_columns(imdb_us_df_tops)
us_topic_score = us_topic.sum(axis=0)
uk_topic = _topic_columns(imdb_uk_df_tops)
uk_topic_score = uk_topic.sum(axis=0)
ge_topic = _topic_columns(imdb_ge_df_tops)
ge_topic_score = ge_topic.sum(axis=0)
ru_topic = _topic_columns(imdb_ru_df_tops)
ru_topic_score = ru_topic.sum(axis=0)
mx_topic = _topic_columns(imdb_mx_df_tops)
mx_topic_score = mx_topic.sum(axis=0)
# Sanity check: every score vector should have one entry per topic.
tuple(len(score) for score in (kr_topic_score, us_topic_score, uk_topic_score,
                               ge_topic_score, ru_topic_score, mx_topic_score))

(30, 30, 30, 30, 30, 30)

In [717]:
#Create dataframe for visualization (since 2007, full year)
# Hand-assigned label for each of the 30 topics, in topic-id order.
topics =['magic, myths', 'school, college', 'fantasy, christmas', 'home', 'ships, sailing, pirates','love, relationships',
         'war','exploration, nature, space', 'comedy', 'places, nature, scenery', 'hollywood, stars', 'society, culture',
         'historical, costumes', 'money, robbery','photography, design','spies, terrorism', 'mixed', 'town', 'press, politics',
         'crime, police, underworld','documentary, interview','friendship, relationships','cowboys and indians', 'night life, enjoyment',
         'crime, mistery','music', 'farming, country side', 'fantasy, fairy tale','love, family', 'gangs, drugs, police']

# One 30-element country column per country (one entry per topic row).
country_us, country_ge, country_uk, country_kr, country_ru, country_mx = [
    [country_name] * 30
    for country_name in ('United States', 'Germany', 'United Kingdom', 'South Korea', 'Russia', 'Mexico')
]


def _viz_columns(country_col, score):
    """Bundle country column, topic labels and topic scores into the column dict of one frame."""
    return {'Country': country_col, 'Topic': topics, 'Topic_Score': score}


data_us = _viz_columns(country_us, us_topic_score)
data_ge = _viz_columns(country_ge, ge_topic_score)
data_uk = _viz_columns(country_uk, uk_topic_score)
data_kr = _viz_columns(country_kr, kr_topic_score)
data_ru = _viz_columns(country_ru, ru_topic_score)
data_mx = _viz_columns(country_mx, mx_topic_score)

# One long-format frame per country, then all of them stacked.
frames = [pd.DataFrame(columns_dict)
          for columns_dict in (data_us, data_ge, data_uk, data_kr, data_ru, data_mx)]
us, ge, uk, kr, ru, mx = frames
popular_df = pd.concat(frames)

In [718]:
popular_df.head()

Unnamed: 0,Country,Topic,Topic_Score
0,United States,"magic, myths",2.901385
1,United States,"school, college",7.218853
2,United States,"fantasy, christmas",1.967969
3,United States,home,9.135294
4,United States,"ships, sailing, pirates",2.627621


In [725]:
#Global top 10 topics for popular movies 
# Add the six per-country score Series directly; they share the same topic
# labels, so pandas adds them label by label. This avoids the nested
# map(add, ...) calls, which only yield lists on Python 2. The grouping mirrors
# the original pairwise sums, and .tolist() keeps global_topic_score a plain
# list as before.
global_topic_score = (
    (us_topic_score + ge_topic_score)
    + (uk_topic_score + kr_topic_score)
    + (ru_topic_score + mx_topic_score)
).tolist()
# `topics` is the label list defined with the per-country visualization frame;
# it is reused here instead of being retyped.
Global = ['Global'] * len(topics)
global_topic_df = pd.DataFrame({'Country': Global, 'Topic': topics, 'Topic_Score': global_topic_score})
global_topic_df.head(5)

Unnamed: 0,Country,Topic,Topic_Score
0,Global,"magic, myths",16.459787
1,Global,"school, college",48.390441
2,Global,"fantasy, christmas",13.153329
3,Global,home,63.280514
4,Global,"ships, sailing, pirates",16.744218


In [726]:
#selecting global top 10 topics
# Rank all topics by their summed score, highest first (returns a new frame).
global_top = global_topic_df.sort_values(by='Topic_Score', ascending=False)
global_top.head(10) #topic 5, 28, 7, 21, 11, 29, 24, 20, 13, 3  --> 5, 7, 28, 21, 11, 20, 24, 29, 13, 3
#order(rank) of global top 10 topics changed by adding Russia and Mexico.

Unnamed: 0,Country,Topic,Topic_Score
5,Global,"love, relationships",156.391804
7,Global,"exploration, nature, space",143.545347
28,Global,"love, family",140.336443
21,Global,"friendship, relationships",126.512566
11,Global,"society, culture",95.408396
20,Global,"documentary, interview",78.369434
24,Global,"crime, mistery",77.703915
29,Global,"gangs, drugs, police",75.114786
13,Global,"money, robbery",67.20529
3,Global,home,63.280514


In [727]:
#filtering global top10 topics for each country 
# Take the ten labels straight from the ranking computed above instead of
# retyping them, so the list cannot go stale when the scores (and therefore
# the ranking) change.
global_topics = global_top.head(10)['Topic'].tolist()
countrywise_global_topic = popular_df[popular_df['Topic'].isin(global_topics)]
countrywise_global_topic.head()

Unnamed: 0,Country,Topic,Topic_Score
3,United States,home,9.135294
5,United States,"love, relationships",26.157195
7,United States,"exploration, nature, space",21.967066
11,United States,"society, culture",16.640522
13,United States,"money, robbery",11.767998


In [731]:
#save data as csv for visualization
# Global rows first, then the per-country rows; written without the index.
popularity_parts = [global_topic_df, popular_df]
popularity = pd.concat(popularity_parts)
popularity.to_csv("popularity_new.csv", index=False)

In [732]:
popularity.head()

Unnamed: 0,Country,Topic,Topic_Score
0,Global,"magic, myths",16.459787
1,Global,"school, college",48.390441
2,Global,"fantasy, christmas",13.153329
3,Global,home,63.280514
4,Global,"ships, sailing, pirates",16.744218


Let's get the most prevalent terms in each topic, to make our visualization more meaningful and understandable to our audience. We can get the topic-term distribution using `lda.get_topic_terms(topic_id)`.

In [733]:
#prevalent terms for each topics
ids = range(0, 30)
# get_topic_terms returns [(id, score), (id, score),...]; keep only the term
# ids. The loop variable is named topic_id so the builtin `id` is no longer
# overwritten at notebook scope.
topic_terms = [
    [term_id for term_id, _score in imdb_lda.get_topic_terms(topic_id, topn=5)]
    for topic_id in ids
]
topic_terms[:3] #collection of top 5 prevalent terms in each topic

[[34322, 1240, 30396, 10059, 19977],
 [7575, 30745, 10710, 26115, 10567],
 [25939, 17413, 13018, 23922, 13016]]

In [734]:
#convert id to word
# Look every term id up in the dictionary, keeping the per-topic grouping.
words = [[imdb_dict[term_id] for term_id in topic] for topic in topic_terms]
words

[[u'brown', u'yoga', u'sir', u'prize', u'disabl'],
 [u'school', u'student', u'team', u'game', u'colleg'],
 [u'dog', u'la', u'de', u'christma', u'di'],
 [u'enter', u'home', u'scene', u'leav', u'arriv'],
 [u'ship', u'danc', u'dancer', u'crew', u'captain'],
 [u'love', u'past', u'woman', u'relationship', u'friend'],
 [u'war', u'armi', u'soldier', u'fight', u'forc'],
 [u'human', u'vs', u'explor', u'experi', u'creat'],
 [u'adventur', u'girl', u'episod', u'comedi', u'comic'],
 [u'island', u'villag', u'water', u'sea', u'river'],
 [u'star', u'book', u'titl', u'movi', u'writer'],
 [u'american', u'famili', u'commun', u'documentari', u'cultur'],
 [u'tree', u'dress', u'servant', u'tabl', u'wear'],
 [u'money', u'plan', u'girl', u'tri', u'escap'],
 [u'pictur', u'anim', u'imag', u'color', u'view'],
 [u'american', u'japanes', u'russian', u'govern', u'pilot'],
 [u'lucki', u'zero', u'dive', u'camp', u'tag'],
 [u'town', u'local', u'disappoint', u'station', u'phone'],
 [u'parti', u'smith', u'newspap', u'ho

In [735]:
# Encode every term as ASCII, keeping the per-topic grouping.
words_ascii = [[term.encode('ascii') for term in topic] for topic in words]
words_ascii

[['brown', 'yoga', 'sir', 'prize', 'disabl'],
 ['school', 'student', 'team', 'game', 'colleg'],
 ['dog', 'la', 'de', 'christma', 'di'],
 ['enter', 'home', 'scene', 'leav', 'arriv'],
 ['ship', 'danc', 'dancer', 'crew', 'captain'],
 ['love', 'past', 'woman', 'relationship', 'friend'],
 ['war', 'armi', 'soldier', 'fight', 'forc'],
 ['human', 'vs', 'explor', 'experi', 'creat'],
 ['adventur', 'girl', 'episod', 'comedi', 'comic'],
 ['island', 'villag', 'water', 'sea', 'river'],
 ['star', 'book', 'titl', 'movi', 'writer'],
 ['american', 'famili', 'commun', 'documentari', 'cultur'],
 ['tree', 'dress', 'servant', 'tabl', 'wear'],
 ['money', 'plan', 'girl', 'tri', 'escap'],
 ['pictur', 'anim', 'imag', 'color', 'view'],
 ['american', 'japanes', 'russian', 'govern', 'pilot'],
 ['lucki', 'zero', 'dive', 'camp', 'tag'],
 ['town', 'local', 'disappoint', 'station', 'phone'],
 ['parti', 'smith', 'newspap', 'hotel', 'elect'],
 ['brother', 'inspector', 'kill', 'twin', 'polic'],
 ['interview', 'documentar

In [736]:
#add column 'words' in the dataframe 'popularity'
# popularity stacks one block of topic rows per group (Global plus each
# country), so the per-topic term lists are tiled once per block. The number
# of blocks is derived from the lengths instead of being hard-coded.
n_blocks = len(popularity) // len(words_ascii)
popularity['words'] = pd.Series(words_ascii * n_blocks, index=popularity.index)
popularity.head()

Unnamed: 0,Country,Topic,Topic_Score,words
0,Global,"magic, myths",16.459787,"[brown, yoga, sir, prize, disabl]"
1,Global,"school, college",48.390441,"[school, student, team, game, colleg]"
2,Global,"fantasy, christmas",13.153329,"[dog, la, de, christma, di]"
3,Global,home,63.280514,"[enter, home, scene, leav, arriv]"
4,Global,"ships, sailing, pirates",16.744218,"[ship, danc, dancer, crew, captain]"


In [737]:
popularity.tail()

Unnamed: 0,Country,Topic,Topic_Score,words
25,Mexico,music,9.53617,"[music, artist, perform, band, song]"
26,Mexico,"farming, country side",3.200936,"[farmer, toy, bride, farm, hat]"
27,Mexico,"fantasy, fairy tale",6.553479,"[mountain, king, evil, zombi, power]"
28,Mexico,"love, family",21.92252,"[love, marri, wife, father, famili]"
29,Mexico,"gangs, drugs, police",11.244498,"[polic, gang, prison, drug, crime]"


In [738]:
#save final 'popularity' to csv
popularity.to_csv("popularity_new.csv", index= False)

Note : 'popularity_new.csv' contains global data & data for 6 countries, with prevalent 5 words for each topic. 

## Topic Distribution with Origin of Popular Movies

Is there any relationship between the topic distribution and the origin of movies? In other words, do movies made in the United States show a different topic distribution from those made in South Korea? To answer this question, we investigate the topic distribution of popular movies by their origin. For that purpose, we first create a dataframe "imdb_meta_top_df" containing the origin of each popular movie.

In [361]:
#load dataframes
imdb_lda = models.LdaModel.load(dropbox + 'lda_imdb.model')
imdb_corpus = corpora.MmCorpus(dropbox + 'lda_imdb.corpus')
imdb_dict = corpora.Dictionary.load(dropbox + 'lda_imdb.dict')
imdb_meta_df = pd.read_csv(dropbox + 'imdb_meta_df.csv')

# Dense topic matrix (one row per topic, one column per document), transposed
# so that each row is a movie, then joined column-wise with the metadata.
# Titles are left exactly as stored in the CSV in this section.
imdb_topic_matrix = matutils.corpus2dense(
    imdb_lda[imdb_corpus], num_terms=30, num_docs=len(imdb_corpus))
imdb_topic_df = pd.concat([pd.DataFrame(imdb_topic_matrix.T), imdb_meta_df], axis=1)

In [368]:
#load dataframe containing origin of movies
imdb_meta_top_df = pd.read_csv(dropbox + 'imdb_meta_top_df.csv')
imdb_meta_top_df.head()  

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,year,origin
0,994,994,15 Minutes (2001),2001,Germany
1,2069,2069,3000 Miles to Graceland (2001),2001,USA
2,3342,3342,A Beautiful Mind (2001),2001,USA
3,5579,5579,A Knight's Tale (2001),2001,USA
4,12647,12647,Ali (2001),2001,USA


In [369]:
#filtering the topic matrix by popular movies
top_frames = []
for year in range(2001, 2015):
    # The IMDb rows of a year do not depend on the country: compute them once.
    topic_year = get(imdb_topic_df, year)
    #remove date from imdb title, compare lower cased titles
    titles = topic_year['title'].apply(lambda x : x.split('(')[0].strip().lower())
    for country in ['Germany','United States', 'United Kingdom', 'South Korea', 'Japan', 'Hong Kong']:
        top = get(topmovies_df, year, country)
        wanted_titles = [t.lower() for t in top.movie.tolist()]
        # .assign returns a new frame, so the country label is never written
        # into a slice of imdb_topic_df (no SettingWithCopyWarning).
        topic = topic_year[titles.isin(wanted_titles)].assign(country=country)
        top_frames.append(topic)
# One concat at the end instead of re-copying the growing frame every iteration.
imdb_topic_top_df = pd.concat(top_frames)
imdb_topic_top_df.head()  

Unnamed: 0.1,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,Unnamed: 0,title,year,country
5579,0.0,0,0.0,0.250719,0.0,0.0,0.040608,0.0,0.0,0.05785,...,0.0,0.081993,0.084435,0.074651,0.0,0.039129,5579,A Knight's Tale (2001),2001,Germany
13945,0.0,0,0.029535,0.0,0.0,0.151568,0.0,0.070572,0.0,0.0,...,0.14909,0.0,0.0,0.0,0.046381,0.0,13945,Along Came a Spider (2001),2001,Germany
14601,0.0,0,0.027998,0.0,0.090975,0.0,0.130275,0.0,0.027781,0.0,...,0.070602,0.0,0.015746,0.061785,0.0,0.173716,14601,America's Sweethearts (2001),2001,Germany
14900,0.0,0,0.0,0.0,0.0,0.106993,0.0,0.663135,0.0,0.0,...,0.0,0.062978,0.038708,0.0,0.0,0.097321,14900,American Pie 2 (2001),2001,Germany
16626,0.029772,0,0.0,0.0,0.0,0.0,0.0,0.030694,0.0,0.0,...,0.0,0.0,0.0,0.0,0.387287,0.0,16626,Angel Eyes (2001),2001,Germany


In [380]:
#merge two dataframes imdb_topic_top_df and imdb_meta_top_df
# Default (inner) join on the title; columns present in both frames get the
# _x / _y suffixes seen in the output (e.g. year_x, year_y).
imdb_top_origin_df = imdb_topic_top_df.merge(imdb_meta_top_df, on='title')
imdb_top_origin_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,Unnamed: 0_x,title,year_x,country,Unnamed: 0_y,Unnamed: 0.1,year_y,origin
0,0,0,0.0,0.250719,0.0,0.0,0.040608,0.0,0.0,0.05785,...,0.0,0.039129,5579,A Knight's Tale (2001),2001,Germany,5579,5579,2001,USA
1,0,0,0.0,0.250719,0.0,0.0,0.040608,0.0,0.0,0.05785,...,0.0,0.039129,5579,A Knight's Tale (2001),2001,United States,5579,5579,2001,USA
2,0,0,0.029535,0.0,0.0,0.151568,0.0,0.070572,0.0,0.0,...,0.046381,0.0,13945,Along Came a Spider (2001),2001,Germany,13945,13945,2001,Canada
3,0,0,0.029535,0.0,0.0,0.151568,0.0,0.070572,0.0,0.0,...,0.046381,0.0,13945,Along Came a Spider (2001),2001,United States,13945,13945,2001,Canada
4,0,0,0.027998,0.0,0.090975,0.0,0.130275,0.0,0.027781,0.0,...,0.0,0.173716,14601,America's Sweethearts (2001),2001,Germany,14601,14601,2001,USA


In [383]:
#check the number 
# All origin values (len = 3761), reduced to the distinct ones (len = 77).
origins = list(set(imdb_top_origin_df['origin'].tolist()))
origins[:6]

['Canada', 'Turkey', 'Italy', 'Czech Republic', 'USA', 'Afghanistan']

In [385]:
grouped = imdb_top_origin_df.groupby('origin')

In [394]:
origin_topic_sum= grouped.sum()
origin_topic_sum.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,Unnamed: 0_x,year_x,Unnamed: 0_y,Unnamed: 0.1,year_y
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,0.05525,0.0,0.0,0.040695,0.0,0.209245,0.0,0.0,0.0,...,0.065245,0.0,0.0,0.069425,0.179746,88325,2011,88325,88325,2011
Argentina,0.0,0.0,0.0,0.0,0.0,0.0,0.529603,0.0,0.0,0.0,...,0.45561,0.0,0.0,0.250142,0.0,674015,8040,674015,674015,8040
Australia,0.73934,4.774235,1.64771,3.457284,1.363858,16.394829,5.036436,10.928599,2.202809,1.122519,...,7.545632,1.406462,4.682922,13.168547,7.206612,27140833,413226,27140833,27140833,413226
Austria,0.112778,0.15198,0.098123,0.723472,0.072395,2.051567,0.441433,0.825162,0.511871,0.237841,...,0.134382,0.013983,0.698932,1.191857,0.195471,650029,24113,650029,650029,24113
Bahamas,0.0,0.261953,0.0,0.0,0.0,0.492779,0.0,0.0,0.0,0.0,...,0.0,0.0,0.495543,0.412132,0.0,191080,10030,191080,191080,10030


In [433]:
#subdataframe containing only topcis score
origin_topic_score = origin_topic_sum.iloc[:, :30]
origin_topic_score.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,0.05525,0.0,0.0,0.040695,0.0,0.209245,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.058815,0.065245,0.0,0.0,0.069425,0.179746
Argentina,0.0,0.0,0.0,0.0,0.0,0.0,0.529603,0.0,0.0,0.0,...,0.0,0.632386,0.0,0.116809,0.423718,0.45561,0.0,0.0,0.250142,0.0
Australia,0.73934,4.774235,1.64771,3.457284,1.363858,16.394829,5.036436,10.928599,2.202809,1.122519,...,7.717166,17.888649,2.815919,1.791592,10.954605,7.545632,1.406462,4.682922,13.168547,7.206612
Austria,0.112778,0.15198,0.098123,0.723472,0.072395,2.051567,0.441433,0.825162,0.511871,0.237841,...,0.20957,0.365508,0.160856,0.586655,1.274669,0.134382,0.013983,0.698932,1.191857,0.195471
Bahamas,0.0,0.261953,0.0,0.0,0.0,0.492779,0.0,0.0,0.0,0.0,...,0.526696,0.0,0.0,0.0,0.0,0.0,0.0,0.495543,0.412132,0.0


In [398]:
#Number of movies in each of origins
num_movies = grouped.count()
num_movies.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,Unnamed: 0_x,title,year_x,country,Unnamed: 0_y,Unnamed: 0.1,year_y
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Argentina,3,3,3,3,3,3,3,3,3,3,...,3,3,3,4,4,4,4,4,4,4
Australia,162,162,162,162,162,162,162,162,162,162,...,162,162,162,206,206,206,206,206,206,206
Austria,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
Bahamas,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [410]:
#to confirm the number in column(1~29) means total number of movies in that origin
imdb_top_origin_df[imdb_top_origin_df.origin =='Afghanistan']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,Unnamed: 0_x,title,year_x,country,Unnamed: 0_y,Unnamed: 0.1,year_y,origin
5818,0,0.05525,0,0,0.040695,0,0.209245,0,0,0,...,0.069425,0.179746,88325,Hell and Back Again (2011),2011,United States,88325,88325,2011,Afghanistan


In [412]:
# The count in topic columns 0-29 excludes rows without topic scores (movies
# with no plot); the remaining columns count every row of the origin.
imdb_top_origin_df.loc[imdb_top_origin_df['origin'] == 'Argentina']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,Unnamed: 0_x,title,year_x,country,Unnamed: 0_y,Unnamed: 0.1,year_y,origin
1713,,,,,,,,,,,...,,,227792,The Take (2004),2004,United States,227792,227792,2004,Argentina
6167,0.0,0.0,0.0,0.0,0.0,0.0,0.176534,0.0,0.0,0.0,...,0.083381,0.0,148741,On the Road (2012),2012,Germany,148741,148741,2012,Argentina
6168,0.0,0.0,0.0,0.0,0.0,0.0,0.176534,0.0,0.0,0.0,...,0.083381,0.0,148741,On the Road (2012),2012,United States,148741,148741,2012,Argentina
6169,0.0,0.0,0.0,0.0,0.0,0.0,0.176534,0.0,0.0,0.0,...,0.083381,0.0,148741,On the Road (2012),2012,United Kingdom,148741,148741,2012,Argentina


In [511]:
# Number of movies with topic scores per origin, taken from topic column 29
# of the counts; origins whose count is 0 are left out.
n_movies = [movie_count for movie_count in num_movies[29].tolist() if movie_count != 0]
len(n_movies)

68

In [520]:
from itertools import repeat

# Expand the per-origin counts to one entry per (origin, topic) row:
# every count appears 30 times in a row, once for each topic.
n_movie = []
for origin_count in n_movies:
    n_movie.extend(repeat(origin_count, 30))

In [415]:
# Number of rows (origins) currently in the score table.
len(origin_topic_score)

76

In [435]:
# Remove origins with no usable data: rows containing NaN, which arise when
# none of the origin's movies has plot data.
origin_topic_score = origin_topic_score.dropna(axis=0, how='any')
origin_topic_score.shape  # 8 origins dropped

(68, 30)

In [436]:
# Move the index into an ordinary column and renumber the rows from 0.
# NOTE: this rebinds origin_topic_score, so run the cell only once -- a second
# run would push the new integer index into a column as well.
origin_topic_score = origin_topic_score.reset_index(drop=False)
origin_topic_score.head(5)

Unnamed: 0,origin,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,Afghanistan,0.0,0.05525,0.0,0.0,0.040695,0.0,0.209245,0.0,0.0,...,0.0,0.0,0.0,0.0,0.058815,0.065245,0.0,0.0,0.069425,0.179746
1,Argentina,0.0,0.0,0.0,0.0,0.0,0.0,0.529603,0.0,0.0,...,0.0,0.632386,0.0,0.116809,0.423718,0.45561,0.0,0.0,0.250142,0.0
2,Australia,0.73934,4.774235,1.64771,3.457284,1.363858,16.394829,5.036436,10.928599,2.202809,...,7.717166,17.888649,2.815919,1.791592,10.954605,7.545632,1.406462,4.682922,13.168547,7.206612
3,Austria,0.112778,0.15198,0.098123,0.723472,0.072395,2.051567,0.441433,0.825162,0.511871,...,0.20957,0.365508,0.160856,0.586655,1.274669,0.134382,0.013983,0.698932,1.191857,0.195471
4,Bahamas,0.0,0.261953,0.0,0.0,0.0,0.492779,0.0,0.0,0.0,...,0.526696,0.0,0.0,0.0,0.0,0.0,0.0,0.495543,0.412132,0.0


Let's build a dataframe for visualization in Tableau Public, containing the origin, topic label, topic score, and number of movies.

In [455]:
# Origin names, in the same row order as origin_topic_score.
new_origin = list(origin_topic_score['origin'])
len(new_origin)

68

In [473]:
# One 30-element list per origin: the origin name repeated once per topic.
origin_list = [[name] * 30 for name in new_origin]
len(origin_list)

68

In [495]:
# Human-readable labels for the 30 topics; a label's position in the list is
# the topic number it describes.
topic_list = [
    'magic, myths',                # 0
    'school, college',             # 1
    'fantasy, christmas',          # 2
    'home',                        # 3
    'ships, sailing, pirates',     # 4
    'love, relationships',         # 5
    'war',                         # 6
    'exploration, nature, space',  # 7
    'comedy',                      # 8
    'places, nature, scenery',     # 9
    'hollywood, stars',            # 10
    'society, culture',            # 11
    'historical, costumes',        # 12
    'money, robbery',              # 13
    'photography, design',         # 14
    'spies, terrorism',            # 15
    'mixed',                       # 16
    'town',                        # 17
    'press, politics',             # 18
    'crime, police, underworld',   # 19
    'documentary, interview',      # 20
    'friendship, relationships',   # 21
    'cowboys and indians',         # 22
    'night life, enjoyment',       # 23
    'crime, mistery',              # 24
    'music',                       # 25
    'farming, country side',       # 26
    'fantasy, fairy tale',         # 27
    'love, family',                # 28
    'gangs, drugs, police',        # 29
]
# The same label list once per origin (68 origins); every entry refers to the
# one topic_list object.
topics = 68 * [topic_list]

In [462]:
# Topic-score columns only: positions 1-30 (position 0 holds 'origin').
df = origin_topic_score.iloc[:, 1:31]
# One list of 30 topic scores per origin. Built with a list comprehension
# rather than map(): on Python 3, map() returns a lazy iterator, which supports
# neither the len() below nor the t[i] indexing used when the long-format
# table is assembled.
t = [list(row) for row in df.values]
len(t)

68

In [507]:
# Build the long-format table for Tableau Public: one row per (origin, topic)
# pair holding that origin's score for the topic.
# The number of origins is derived from origin_list instead of a hard-coded 68.
num_origin = range(len(origin_list))
# Collect the per-origin frames first and concatenate them once; calling
# pd.concat inside the loop re-copies the accumulated rows on every pass.
# Each per-origin frame keeps its own 0-29 index, so the index of origin_df
# repeats for every origin.
origin_frames = [
    pd.DataFrame({'origin': origin_list[i], 'topic': topics[i], 'topic_score': t[i]})
    for i in num_origin
]
origin_df = pd.concat(origin_frames)
origin_df.tail()

Unnamed: 0,origin,topic,topic_score
25,Vietnam,music,0.109885
26,Vietnam,"farming, country side",0.0
27,Vietnam,"fantasy, fairy tale",0.0
28,Vietnam,"love, family",0.0
29,Vietnam,"gangs, drugs, police",0.0


In [546]:
# Attach the movie count of each row's origin as a new column; n_movie already
# holds one entry per row of origin_df, in row order.
origin_df['num_movies'] = n_movie
origin_df.tail(5)

Unnamed: 0,origin,topic,topic_score,num_movies,normalized_score
25,Vietnam,music,0.109885,1,3.296556
26,Vietnam,"farming, country side",0.0,1,0.0
27,Vietnam,"fantasy, fairy tale",0.0,1,0.0
28,Vietnam,"love, family",0.0,1,0.0
29,Vietnam,"gangs, drugs, police",0.0,1,0.0


In [548]:
from operator import truediv  # not needed by this cell any more; kept in case other cells use it

# Topic score per movie: each row's topic_score divided by its origin's
# num_movies. The division is done column-wise in pandas instead of converting
# both columns to Python lists and mapping truediv over them.
# NOTE: run this while num_movies still holds the raw movie count, i.e. before
# the cell that divides num_movies by 30; otherwise the result is 30x too big.
origin_df['normalized_score'] = origin_df['topic_score'] / origin_df['num_movies']
origin_df.head()

Unnamed: 0,origin,topic,topic_score,num_movies,normalized_score
0,Afghanistan,"magic, myths",0.0,1,0.0
1,Afghanistan,"school, college",0.05525,1,0.05525
2,Afghanistan,"fantasy, christmas",0.0,1,0.0
3,Afghanistan,home,0.0,1,0.0
4,Afghanistan,"ships, sailing, pirates",0.040695,1,0.040695


The num_movies column should be divided by the number of topics (30), because Tableau Public plots the sum of num_movies for each origin.

In [549]:
# Spread each origin's movie count evenly over its 30 topic rows.
# NOTE: this overwrites num_movies in place, so running the cell twice
# divides by 30 twice.
origin_df['num_movies'] = origin_df['num_movies'].div(30)
origin_df.head(5)

Unnamed: 0,origin,topic,topic_score,num_movies,normalized_score
0,Afghanistan,"magic, myths",0.0,0.033333,0.0
1,Afghanistan,"school, college",0.05525,0.033333,0.05525
2,Afghanistan,"fantasy, christmas",0.0,0.033333,0.0
3,Afghanistan,home,0.0,0.033333,0.0
4,Afghanistan,"ships, sailing, pirates",0.040695,0.033333,0.040695


In [551]:
# Order the rows from the origin with the most movies to the one with the fewest.
origin_df = origin_df.sort_values(by='num_movies', ascending=False)
origin_df.head(5)

Unnamed: 0,origin,topic,topic_score,num_movies,normalized_score
15,USA,"spies, terrorism",63.330414,106.4,0.01984
16,USA,mixed,29.020412,106.4,0.009092
2,USA,"fantasy, christmas",29.022633,106.4,0.009092
3,USA,home,119.264356,106.4,0.037364
4,USA,"ships, sailing, pirates",32.048349,106.4,0.01004


In [552]:
# Write the table to origin_df.csv in the working directory, without the index column.
origin_df.to_csv(path_or_buf="origin_df.csv", index=False)

In [532]:
# Table pairing every origin with its total number of movies.
num_df = pd.DataFrame(dict(origin=new_origin, total_num=n_movies))
num_df.head(5)

Unnamed: 0,origin,total_num
0,Afghanistan,1
1,Argentina,3
2,Australia,162
3,Austria,12
4,Bahamas,5


In [536]:
# Keep only the origins with at least 100 movies.
major_origin = num_df.loc[num_df['total_num'] >= 100]

In [537]:
# Display the origins that passed the 100-movie threshold.
major_origin

Unnamed: 0,origin,total_num
2,Australia,162
8,Canada,258
15,France,181
17,Germany,400
24,India,513
32,Japan,113
61,UK,492
62,USA,3192


In [538]:
# Names of the major origins as a plain Python list.
list(major_origin['origin'])

['Australia', 'Canada', 'France', 'Germany', 'India', 'Japan', 'UK', 'USA']