<h1 align='center'>Topic Distribution by Origin of Movies</h1>

In previous analyses, we studied the topic distribution by origin for popular movies and saw some interesting differences in topics. However, because only a limited number of plots were available for popular movies, those results should be interpreted with caution. Here, we extend the topic distribution by origin to all movies released across the world since 2007.
<hr>

In [2]:
import itertools
import os

from gensim import corpora, models, similarities, matutils
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from unidecode import unidecode
%matplotlib inline


In [3]:
# Directory that holds the input CSVs and receives the exported CSVs.
# Defaults to the original Dropbox location, but can be redirected by setting
# the MOVIEMETA_DIR environment variable so the notebook runs on other machines.
# os.path.join(..., "") guarantees a trailing separator, because the cells
# below build file paths with plain string concatenation (dropbox + 'file.csv').
dropbox = os.path.join(os.environ.get("MOVIEMETA_DIR", "/Users/mr/Dropbox/moviemeta/"), "")

## LDA topic distribution for IMDB data

In [55]:
# Load the per-movie topic scores, index the rows by the saved
# 'Unnamed: 0' column and discard the leftover 'Unnamed: 0.1' column.
imdb_topic_df = (
    pd.read_csv(dropbox + 'imdb_topic_df.csv')
    .set_index('Unnamed: 0')
    .drop('Unnamed: 0.1', axis=1)
)

In [56]:
# Preview the first five rows of the topic-score table.
imdb_topic_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,title,year,origin
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0.0,0.096385,0,0.0,0,0,...,0.0,0.65935,0.0,0.0,0.0,0,0,#1 Serial Killer (2013),2013,USA
4,0,0,0,0,0.037337,0.0,0,0.800602,0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,#29 (2012),2012,Netherlands
5,0,0,0,0,0.0,0.0,0,0.0,0,0,...,0.0,0.229568,0.0,0.0,0.069438,0,0,#30 (2013),2013,Australia
7,0,0,0,0,0.0,0.291638,0,0.290491,0,0,...,0.0,0.0,0.035633,0.030289,0.0,0,0,#47 (2014),2014,Portugal
8,0,0,0,0,0.0,0.0,0,0.516517,0,0,...,0.223738,0.0,0.10712,0.0,0.0,0,0,#5 (2013),2013,USA


### Adding country of origin from IMDB data

We create a data frame of all movies since 2007 and add the countries of origin from the IMDB data set.

In [53]:
# Read the IMDB metadata and index it by the saved 'Unnamed: 0' column.
meta_all = pd.read_csv(dropbox + 'imdb_meta_2007_2015.csv').set_index('Unnamed: 0')
# Keep only the movies that also appear in the topic-score table, then
# discard the leftover 'Unnamed: 0.1' column.
has_topics = meta_all.index.isin(imdb_topic_df.index)
imdb_meta_2007_2015 = meta_all[has_topics].drop('Unnamed: 0.1', axis=1)
imdb_meta_2007_2015.head()

Unnamed: 0_level_0,title,year,origin
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,#1 Serial Killer (2013),2013,USA
4,#29 (2012),2012,Netherlands
5,#30 (2013),2013,Australia
7,#47 (2014),2014,Portugal
8,#5 (2013),2013,USA


In [42]:
# Attach each movie's country of origin to its topic scores (aligned on the index).
# The guard keeps this cell idempotent: when the loaded topic table already
# carries an 'origin' column (or the cell is re-run), concatenating again would
# create a second 'origin' column and break the groupby('origin') further down.
if 'origin' not in imdb_topic_df.columns:
    imdb_topic_df = pd.concat([imdb_topic_df, imdb_meta_2007_2015[['origin']]], axis=1) #origin & topics merged dataframe
imdb_topic_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,Unnamed: 0.1,title,year,origin
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0.0,0.096385,0,0.0,0,0,...,0.65935,0.0,0.0,0.0,0,0,1,#1 Serial Killer (2013),2013,USA
4,0,0,0,0,0.037337,0.0,0,0.800602,0,0,...,0.0,0.0,0.0,0.0,0,0,4,#29 (2012),2012,Netherlands
5,0,0,0,0,0.0,0.0,0,0.0,0,0,...,0.229568,0.0,0.0,0.069438,0,0,5,#30 (2013),2013,Australia
7,0,0,0,0,0.0,0.291638,0,0.290491,0,0,...,0.0,0.035633,0.030289,0.0,0,0,7,#47 (2014),2014,Portugal
8,0,0,0,0,0.0,0.0,0,0.516517,0,0,...,0.0,0.10712,0.0,0.0,0,0,8,#5 (2013),2013,USA


In [80]:
#topic score for each origin & total number of movies produced in each origin
# The 30 LDA topic columns are stored under the string labels '0'..'29'.
topic_columns = [str(topic_id) for topic_id in range(30)]
movies_by_origin = imdb_topic_df.groupby('origin')
# Average only the topic columns. Selecting them explicitly avoids relying on
# pandas silently dropping the non-numeric 'title' column (an error in pandas >= 2.0)
# and makes the separate drop of 'year' unnecessary.
topic_score_mean = movies_by_origin[topic_columns].mean()
# Number of movies per origin = number of non-null scores in topic column '0'.
num_movies = movies_by_origin['0'].count()

In [82]:
# Preview the mean topic scores of the first five origins.
topic_score_mean.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.006561,0.01476,0.007009,0.017836,0.004017,0.054678,0.153828,0.103329,0.010083,0.048052,...,0.061147,0.047057,0.01238,0.024405,0.016789,0.02396,0.012894,0.026863,0.057964,0.019931
Albania,0.00154,0.015485,0.004446,0.019431,0.003325,0.092006,0.031583,0.141454,0.021843,0.017354,...,0.042087,0.10257,0.008413,0.004577,0.104889,0.007376,0.004052,0.012282,0.104771,0.023857
Algeria,0.00972,0.00476,0.004296,0.03071,0.001522,0.119342,0.069403,0.090749,0.026078,0.020769,...,0.034813,0.081435,0.002745,0.029872,0.022597,0.0,0.014068,0.026,0.08294,0.04048
American Samoa,0.0,0.120243,0.0,0.0,0.0,0.043877,0.0,0.0,0.0,0.057441,...,0.05465,0.0,0.0,0.07771,0.0,0.030625,0.049067,0.050176,0.0,0.079651
Andorra,0.0,0.054211,0.016723,0.011782,0.00719,0.009783,0.008794,0.071936,0.023939,0.021307,...,0.118964,0.064279,0.0,0.044141,0.010504,0.069043,0.0,0.016254,0.112644,0.0


In [84]:
#check the number of origins
# The index of topic_score_mean holds one entry per country of origin.
origins = topic_score_mean.index
# print() with parentheses behaves the same on Python 2 and Python 3.
print(len(origins))
origins[:5]

220


Index([u'Afghanistan', u'Albania', u'Algeria', u'American Samoa', u'Andorra'], dtype='object', name=u'origin')

In [85]:
# Preview the movie counts of the first five origins.
num_movies.head(n=5)

origin
Afghanistan       61
Albania           22
Algeria           16
American Samoa     1
Andorra            3
Name: 0, dtype: int64

In [86]:
#get number of movies as list
# One count per origin, in the order of num_movies; zero counts are dropped.
n_movies = [count for count in num_movies.tolist() if count != 0]
# Repeat every origin's count 30 times (once per topic) so the list lines up
# row-by-row with the long-format origin/topic table built below.
# np.repeat replaces the former mid-notebook `from itertools import repeat`.
total_movies = np.repeat(n_movies, 30).tolist()

In [87]:
# For every origin, a list with its name repeated 30 times (once per topic).
origin_list = [[origin] * 30 for origin in origins]
len(origin_list)

220

In [88]:
# Labels for the 30 topics, listed in topic-column order (0..29).
# NOTE(review): the labels are written verbatim into the exported CSVs, so the
# spelling is left untouched here.
topic_list = [
    'magic, myths',
    'school, college',
    'fantasy, christmas',
    'home',
    'ships, sailing, pirates',
    'love, relationships',
    'war',
    'exploration, nature, space',
    'comedy',
    'places, nature, scenery',
    'hollywood, stars',
    'society, culture',
    'historical, costumes',
    'money, robbery',
    'photography, design',
    'spies, terrorism',
    'mixed',
    'town',
    'press, politics',
    'crime, police, underworld',
    'documentary, interview',
    'friendship, relationships',
    'cowboys and indians',
    'night life, enjoyment',
    'crime, mistery',
    'music',
    'farming, country side',
    'fantasy, fairy tale',
    'love, family',
    'gangs, drugs, police',
]
# One reference to the label list per origin.
topics = [topic_list] * len(origins)

In [89]:
# First 30 columns of the per-origin means, i.e. the topic scores.
df = topic_score_mean.iloc[:, 0:30]
# list of topic scores for each origin.
# .tolist() always returns a real list of lists; map(list, ...) is a lazy
# iterator on Python 3, which breaks len(t) here and t[i] in the next cell.
t = df.values.tolist()
len(t)

220

In [90]:
#make dataframe for visualization on Tableau public
# Long format: one row per (origin, topic) pair with that origin's mean topic score.
num_origins = range(0, len(origins))
# Build every per-origin frame first and concatenate once; calling pd.concat
# inside the loop re-copies the accumulated frame on each iteration (quadratic).
origin_frames = [
    pd.DataFrame({'origin': origin_list[i], 'topic': topics[i], 'topic_score_mean': t[i]})
    for i in num_origins
]
# pd.concat raises on an empty list, so keep the original empty-frame result.
origin_df_new = pd.concat(origin_frames) if origin_frames else pd.DataFrame()

In [91]:
#add number of movies as a column
# total_movies is already ordered row-for-row like origin_df_new, so it is
# wrapped in a Series that carries the frame's own index.
movie_counts = pd.Series(total_movies, index=origin_df_new.index)
origin_df_new['num_movies'] = movie_counts
origin_df_new.head()

Unnamed: 0,origin,topic,topic_score_mean,num_movies
0,Afghanistan,"magic, myths",0.006561,61
1,Afghanistan,"school, college",0.01476,61
2,Afghanistan,"fantasy, christmas",0.007009,61
3,Afghanistan,home,0.017836,61
4,Afghanistan,"ships, sailing, pirates",0.004017,61


In [93]:
# Tableau public plots the sum of num_movies for each origin, and every origin
# occupies 30 rows (one per topic), so the count is divided by 30 so that the
# per-origin sum equals the true number of movies.
per_topic_share = origin_df_new['num_movies'] / 30
origin_df_new['num_movies/30'] = per_topic_share
origin_df_new.head()

Unnamed: 0,origin,topic,topic_score_mean,num_movies,num_movies/30
6,USA,war,0.026705,47336,1577.866667
16,USA,mixed,0.008442,47336,1577.866667
0,USA,"magic, myths",0.008879,47336,1577.866667
2,USA,"fantasy, christmas",0.006919,47336,1577.866667
3,USA,home,0.022732,47336,1577.866667


In [94]:
#ordering the dataframe by total number of movies (largest first)
origin_df_new = origin_df_new.sort_values(by='num_movies', ascending=False)
origin_df_new.head()

Unnamed: 0,origin,topic,topic_score_mean,num_movies,num_movies/30
6,USA,war,0.026705,47336,1577.866667
1,USA,"school, college",0.036914,47336,1577.866667
16,USA,mixed,0.008442,47336,1577.866667
19,USA,"crime, police, underworld",0.010188,47336,1577.866667
20,USA,"documentary, interview",0.053554,47336,1577.866667


In [95]:
#save the sorted long-format dataframe to csv (row index is not written)
all_origins_path = dropbox + "origin_df_all_new.csv"
origin_df_new.to_csv(all_origins_path, index=False)

In [97]:
#check the number of movies produced in each origin
# One row per origin with its total movie count.
movies_per_origin = {'origin': origins, 'total_num': n_movies}
num_df = pd.DataFrame(movies_per_origin)
num_df.head()

Unnamed: 0,origin,total_num
0,Afghanistan,61
1,Albania,22
2,Algeria,16
3,American Samoa,1
4,Andorra,3


In [98]:
# Keep only the origins that produced at least 100 movies.
has_100_or_more = origin_df_new['num_movies'].ge(100)
major_origin = origin_df_new[has_100_or_more]
major_origin.head()

Unnamed: 0,origin,topic,topic_score_mean,num_movies,num_movies/30
6,USA,war,0.026705,47336,1577.866667
1,USA,"school, college",0.036914,47336,1577.866667
16,USA,mixed,0.008442,47336,1577.866667
19,USA,"crime, police, underworld",0.010188,47336,1577.866667
20,USA,"documentary, interview",0.053554,47336,1577.866667


In [99]:
major_origin.to_csv(dropbox + "origin_df_over_100.csv", index=False)