## Work Flow
1. Get the list of top 50 movies per year from boxofficemojo.com for 4 countries and 5 years (~250 movies for each country)
2. Get each movie's origin (production country) from its wiki page
3. Collect the wiki url for each movie
4. Scrape the plot section from each wiki page
5. Clean the plot text: remove HTML tags
6. Add some popularity measures: sales data for each movie
7. Merge the 4 dataframes into one

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options for this notebook: display width of 500, up to 100
# columns shown, and HTML rendering of dataframes in the notebook
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn defaults ("whitegrid" style, "poster" context) for any figures drawn later
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
import urllib2
from bs4 import BeautifulSoup

In [3]:
# Popular movies in the US (top 50 movies for each year, 2010~2014).
# US.csv is read from the notebook's working directory; each row carries the
# movie title, its origin and the Wikipedia url (url_wiki) used for scraping below.
df_us=pd.read_csv("US.csv")
# preview the first five rows
df_us.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki
0,2014,US,American Sniper,boxofficemojo,United States,https://en.wikipedia.org/wiki/American_Sniper
1,2014,US,The Hunger Games: Mockingjay - Part 1,boxofficemojo,United States,https://en.wikipedia.org/wiki/The_Hunger_Games...
2,2014,US,Guardians of the Galaxy,boxofficemojo,United States,https://en.wikipedia.org/wiki/Guardians_of_the...
3,2014,US,Captain America: The Winter Soldier,boxofficemojo,United States,https://en.wikipedia.org/wiki/Captain_America:...
4,2014,US,The LEGO Movie,boxofficemojo,"United States, Australia,Denmark",https://en.wikipedia.org/wiki/The_Lego_Movie


In [4]:
# (rows, columns) of the US table
df_us.shape

(271, 6)

In [5]:
# wiki-url list: one Wikipedia url per row of df_us, in row order
wikis_us= df_us['url_wiki'].tolist()

In [8]:
#scraping movie plots : list of wiki plots for US top movies
# For every url in wikis_us, collect the HTML of each element that follows the
# page's second <h2> heading, stopping at the next <h2>. plots_us ends up with
# exactly one entry (a list of HTML strings) per url, in the same order.
# NOTE(review): this assumes the second <h2> on the page is the plot heading -- confirm per page.
plots_us = []
for wiki in wikis_us:
    # read the page and always release the connection, even if the read fails
    response = urllib2.urlopen(wiki)
    try:
        html = response.read()
    finally:
        response.close()
    # name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(html, "html.parser")
    hs = soup.find_all('h2')
    contents_us = []
    # a page with fewer than two <h2> headings would raise IndexError on hs[1]
    # and abort the whole crawl; it gets an empty entry instead, which keeps
    # plots_us aligned one-to-one with wikis_us
    if len(hs) > 1:
        for section in hs[1].find_next_siblings():
            if section.name == 'h2':
                break
            contents_us.append(unicode(section))
    plots_us.append(contents_us)

In [9]:
# number of scraped plot entries; expected to equal the row count of df_us
len(plots_us)

271

In [10]:
# Add plots_us as column 'wiki_plot' of df_us. plots_us was built in the same
# order as the rows of df_us, so reusing df_us.index lines each plot up with its movie.
# Each cell of wiki_plot holds a list of raw HTML fragments (tags not yet removed).
df_us['wiki_plot'] = pd.Series(plots_us, index=df_us.index)
df_us.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki,wiki_plot
0,2014,US,American Sniper,boxofficemojo,United States,https://en.wikipedia.org/wiki/American_Sniper,"[<p>Growing up in <a href=""/wiki/Texas"" title=..."
1,2014,US,The Hunger Games: Mockingjay - Part 1,boxofficemojo,United States,https://en.wikipedia.org/wiki/The_Hunger_Games...,[<p>After being rescued from the destroyed are...
2,2014,US,Guardians of the Galaxy,boxofficemojo,United States,https://en.wikipedia.org/wiki/Guardians_of_the...,"[<p>In 1988, following his mother's death, a y..."
3,2014,US,Captain America: The Winter Soldier,boxofficemojo,United States,https://en.wikipedia.org/wiki/Captain_America:...,"[<p>Two years after <a href=""/wiki/The_Avenger..."
4,2014,US,The LEGO Movie,boxofficemojo,"United States, Australia,Denmark",https://en.wikipedia.org/wiki/The_Lego_Movie,"[<p>In the <a href=""/wiki/Lego"" title=""Lego"">L..."


In [12]:
# Popular movies in Germany (top 50 movies for each year, 2010~2014).
# GE.csv is read from the notebook's working directory; each row carries the
# movie title, its origin and the Wikipedia url (url_wiki) used for scraping below.
df_ge=pd.read_csv("GE.csv")
# preview the first five rows
df_ge.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki
0,2014,Germany,The Hobbit: The Battle of the Five Armies,boxofficemojo,"New Zealand,United States",https://en.wikipedia.org/wiki/The_Hobbit:_The_...
1,2014,Germany,Honig im Kopf,boxofficemojo,Germany,https://en.wikipedia.org/wiki/Head_Full_of_Honey
2,2014,Germany,Transformers: Age of Extinction,boxofficemojo,"United States, China",https://en.wikipedia.org/wiki/Transformers:_Ag...
3,2014,Germany,The Hunger Games: Mockingjay - Part 1,boxofficemojo,United States,https://en.wikipedia.org/wiki/The_Hunger_Games...
4,2014,Germany,Qu'est-ce qu'on a fait au Bon Dieu?,boxofficemojo,France,https://en.wikipedia.org/wiki/Serial_(Bad)_Wed...


In [13]:
# (rows, columns) of the Germany table
df_ge.shape

(238, 6)

In [14]:
# wiki-url list: one Wikipedia url per row of df_ge, in row order
wikis_ge= df_ge['url_wiki'].tolist()

In [15]:
#scraping movie plots : list of wiki plots for GE top movies
# For every url in wikis_ge, collect the HTML of each element that follows the
# page's second <h2> heading, stopping at the next <h2>. plots_ge ends up with
# exactly one entry (a list of HTML strings) per url, in the same order.
# NOTE(review): this assumes the second <h2> on the page is the plot heading -- confirm per page.
plots_ge = []
for wiki in wikis_ge:
    # read the page and always release the connection, even if the read fails
    response = urllib2.urlopen(wiki)
    try:
        html = response.read()
    finally:
        response.close()
    # name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(html, "html.parser")
    hs = soup.find_all('h2')
    contents_ge = []
    # a page with fewer than two <h2> headings would raise IndexError on hs[1]
    # and abort the whole crawl; it gets an empty entry instead, which keeps
    # plots_ge aligned one-to-one with wikis_ge
    if len(hs) > 1:
        for section in hs[1].find_next_siblings():
            if section.name == 'h2':
                break
            contents_ge.append(unicode(section))
    plots_ge.append(contents_ge)

In [16]:
# number of scraped plot entries; expected to equal the row count of df_ge
len(plots_ge)

238

In [17]:
# Add plots_ge as column 'wiki_plot' of df_ge. plots_ge was built in the same
# order as the rows of df_ge, so reusing df_ge.index lines each plot up with its movie.
# Each cell of wiki_plot holds a list of raw HTML fragments (tags not yet removed).
df_ge['wiki_plot'] = pd.Series(plots_ge, index=df_ge.index)
df_ge.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki,wiki_plot
0,2014,Germany,The Hobbit: The Battle of the Five Armies,boxofficemojo,"New Zealand,United States",https://en.wikipedia.org/wiki/The_Hobbit:_The_...,"[<p><a href=""/wiki/Bilbo_Baggins"" title=""Bilbo..."
1,2014,Germany,Honig im Kopf,boxofficemojo,Germany,https://en.wikipedia.org/wiki/Head_Full_of_Honey,"[<p>Retired <a class=""mw-redirect"" href=""/wiki..."
2,2014,Germany,Transformers: Age of Extinction,boxofficemojo,"United States, China",https://en.wikipedia.org/wiki/Transformers:_Ag...,"[<table class=""metadata plainlinks ambox ambox..."
3,2014,Germany,The Hunger Games: Mockingjay - Part 1,boxofficemojo,United States,https://en.wikipedia.org/wiki/The_Hunger_Games...,[<p>After being rescued from the destroyed are...
4,2014,Germany,Qu'est-ce qu'on a fait au Bon Dieu?,boxofficemojo,France,https://en.wikipedia.org/wiki/Serial_(Bad)_Wed...,"[<p>Claude Verneuil, a <a href=""/wiki/Gaullism..."


In [18]:
# Popular movies in South Korea (top 50 movies for each year, 2010~2014).
# KR.csv is read from the notebook's working directory; each row carries the
# movie title, its origin and the Wikipedia url (url_wiki) used for scraping below.
df_kr=pd.read_csv("KR.csv")
# preview the first five rows
df_kr.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki
0,2014,KR,Myeong-ryang (The Admiral:Roaring Currents),boxofficemojo,South Korea,https://en.wikipedia.org/wiki/The_Admiral:_Roa...
1,2014,KR,Ode to My Father,boxofficemojo,South Korea,https://en.wikipedia.org/wiki/Ode_to_My_Father
2,2014,KR,Frozen (2013),boxofficemojo,United States,https://en.wikipedia.org/wiki/Frozen_(2013_film)
3,2014,KR,Interstellar,boxofficemojo,"United States,United Kingdom",https://en.wikipedia.org/wiki/Interstellar_(film)
4,2014,KR,The Pirates (2014),boxofficemojo,South Korea,https://en.wikipedia.org/wiki/The_Pirates_(201...


In [19]:
# (rows, columns) of the Korea table
df_kr.shape

(240, 6)

In [20]:
# wiki-url list: one Wikipedia url per row of df_kr, in row order
wikis_kr= df_kr['url_wiki'].tolist()

In [21]:
#scraping movie plots : list of wiki plots for KR top movies
# For every url in wikis_kr, collect the HTML of each element that follows the
# page's second <h2> heading, stopping at the next <h2>. plots_kr ends up with
# exactly one entry (a list of HTML strings) per url, in the same order.
# NOTE(review): this assumes the second <h2> on the page is the plot heading -- confirm per page.
plots_kr = []
for wiki in wikis_kr:
    # read the page and always release the connection, even if the read fails
    response = urllib2.urlopen(wiki)
    try:
        html = response.read()
    finally:
        response.close()
    # name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(html, "html.parser")
    hs = soup.find_all('h2')
    contents_kr = []
    # a page with fewer than two <h2> headings would raise IndexError on hs[1]
    # and abort the whole crawl; it gets an empty entry instead, which keeps
    # plots_kr aligned one-to-one with wikis_kr
    if len(hs) > 1:
        for section in hs[1].find_next_siblings():
            if section.name == 'h2':
                break
            contents_kr.append(unicode(section))
    plots_kr.append(contents_kr)

In [22]:
# number of scraped plot entries; expected to equal the row count of df_kr
len(plots_kr)

240

In [23]:
# Add plots_kr as column 'wiki_plot' of df_kr. plots_kr was built in the same
# order as the rows of df_kr, so reusing df_kr.index lines each plot up with its movie.
# Each cell of wiki_plot holds a list of raw HTML fragments (tags not yet removed).
df_kr['wiki_plot'] = pd.Series(plots_kr, index=df_kr.index)
df_kr.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki,wiki_plot
0,2014,KR,Myeong-ryang (The Admiral:Roaring Currents),boxofficemojo,South Korea,https://en.wikipedia.org/wiki/The_Admiral:_Roa...,[<p>The film revolves around the titular <a hr...
1,2014,KR,Ode to My Father,boxofficemojo,South Korea,https://en.wikipedia.org/wiki/Ode_to_My_Father,"[<p>During the <a class=""mw-redirect"" href=""/w..."
2,2014,KR,Frozen (2013),boxofficemojo,United States,https://en.wikipedia.org/wiki/Frozen_(2013_film),"[<p><a href=""/wiki/Elsa_(Disney)"" title=""Elsa ..."
3,2014,KR,Interstellar,boxofficemojo,"United States,United Kingdom",https://en.wikipedia.org/wiki/Interstellar_(film),"[<p>Widespread catastrophic crop <a href=""/wik..."
4,2014,KR,The Pirates (2014),boxofficemojo,South Korea,https://en.wikipedia.org/wiki/The_Pirates_(201...,[<p>On the eve of the founding of the <a class...


In [24]:
# Popular movies in the UK (top 50 movies for each year, 2010~2014).
# UK.csv is read from the notebook's working directory; each row carries the
# movie title, its origin and the Wikipedia url (url_wiki) used for scraping below.
df_uk=pd.read_csv("UK.csv")
# preview the first five rows
df_uk.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki
0,2014,UK,The Hobbit: The Battle of the Five Armies,boxofficemojo,"New Zealand,United States",https://en.wikipedia.org/wiki/The_Hobbit:_The_...
1,2014,UK,Paddington,boxofficemojo,"United Kingdom, France",https://en.wikipedia.org/wiki/Paddington_(film)
2,2014,UK,The Lego Movie,boxofficemojo,"United States, Australia,Denmark",https://en.wikipedia.org/wiki/The_Lego_Movie
3,2014,UK,The Inbetweeners 2,boxofficemojo,United Kingdom,https://en.wikipedia.org/wiki/The_Inbetweeners_2
4,2014,UK,Dawn of the Planet of the Apes,boxofficemojo,United States,https://en.wikipedia.org/wiki/Dawn_of_the_Plan...


In [25]:
# (rows, columns) of the UK table
df_uk.shape

(248, 6)

In [26]:
# wiki-url list: one Wikipedia url per row of df_uk, in row order
wikis_uk= df_uk['url_wiki'].tolist()

In [27]:
#scraping movie plots : list of wiki plots for UK top movies
# For every url in wikis_uk, collect the HTML of each element that follows the
# page's second <h2> heading, stopping at the next <h2>. plots_uk ends up with
# exactly one entry (a list of HTML strings) per url, in the same order.
# NOTE(review): this assumes the second <h2> on the page is the plot heading -- confirm per page.
plots_uk = []
for wiki in wikis_uk:
    # read the page and always release the connection, even if the read fails
    response = urllib2.urlopen(wiki)
    try:
        html = response.read()
    finally:
        response.close()
    # name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(html, "html.parser")
    hs = soup.find_all('h2')
    contents_uk = []
    # a page with fewer than two <h2> headings would raise IndexError on hs[1]
    # and abort the whole crawl; it gets an empty entry instead, which keeps
    # plots_uk aligned one-to-one with wikis_uk
    if len(hs) > 1:
        for section in hs[1].find_next_siblings():
            if section.name == 'h2':
                break
            contents_uk.append(unicode(section))
    plots_uk.append(contents_uk)

In [28]:
# number of scraped plot entries; expected to equal the row count of df_uk
len(plots_uk)

248

In [29]:
# Add plots_uk as column 'wiki_plot' of df_uk. plots_uk was built in the same
# order as the rows of df_uk, so reusing df_uk.index lines each plot up with its movie.
# Each cell of wiki_plot holds a list of raw HTML fragments (tags not yet removed).
df_uk['wiki_plot'] = pd.Series(plots_uk, index=df_uk.index)
df_uk.head(5)

Unnamed: 0,year,country,movie,source,origin,url_wiki,wiki_plot
0,2014,UK,The Hobbit: The Battle of the Five Armies,boxofficemojo,"New Zealand,United States",https://en.wikipedia.org/wiki/The_Hobbit:_The_...,"[<p><a href=""/wiki/Bilbo_Baggins"" title=""Bilbo..."
1,2014,UK,Paddington,boxofficemojo,"United Kingdom, France",https://en.wikipedia.org/wiki/Paddington_(film),"[<p>In the deep jungles of darkest <a href=""/w..."
2,2014,UK,The Lego Movie,boxofficemojo,"United States, Australia,Denmark",https://en.wikipedia.org/wiki/The_Lego_Movie,"[<p>In the <a href=""/wiki/Lego"" title=""Lego"">L..."
3,2014,UK,The Inbetweeners 2,boxofficemojo,United Kingdom,https://en.wikipedia.org/wiki/The_Inbetweeners_2,"[<p><a class=""mw-redirect"" href=""/wiki/Will_Mc..."
4,2014,UK,Dawn of the Planet of the Apes,boxofficemojo,United States,https://en.wikipedia.org/wiki/Dawn_of_the_Plan...,[<p>Ten years after the worldwide pandemic of ...


In [30]:
#save 4 dataframes as csv
# Write each country's dataframe (now carrying the wiki_plot column) to its
# own csv file, leaving the row index out of the file.
for frame, csv_name in (
    (df_us, "df_us.csv"),
    (df_ge, "df_ge.csv"),
    (df_uk, "df_uk.csv"),
    (df_kr, "df_kr.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# TODO: clean the plot text: remove HTML tags such as <p>, </p>


In [None]:
# TODO: add popularity measures: dvd sales data


In [None]:
# TODO: merge the 4 dataframes into 1 dataframe
