# IMDB Crawler

In [6]:
import urllib
from bs4 import BeautifulSoup as bs

## Search

In [7]:
TITLE_QUERY = (
    'http://www.imdb.com/find'
    '?q={title}&s=tt&ttype=ft&exact=true&ref_=fn_tt_ex'
)

In [15]:
movie_name="The Place Beyond the Pines"

In [16]:
def convert_title(title):
    return urllib.parse.quote(title).lower()

In [17]:
convert_title(movie_name)

'the%20place%20beyond%20the%20pines'

In [18]:
query = TITLE_QUERY.format(title=convert_title(movie_name))
search_res = bs(urllib.request.urlopen(query), "html.parser")

In [19]:
res_table = search_res.find_all("table", {"class": "findList"})[0]

In [20]:
first_row = res_table.find_all("tr")[0]

In [21]:
first_row

<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt1817273/?ref_=fn_ft_tt_1"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BMjc1OTEwNjU4N15BMl5BanBnXkFtZTcwNzUzNDIwOQ@@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt1817273/?ref_=fn_ft_tt_1">The Place Beyond the Pines</a> (2012) </td> </tr>

### Extracting the movie code

In [22]:
import re

In [23]:
MOVIE_CODE_REGEX = r'/title/([a-z0-9]+)/'

In [24]:
movie_code = re.findall(MOVIE_CODE_REGEX, str(first_row))[0]

In [25]:
movie_code

'tt1817273'

## Movie Profile

In [26]:
PROFILE_URL = 'http://www.imdb.com/title/{code}/' #?region=us

In [27]:
cur_profile_url = PROFILE_URL.format(code=movie_code)

In [28]:
prof_page = bs(urlopen(cur_profile_url), "html.parser")

### Rating

In [29]:
prof_page.find_all("span", {"itemprop": "ratingValue"})

[<span itemprop="ratingValue">7.3</span>]

In [30]:
rating = float(prof_page.find_all("span", {"itemprop": "ratingValue"})[0].contents[0])

In [31]:
rating

7.3

In [32]:
rating_count = int(prof_page.find_all("span", {"itemprop": "ratingCount"})[0].contents[0].replace(',', ''))

In [33]:
rating_count

195939

### Genres

In [34]:
genres = []

In [35]:
for span in prof_page.find_all("span", {"itemprop": "genre"}):
    genres.append(span.contents[0])

In [36]:
genres

['Crime', 'Drama', 'Thriller']

### Review counts

In [37]:
REVIEW_COUNT_REGEX = r'([0-9,]+) ([a-zA-Z]+)'

In [38]:
user_review_count = 0
critic_review_count = 0

In [39]:
for span in prof_page.find_all("span", {"itemprop": "reviewCount"}):
    span_str = span.contents[0]
    res = re.findall(REVIEW_COUNT_REGEX, span_str)[0]
    if res[1] == 'user':
        user_review_count = int(res[0].replace(',', ''))
    elif res[1] == 'critic':
        critic_review_count = int(res[0].replace(',', ''))

In [40]:
user_review_count

388

In [41]:
critic_review_count

419

### Metascore

In [42]:
metascore = int(prof_page.find_all("div", {"class": "metacriticScore"})[0].contents[1].contents[0])

In [43]:
metascore

68

### Year

In [44]:
year = int(prof_page.find_all("span", {"id": "titleYear"})[0].contents[1].contents[0])

In [45]:
year

2012

### Duration

In [46]:
MOVIE_DURATION_REGEX = r'PT([0-9]+)M'

In [47]:
duration_str = prof_page.find_all("time", {"itemprop": "duration"})[0]['datetime']

In [48]:
duration_in_minutes = int(re.findall(MOVIE_DURATION_REGEX, duration_str)[0])

In [49]:
duration_in_minutes

140

### Box office section

In [50]:
BOX_CONTENT_REGEX = r"<h3.*>Box Office</h3>([\s\S]+?)<h3"

In [51]:
box_contents = re.findall(BOX_CONTENT_REGEX, str(prof_page))[0]

In [52]:
box_contents

'\n<div class="txt-block">\n<h4 class="inline">Budget:</h4>        $15,000,000        \n\n      <span class="attribute">(estimated)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Opening Weekend:</h4>         $279,457        \n\n      (USA)\n      <span class="attribute">(29 March 2013)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Gross:</h4>        $21,383,298        \n\n      <span class="attribute">(USA)</span>\n<span class="attribute">(21 June 2013)</span>\n</div>\n<span class="see-more inline">\n<a href="business?ref_=tt_dt_bus" itemprop="url">See more</a>\xa0»\n  </span>\n<hr/>\n'

#### Budget

In [53]:
BUDGET_REGEX = r"<h4.*>Budget:</h4>\s*\$([0-9,]+)"

In [54]:
budget = int(re.findall(BUDGET_REGEX, box_contents)[0].replace(',', ''))

In [55]:
budget

15000000

#### Opening Weekend

In [56]:
from datetime import datetime

In [57]:
OPEN_DATE_REGEX = r"<h4.*>Opening Weekend:</h4>[\s\S]*?\([A-Z]+\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)[\s\S]*?<h4"

In [58]:
open_date_str = re.findall(OPEN_DATE_REGEX, box_contents)[0]

In [59]:
open_date = datetime.strptime(open_date_str, "%d %B %Y").date()

In [60]:
open_date

datetime.date(2013, 3, 29)

In [66]:
OPEN_PROF_REGEX = r"<h4.*>Opening Weekend:</h4>\s*[\$\£]([0-9,]+)"

In [67]:
opening_weekend_income = int(re.findall(OPEN_PROF_REGEX, box_contents)[0].replace(',', ''))

In [68]:
opening_weekend_income

279457

In [64]:
OPEN_PROF_CURRENCY_REGEX = r"<h4.*>Opening Weekend:</h4>\s*([\$\£])[0-9,]+"

In [69]:
opening_weekend_currency = re.findall(OPEN_PROF_CURRENCY_REGEX, box_contents)[0]
opening_weekend_currency

'$'

#### Gross

In [70]:
GROSS_DATE_REGEX = r"<h4.*>Gross:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"

In [71]:
gross_date_str = re.findall(GROSS_DATE_REGEX, box_contents)[0]

In [72]:
gross_date = datetime.strptime(gross_date_str, "%d %B %Y").date()

In [73]:
gross_date

datetime.date(2013, 6, 21)

In [74]:
GROSS_REGEX = r"<h4.*>Gross:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"

In [75]:
gross = int(re.findall(GROSS_REGEX, box_contents)[0].replace(',', ''))

In [76]:
gross

21383298

## Ratings page

In [87]:
RATINGS_URL = 'http://www.imdb.com/title/{code}/ratings'
cur_ratings_url = RATINGS_URL.format(code=movie_code)
ratings_page = bs(urlopen(cur_ratings_url), "html.parser")

In [88]:
tables = ratings_page.find_all("table")

In [89]:
def extract_table(table):
    content = []
    for row in table.find_all("tr")[1:]:
        content.append([td.get_text() for td in row.find_all("td")])
    return content

### Rating Frequency

In [90]:
hist_table = tables[0]

In [91]:
hist_content = extract_table(hist_table)

In [92]:
rating_freq = {}
for row in hist_content:
    rating_freq[int(row[2])] = int(row[0])
rating_freq

{1: 1774,
 2: 1108,
 3: 1892,
 4: 3760,
 5: 9230,
 6: 25357,
 7: 56173,
 8: 56331,
 9: 24703,
 10: 15611}

### Demographic breakdown

In [93]:
demog_table = tables[1]
demog_content = extract_table(demog_table)
demog_content

[[' Males ', '\xa0136212', '\xa07.3'],
 [' Females ', '\xa030252', '\xa07.2'],
 [' Aged under 18 ', '\xa0322', '\xa08.1'],
 [' Males under 18 ', '\xa0240', '\xa08.1'],
 [' Females under 18 ', '\xa078', '\xa07.8'],
 [' Aged 18-29 ', '\xa076850', '\xa07.5'],
 [' Males Aged 18-29 ', '\xa060767', '\xa07.6'],
 [' Females Aged 18-29 ', '\xa015223', '\xa07.4'],
 [' Aged 30-44 ', '\xa068898', '\xa07.2'],
 [' Males Aged 30-44 ', '\xa057379', '\xa07.2'],
 [' Females Aged 30-44 ', '\xa010527', '\xa07.1'],
 [' Aged 45+ ', '\xa012906', '\xa07.1'],
 [' Males Aged 45+ ', '\xa010350', '\xa07.1'],
 [' Females Aged 45+ ', '\xa02322', '\xa07.1'],
 [' IMDb staff ', '\xa021', '\xa07.0'],
 [' Top 1000 voters ', '\xa0511', '\xa06.8'],
 [' US users ', '\xa029049', '\xa07.5'],
 [' Non-US users ', '\xa099649', '\xa07.3'],
 ['\xa0'],
 [' IMDb users                         ', '\xa0195939', '\xa07.3']]

In [94]:
votes_per_demo = {}
avg_rating_per_demo = {}

In [95]:
for row in demog_content:
    try:
        votes_per_demo[row[0].strip()] = int(row[1])
        avg_rating_per_demo[row[0].strip()] = float(row[2])
    except IndexError:
        pass
print(votes_per_demo)
print(avg_rating_per_demo)

{'Aged under 18': 322, 'Males under 18': 240, 'Males Aged 45+': 10350, 'Females': 30252, 'Males Aged 18-29': 60767, 'IMDb staff': 21, 'Females Aged 30-44': 10527, 'Males Aged 30-44': 57379, 'Females Aged 45+': 2322, 'Aged 18-29': 76850, 'Females Aged 18-29': 15223, 'Aged 45+': 12906, 'Males': 136212, 'Top 1000 voters': 511, 'US users': 29049, 'Females under 18': 78, 'IMDb users': 195939, 'Aged 30-44': 68898, 'Non-US users': 99649}
{'Aged under 18': 8.1, 'Males under 18': 8.1, 'Males Aged 45+': 7.1, 'Females': 7.2, 'Males Aged 18-29': 7.6, 'IMDb staff': 7.0, 'Females Aged 30-44': 7.1, 'Males Aged 30-44': 7.2, 'Females Aged 45+': 7.1, 'Aged 18-29': 7.5, 'Females Aged 18-29': 7.4, 'Aged 45+': 7.1, 'Males': 7.3, 'Top 1000 voters': 6.8, 'US users': 7.5, 'Females under 18': 7.8, 'IMDb users': 7.3, 'Aged 30-44': 7.2, 'Non-US users': 7.3}


## Business page

In [182]:
BUSINESS_URL = 'http://www.imdb.com/title/{code}/business?ref_=tt_dt_bus'
cur_business_url = BUSINESS_URL.format(code=movie_code)
busi_page = bs(urlopen(cur_business_url), "html.parser")
busi_str = str(busi_page)

In [183]:
# #### Budget
# BUDGET_REGEX = r"<h5>Budget</h5>\n\s*\$([0-9,]+)"
# budget_dollar = int(re.findall(BUDGET_REGEX, busi_str)[0].replace(',', ''))

#### Number of screens (weekends)

In [185]:
WEEKEND_CONTENT_REGEX = r"<h5>Weekend Gross</h5>([\s\S]+?)<h5>"
weekend_contents = re.findall(WEEKEND_CONTENT_REGEX, busi_str)[0]
weekend_contents

'\n$29,577 (USA) (<a href="/date/06-23/">23 June</a> <a href="/year/2013/">2013</a>) (53 Screens)<br/>$33,275 (USA) (<a href="/date/06-16/">16 June</a> <a href="/year/2013/">2013</a>) (76 Screens)<br/>$72,647 (USA) (<a href="/date/06-09/">9 June</a> <a href="/year/2013/">2013</a>) (134 Screens)<br/>$84,670 (USA) (<a href="/date/06-02/">2 June</a> <a href="/year/2013/">2013</a>) (103 Screens)<br/>$160,957 (USA) (<a href="/date/05-26/">26 May</a> <a href="/year/2013/">2013</a>) (128 Screens)<br/>$306,969 (USA) (<a href="/date/05-19/">19 May</a> <a href="/year/2013/">2013</a>) (317 Screens)<br/>$705,333 (USA) (<a href="/date/05-12/">12 May</a> <a href="/year/2013/">2013</a>) (669 Screens)<br/>$1,280,152 (USA) (<a href="/date/05-05/">5 May</a> <a href="/year/2013/">2013</a>) (1,162 Screens)<br/>$2,699,140 (USA) (<a href="/date/04-28/">28 April</a> <a href="/year/2013/">2013</a>) (1,584 Screens)<br/>$4,917,545 (USA) (<a href="/date/04-21/">21 April</a> <a href="/year/2013/">2013</a>) (1,542

In [None]:

US_OPEN_WEEKEND_REGEX = r"\$([0-9,]+)\s*\(USA\)"
us_open_weekend = int(re.findall(US_OPEN_WEEKEND_REGEX, open_weekend_contents)[0].replace(',', ''))

In [184]:
# ### Gross Earnings
# GROSS_CONTENT_REGEX = r"<h5>Gross</h5>([\s\S]+?)<h5>"
# gross_contents = re.findall(GROSS_CONTENT_REGEX, busi_str)[0]
# GROSS_REGEX = r"<h5>Gross</h5>\n\s*\$([0-9,]+)\s*\(USA\)"
# gross_inc_dollar = int(re.findall(GROSS_REGEX, busi_str)[0].replace(',', ''))

# Uniting Dataframes

In [96]:
import pandas as pd

In [179]:
df = pd.read_csv('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles.csv')

In [180]:
df.columns

Index(['Unnamed: 0', 'budget', 'budget_currency', 'closing_date',
       'critic_review_count', 'duration', 'gross_income', 'metascore', 'name',
       'opening_weekend_date', 'opening_weekend_income',
       'opening_weekend_income_currency', 'rating', 'rating_count',
       'user_review_count', 'year', 'avg_rating_per_demo.aged_18-29',
       'avg_rating_per_demo.aged_30-44', 'avg_rating_per_demo.aged_45+',
       'avg_rating_per_demo.aged_under_18', 'avg_rating_per_demo.females',
       'avg_rating_per_demo.females_aged_18-29',
       'avg_rating_per_demo.females_aged_30-44',
       'avg_rating_per_demo.females_aged_45+',
       'avg_rating_per_demo.females_under_18',
       'avg_rating_per_demo.imdb_staff', 'avg_rating_per_demo.imdb_users',
       'avg_rating_per_demo.males', 'avg_rating_per_demo.males_aged_18-29',
       'avg_rating_per_demo.males_aged_30-44',
       'avg_rating_per_demo.males_aged_45+',
       'avg_rating_per_demo.males_under_18',
       'avg_rating_per_demo.non-

In [175]:
import os
from rotten_needles.imdb_crawl.jsondate import (load, dump)

In [176]:
profiles = []
for profile_file in os.listdir('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles'):
    print('Reading {}'.format(profile_file))
    file_path = os.path.join('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles', profile_file)
    with open(file_path, 'r') as json_file:
        profiles.append(load(json_file))
df = pd.DataFrame(profiles)

Reading (500)_days_of_summer.json
Reading 12_angry_men.json
Reading 21_jump_street.json
Reading 3:10_to_yuma.json
Reading 300.json
Reading 30_minutes_or_less.json
Reading a.i._artificial_intelligence.json
Reading a_beautiful_mind.json
Reading a_bronx_tale.json
Reading a_clockwork_orange.json
Reading a_few_good_men.json
Reading a_nightmare_on_elm_street.json
Reading a_prophet.json
Reading a_separation.json
Reading a_very_harold_&_kumar_3d_christmas.json
Reading accepted.json
Reading adaptation..json
Reading alien.json
Reading aliens.json
Reading american_beauty.json
Reading american_gangster.json
Reading american_history_x.json
Reading american_psycho.json
Reading amores_perros.json
Reading argo.json
Reading as_good_as_it_gets.json
Reading barton_fink.json
Reading basic_instinct.json
Reading batman.json
Reading batman_begins.json
Reading batman_forever.json
Reading batman_returns.json
Reading before_sunrise.json
Reading before_sunset.json
Reading being_john_malkovich.json
Reading biutif

In [177]:
df.ix[0]

avg_rating_per_demo                {'Aged under 18': 7.9, 'Males under 18': 8.0, ...
budget                                                                       7.5e+06
budget_currency                                                                    $
closing_date                                                              2009-11-20
critic_review_count                                                              333
duration                                                                          95
genres                                                      [Comedy, Drama, Romance]
gross_income                                                             3.23914e+07
metascore                                                                         76
name                                                            (500) Days of Summer
opening_weekend_date                                                      2009-07-17
opening_weekend_income                                           

In [178]:
DEMOGRAPHICS = ['Aged under 18', 'Males under 18', 'Males Aged 45+', 'Females', 'Males Aged 18-29', 'IMDb staff', 'IMDb users', 'Males', 'Aged 30-44', 'Females Aged 45+', 'Aged 18-29', 'Females Aged 18-29', 'Aged 45+', 'Males Aged 30-44', 'Top 1000 voters', 'Females under 18', 'Females Aged 30-44', 'US users', 'Non-US users']

In [126]:
DEMOGRAPHICS = ['aged_under_18',
 'males_under_18',
 'males_aged_45+',
 'females',
 'males_aged_18-29',
 'imdb_staff',
 'imdb_users',
 'males',
 'aged_30-44',
 'females_aged_45+',
 'aged_18-29',
 'females_aged_18-29',
 'aged_45+',
 'males_aged_30-44',
 'top_1000_voters',
 'females_under_18',
 'females_aged_30-44',
 'us_users',
 'non-us_users']

In [117]:
def _parse_string(string):
    return string.lower().strip().replace(' ', '_')

In [137]:
def decompose_dict_column(df, colname, allowed_cols):
    newdf = df[colname].apply(pd.Series)
    newdf = newdf.drop([col for col in newdf.columns if col not in allowed_cols], axis=1)
    newdf.columns = [colname+'.'+col for col in newdf.columns]
    return pd.concat([df.drop([colname], axis=1), newdf], axis=1)

In [138]:
decompose_dict_column(df, 'avg_rating_per_demo', DEMOGRAPHICS);

  union = _union_indexes(indexes)


In [None]:
decompose_dict_column(df, 'votes_per_demo', DEMOGRAPHICS);

In [140]:
decompose_dict_column(df, 'rating_freq', [str(i) for i in range(1,11)])

  union = _union_indexes(indexes)


Unnamed: 0,avg_rating_per_demo,budget,budget_currency,closing_date,critic_review_count,duration,genres,gross_income,metascore,name,...,rating_freq.1,rating_freq.10,rating_freq.2,rating_freq.3,rating_freq.4,rating_freq.5,rating_freq.6,rating_freq.7,rating_freq.8,rating_freq.9
0,"{'Aged under 18': 7.9, 'Males under 18': 8.0, ...",7.500000e+06,$,2009-11-20,333.0,95.0,"[Comedy, Drama, Romance]",32391374.0,76.0,(500) Days of Summer,...,3527.0,57801.0,2057.0,3044.0,5937.0,13495.0,33513.0,81826.0,119093.0,71284.0
1,"{'Aged under 18': 9.2, 'Males under 18': 9.2, ...",3.500000e+05,$,,180.0,96.0,"[Crime, Drama]",,,12 Angry Men,...,6819.0,169710.0,1075.0,1171.0,1756.0,4120.0,9270.0,32027.0,92040.0,158009.0
2,"{'Aged under 18': 7.9, 'Males under 18': 7.9, ...",4.200000e+07,$,2012-06-29,374.0,110.0,"[Action, Comedy, Crime]",138447667.0,69.0,21 Jump Street,...,3725.0,42056.0,2405.0,3987.0,8193.0,21037.0,57949.0,127181.0,111662.0,45719.0
3,"{'Aged under 18': 7.8, 'Males under 18': 7.8, ...",5.500000e+07,$,2007-11-02,296.0,122.0,"[Adventure, Crime, Drama]",53574088.0,76.0,3:10 to Yuma,...,2209.0,28532.0,791.0,1251.0,2470.0,6066.0,18145.0,55124.0,86938.0,43147.0
4,"{'Aged under 18': 7.7, 'Males under 18': 7.7, ...",6.500000e+07,$,2007-07-06,458.0,117.0,"[Action, Fantasy]",210592590.0,52.0,300,...,20297.0,123392.0,5817.0,7623.0,11451.0,22015.0,48646.0,110633.0,167485.0,109762.0
5,"{'Aged under 18': 6.4, 'Males under 18': 6.5, ...",2.800000e+10,$,2011-09-23,219.0,83.0,"[Action, Comedy, Crime]",37053924.0,49.0,30 Minutes or Less,...,1263.0,3233.0,1129.0,2162.0,4919.0,12046.0,23884.0,20789.0,8241.0,2388.0
6,"{'Aged under 18': 7.4, 'Males under 18': 7.3, ...",1.000000e+08,$,2001-09-21,284.0,146.0,"[Adventure, Drama, Sci-Fi]",78616689.0,65.0,A.I. Artificial Intelligence,...,5264.0,27220.0,3122.0,4701.0,8108.0,16121.0,34887.0,63122.0,54479.0,29223.0
7,"{'Aged under 18': 8.5, 'Males under 18': 8.4, ...",5.800000e+07,$,2002-05-24,204.0,135.0,"[Biography, Drama]",170708996.0,72.0,A Beautiful Mind,...,3319.0,116968.0,1431.0,2277.0,4134.0,9950.0,28432.0,93993.0,211547.0,172520.0
8,"{'Aged under 18': 8.1, 'Males under 18': 8.1, ...",2.200000e+07,$,,52.0,121.0,"[Crime, Drama]",17266971.0,80.0,A Bronx Tale,...,401.0,13525.0,194.0,311.0,724.0,2319.0,7300.0,24263.0,35801.0,16566.0
9,"{'Aged under 18': 9.0, 'Males under 18': 9.0, ...",2.200000e+06,$,,220.0,136.0,"[Crime, Drama, Sci-Fi]",,78.0,A Clockwork Orange,...,10762.0,155374.0,4405.0,5479.0,7774.0,14324.0,29787.0,71918.0,134639.0,150877.0


In [150]:
genre_set = set([genre for genre_list in df.genres.dropna() for genre in genre_list])
genre_set

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [170]:
def dummy_list_column(df, colname):
    value_set = set([value for value_list in df[colname].dropna() for value in value_list])
    def value_list_to_dict(value_list):
        try:
            return {value : 1 if value in value_list else 0 for value in value_set}
        except TypeError:
            return {value : 0 for value in value_set}
    df[colname] = df[colname].apply(value_list_to_dict)
    return decompose_dict_column(df, colname, list(value_set))

In [171]:
dummy_list_column(df, 'genres')

Unnamed: 0,avg_rating_per_demo,budget,budget_currency,closing_date,critic_review_count,duration,gross_income,metascore,name,opening_weekend_date,...,genres.Horror,genres.Music,genres.Musical,genres.Mystery,genres.Romance,genres.Sci-Fi,genres.Sport,genres.Thriller,genres.War,genres.Western
0,"{'Aged under 18': 7.9, 'Males under 18': 8.0, ...",7.500000e+06,$,2009-11-20,333.0,95.0,32391374.0,76.0,(500) Days of Summer,2009-07-17,...,0,0,0,0,1,0,0,0,0,0
1,"{'Aged under 18': 9.2, 'Males under 18': 9.2, ...",3.500000e+05,$,,180.0,96.0,,,12 Angry Men,,...,0,0,0,0,0,0,0,0,0,0
2,"{'Aged under 18': 7.9, 'Males under 18': 7.9, ...",4.200000e+07,$,2012-06-29,374.0,110.0,138447667.0,69.0,21 Jump Street,2012-03-16,...,0,0,0,0,0,0,0,0,0,0
3,"{'Aged under 18': 7.8, 'Males under 18': 7.8, ...",5.500000e+07,$,2007-11-02,296.0,122.0,53574088.0,76.0,3:10 to Yuma,2007-09-07,...,0,0,0,0,0,0,0,0,0,0
4,"{'Aged under 18': 7.7, 'Males under 18': 7.7, ...",6.500000e+07,$,2007-07-06,458.0,117.0,210592590.0,52.0,300,2007-03-09,...,0,0,0,0,0,0,0,0,0,0
5,"{'Aged under 18': 6.4, 'Males under 18': 6.5, ...",2.800000e+10,$,2011-09-23,219.0,83.0,37053924.0,49.0,30 Minutes or Less,2011-08-12,...,0,0,0,0,0,0,0,0,0,0
6,"{'Aged under 18': 7.4, 'Males under 18': 7.3, ...",1.000000e+08,$,2001-09-21,284.0,146.0,78616689.0,65.0,A.I. Artificial Intelligence,2001-06-29,...,0,0,0,0,0,1,0,0,0,0
7,"{'Aged under 18': 8.5, 'Males under 18': 8.4, ...",5.800000e+07,$,2002-05-24,204.0,135.0,170708996.0,72.0,A Beautiful Mind,2001-12-21,...,0,0,0,0,0,0,0,0,0,0
8,"{'Aged under 18': 8.1, 'Males under 18': 8.1, ...",2.200000e+07,$,,52.0,121.0,17266971.0,80.0,A Bronx Tale,,...,0,0,0,0,0,0,0,0,0,0
9,"{'Aged under 18': 9.0, 'Males under 18': 9.0, ...",2.200000e+06,$,,220.0,136.0,,78.0,A Clockwork Orange,,...,0,0,0,0,0,1,0,0,0,0


In [161]:
df.replace?

In [162]:
df['genres'] = df['genres'].apply(genre_list_to_dict)