In [74]:
import pandas as pd
from imdb import IMDb
import numpy as np
from bs4 import BeautifulSoup
import urllib.request

In [2]:
#read the CSV files
movies_df = pd.read_csv('movies.csv')
links_df = pd.read_csv('links.csv')

In [3]:
#look at the data
movies_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
#links dataframe contains a cross-reference between id
#in the provided dataset and id on imdb.com
links_df.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [5]:
#join both the dataframes based on movieId
movies_df=movies_df.set_index('movieId')
links_df=links_df.set_index('movieId')
movies_df=movies_df.join(links_df)
movies_df['imdbId']=movies_df.imdbId.apply(str)
movies_df.head(2)

Unnamed: 0_level_0,title,genres,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0


In [7]:
#exclude the documentaries
movies_df=movies_df[~(movies_df.genres.str.contains('Docu'))]

In [8]:
movies_df['year']=movies_df.title.apply(lambda x:(x.strip())[-5:-1])
movies_df.head(2)

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,1995


In [9]:
#List the items that do not have a numeric year
movies_df[~movies_df.year.str.isnumeric()]

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
108548,"Big Bang Theory, The (2007-)",Comedy,898266,,007-
143410,Hyena Road,(no genres listed),4034452,316042.0,Roa
151307,The Lovers and the Despot,(no genres listed),5278868,373355.0,espo
162376,Stranger Things,Drama,4574334,410612.0,hing


In [10]:
#Remove the items that do not have a numeric year
movies_df=movies_df[movies_df.year.str.isnumeric()]

In [22]:
im = IMDb(accessSystem='html',adultSearch=0)

In [59]:
(im.get_movie(movies_df['imdbId'].iloc[0])['cast'][0]).getID()

'0000158'

In [58]:
th.getID()

'0000158'

In [43]:
movies_df['imdbId']=movies_df.imdbId.apply(str)
movies_df.head(2)

Unnamed: 0_level_0,title,genres,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0


In [82]:
def count_movies(movie_list,year):
    '''
    given a list of movies and a year, this function counts the
    number of movies released before the given year
    '''
    ct=0
    for movie in movie_list:
        if movie.has_key('year'):
            if movie['year']<=year:
                ct=ct+1
        else:
            # the movie may be a tv series and no year key was found
            # however, it will still be included in the count
            ct=ct+1
    return ct

def get_movie_count(person_id,year,drctr_search=False):
    '''
    returns the number of movies the person has acted in or 
    directed till the year
    inputs:
    1. person id
    2. year
    3. indictor for director search
    output:
    1. number of movies the person has acted in till the year
    '''
    try:
        prsn=im.get_person(person_id)
    except Exception as e:
        print('Error while searching for person %s not found. \
          Please check the id' % person_id)
        return 0
    if ((drctr_search==True) & (prsn.has_key('director'))):
        count=count_movies(prsn['director'],year)
    elif ((drctr_search==False) & (prsn.has_key('actor'))):
        count=count_movies(prsn['actor'],year)
    elif ((drctr_search==False) & (prsn.has_key('actress'))):
        count=count_movies(prsn['actress'],year)
    else:
        count=0
    return count

def get_cast_mv_details_from_mvie(movieId):
    '''
    given a movie id, this function checks if the movie id
    is a valid movie id and return the details of the movie
    if the id is valid
    A movie id is decided to be valid if the 'kind' is movie and 
    'English' is one of the movie languages and 'USA' is one of 
    the countries in which the movie was released. Basically, we 
    would like to include only movies (not TV Shows or documentaries) 
    that have been made in English language and has been released in
    USA to build the data.
    If a movie is found to be valid, this function returns the following
    as a dict:kind, year of release, plot of the movie, production studio
    and the score of the first 4 actors in the movie and the score of the 
    director. 
    An actor's score is just the total number of movies the actor
    has acted in till the year in which the movie was released. Similarly,
    the score of the director is the total number of moves the director
    has directed till the year in which the movie was released.
    Input:
    1. Movie Id (String)
    Output:
    1. Dict containing details of the movie
    '''
    movie_details = {'found':False,'kind':'','actor1_name':'','plot':'','year':np.NaN,
                'actor1_score':'','actor2_name':'','actor2_score':'',
                'actor3_name':'','actor3_score':'','actor4_name':'',
                'actor4_score':'','director':'','director_score':'',
                'studio':'','release_date':''}
    movie=im.get_movie(movieId)
    print('kind:',movie['kind'],(movie['kind']!='movie'))
    print('lang:',movie['language'],('English' not in movie['language']))
    print('country:',movie['country'],('USA' not in movie['country']))
    if ((movie['kind']!='movie') | ('English' not in movie['language']) | ('USA' not in movie['country'])):
        return movie_details
    else:
        movie_details['found']=True
        movie_details['kind']=movie['kind']
        movie_details['plot']=movie['plot']
        movie_details['year']=movie['year']
        movie_details['studio']=movie['production companies'][0]['name']
        actor1=im.get_person(movie['cast'][0].getID())
        actor2=im.get_person(movie['cast'][1].getID())
        actor3=im.get_person(movie['cast'][2].getID())
        actor4=im.get_person(movie['cast'][3].getID())
        director=im.get_person(movie['director'][0].getID())
        movie_details['actor1_name']=actor1['name']
        movie_details['actor1_score']=get_movie_count(actor1.getID(),movie_details['year'],drctr_search=False)
        movie_details['actor2_name']=actor2['name']
        movie_details['actor2_score']=get_movie_count(actor2.getID(),movie_details['year'],drctr_search=False)
        movie_details['actor3_name']=actor3['name']
        movie_details['actor3_score']=get_movie_count(actor3.getID(),movie_details['year'],drctr_search=False)
        movie_details['actor4_name']=actor4['name']
        movie_details['actor4_score']=get_movie_count(actor4.getID(),movie_details['year'],drctr_search=False)
        movie_details['director']=director['name']
        movie_details['director_score']=get_movie_count(director.getID(),movie_details['year'],drctr_search=True)
        return movie_details
    
def get_movie_budget_imdb(movie_id):
    '''
    imdbpy does not have an inbuilt method to
    get the movie budget. This function gets
    the movie budget values from imdb.com
    input
    1. movie id (string)
    output
    1. the budget (int) of the movie if the
    movie id is found or a string 'BudgetNotFound'
    if the movie is not found
    '''
    urlpre='http://www.imdb.com/title/tt'
    urlpost='/business'
    #complete URL
    urlfull=urlpre+movie_id+urlpost

    #get the page to the IMDB business page
    url = urllib.request.urlopen(urlfull)
    page = url.read()

    #read it into bs4
    soup = BeautifulSoup(page, 'html.parser')

    #navigate to the table div that has the budget details
    pagetext=(soup.find("div",{'id':'tn15content'})).text

    #parse the budget numbers
    if pagetext.find('Budget')==1:
        budget_str=pagetext[pagetext.find('$')+1:pagetext.find(' ')]
    else:
        #Budget not found
        return('BudgetNotFound')

    try:
        #convert to int and return
        budget=int((pagetext[pagetext.find('$')+1:pagetext.find(' ')]).replace(',',''))
        return budget
    except Exception as e:
        #error while converting to int
        return('BudgetNotFound')
    
    
def get_movie_4thWkndBo_BoMojo(m_title):
    urlpre='http://www.boxofficemojo.com/movies/?page=weekend&id='
    urlfull=urlpre+m_title[0:m_title.find('(')].replace(' ','')+'.htm'

    #print(urlfull)
    #get the page to the weekend collection chart
    url = urllib.request.urlopen(urlfull)
    page = url.read()

    #read it into bs4
    soup = BeautifulSoup(page, 'html.parser')

    #navigate to the table that shows the box-office chart
    #the charts are by years
    bochart=soup.findAll("table",{'chart-wide'})

    #to get the box office collection for the fourth weekend
    #get the rows of the first chart
    if len(bochart)>0:
        #valid movie title
        rows=bochart[0].findAll('tr')
    else:
        return('Not Found. URL:'+urlfull)

    #check if the table has atleast 4 rows excluding header
    #this is needed in case of movies released during the
    #last weeks of a year, in which case, it will have only
    #1 or 2 rows
    rows_in_first_chart = len(rows)

    if rows_in_first_chart>4:
        # the first table has at least 4 weekend data
        fourth_wknd_row = rows[4]
    else:
        # the first table has less than 4 weekend data
        if len(bochart)>1:
            rows=bochart[1].findAll('tr')
            fourth_wknd_row = rows[5-rows_in_first_chart]
        else:
            return('Not Found. URL:'+urlfull)

    #the box office value is present in the 8th column
    wknd_bo_coln = fourth_wknd_row.findAll('td')[7].string
    #strip the $ and , from the text e.g. from $2,333,443
    wknd_bo_coln = wknd_bo_coln.replace('$','')
    wknd_bo_coln = wknd_bo_coln.replace(',','')

    return(int(wknd_bo_coln))


In [62]:
get_cast_mv_details_from_mvie('114709')

kind: movie False
lang: ['English'] False
country: ['USA'] False


(True,
 {'actor1_name': 'Tom Hanks',
  'actor1_score': 36,
  'actor2_name': 'Tim Allen',
  'actor2_score': 6,
  'actor3_name': 'Don Rickles',
  'actor3_score': 61,
  'actor4_name': 'Jim Varney',
  'actor4_score': 33,
  'director': 'John Lasseter',
  'director_score': 10,
  'kind': 'movie',
  'plot': ['A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy\'s new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without the

In [87]:
#a small dataframe to test the functions
mv_sm_df=movies_df[0:6]

In [88]:
mv_sm_df['movie_details']=mv_sm_df.imdbId.apply(lambda x: get_cast_mv_details_from_mvie(x))

kind: movie False
lang: ['English'] False
country: ['USA'] False
kind: movie False
lang: ['English', 'French'] False
country: ['USA'] False
kind: movie False
lang: ['English'] False
country: ['USA'] False
kind: movie False
lang: ['English'] False
country: ['USA'] False
kind: movie False
lang: ['English'] False
country: ['USA'] False
kind: movie False
lang: ['English', 'Spanish'] False
country: ['USA'] False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [89]:
mv_sm_df.head()

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year,movie_details
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'..."
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'..."
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'..."
4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'..."
5,Father of the Bride Part II (1995),Comedy,113041,11862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'..."


In [90]:
mv_sm_df['movie_budget']=mv_sm_df.imdbId.apply(lambda x: get_movie_budget_imdb(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [91]:
mv_sm_df.head()

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year,movie_details,movie_budget
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",30000000
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",50000000
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",25000000
4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",16000000
5,Father of the Bride Part II (1995),Comedy,113041,11862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",30000000


In [92]:
mv_sm_df['movie_4th_wknd_bo']=mv_sm_df.title.apply(lambda x: get_movie_4thWkndBo_BoMojo(x))

http://www.boxofficemojo.com/movies/?page=weekend&id=ToyStory.htm
http://www.boxofficemojo.com/movies/?page=weekend&id=Jumanji.htm
http://www.boxofficemojo.com/movies/?page=weekend&id=GrumpierOldMen.htm
http://www.boxofficemojo.com/movies/?page=weekend&id=WaitingtoExhale.htm
http://www.boxofficemojo.com/movies/?page=weekend&id=FatheroftheBridePartII.htm
http://www.boxofficemojo.com/movies/?page=weekend&id=Heat.htm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [93]:
mv_sm_df.head()

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year,movie_details,movie_budget,movie_4th_wknd_bo
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",30000000,97458735
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",50000000,68400785
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",25000000,50891320
4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",16000000,52538430
5,Father of the Bride Part II (1995),Comedy,113041,11862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",30000000,Not Found. URL:http://www.boxofficemojo.com/mo...


In [101]:
#split the column containin the movie details
#and concat it to the original dataframe
mv_sm_df=pd.concat([mv_sm_df,mv_sm_df['movie_details'].apply(pd.Series)],axis=1)

In [103]:
mv_sm_df.head(3)

Unnamed: 0_level_0,title,genres,imdbId,tmdbId,year,movie_details,movie_budget,movie_4th_wknd_bo,actor1_name,actor1_score,...,actor4_name,actor4_score,director,director_score,found,kind,plot,release_date,studio,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",30000000,97458735,Tom Hanks,36,...,Jim Varney,33,John Lasseter,10,True,movie,[A little boy named Andy loves to be in his ro...,,Pixar Animation Studios,1995
2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",50000000,68400785,Robin Williams,52,...,Bradley Pierce,29,Joe Johnston,6,True,movie,[After being trapped in a jungle board game fo...,,TriStar Pictures,1995
3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,1995,"{'found': True, 'kind': 'movie', 'actor1_name'...",25000000,50891320,Walter Matthau,100,...,Ann-Margret,52,Howard Deutch,18,True,movie,[Things don't seem to change much in Wabasha C...,,Lancaster Gate,1995


In [104]:
mv_sm_df.columns

Index(['title', 'genres', 'imdbId', 'tmdbId', 'year', 'movie_details',
       'movie_budget', 'movie_4th_wknd_bo', 'actor1_name', 'actor1_score',
       'actor2_name', 'actor2_score', 'actor3_name', 'actor3_score',
       'actor4_name', 'actor4_score', 'director', 'director_score', 'found',
       'kind', 'plot', 'release_date', 'studio', 'year'],
      dtype='object')

In [106]:
mv_sm_df.iloc[0]

title                                                 Toy Story (1995)
genres                     Adventure|Animation|Children|Comedy|Fantasy
imdbId                                                          114709
tmdbId                                                             862
year                                                              1995
movie_details        {'found': True, 'kind': 'movie', 'actor1_name'...
movie_budget                                                  30000000
movie_4th_wknd_bo                                             97458735
actor1_name                                                  Tom Hanks
actor1_score                                                        36
actor2_name                                                  Tim Allen
actor2_score                                                         6
actor3_name                                                Don Rickles
actor3_score                                                        61
actor4