In [1]:
# packages

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd 


In [2]:
# function to scrape the data 

def scrape_imdb_top_250(url, dest, count):
    url = url # define url for top 250 
    response = requests.get(url) # define response 
    top_soup = BeautifulSoup(response.text, 'html.parser') # parse the response 


    movies = top_soup.select('td.titleColumn') # get the title of a film 
    ratings = [b.attrs.get('data-value')  
            for b in top_soup.select('td.posterColumn span[name=ir]')] # get the ratings
    links = [a.attrs.get('href') for a in top_soup.select('td.titleColumn a')] # get the likns for futher sscraping

    # list for storing data 
    top_list = []
    # convert each movie title to string 
    for index in range(0, len(movies)):
        movie_string = movies[index].get_text() 
        movie = (' '.join(movie_string.split()).replace('.', ''))
        movie_title = movie[len(str(index))+1:-7]

        # capture the scaraped items     
        data = {'movie_title': movie_title,
               'rating': ratings[index],
               "link": links[index]}

        # add data to list 
        top_list.append(data)

    # convert to df 
    df1 = pd.DataFrame(top_list)

    # get the top 20 films 
    df1 = df1.head(count)

    # convert reviews to numeric 
    df1['rating'] = pd.to_numeric(df1['rating'])


    ### get the number of oscars 

    # creat list for scraped data 
    oscar_list = []

    for i in df1['link']:
        url = 'http://www.imdb.com' + i
        response = requests.get(url) # define response 
        oscar_soup = BeautifulSoup(response.text, 'html.parser') # parse the response
    
    # get the oscars text 
        oscar = oscar_soup.find_all('a', class_ = 
                  'ipc-metadata-list-item__label ipc-metadata-list-item__label--link')[2].text
    # get the alternative oscars text     
        oscar_alt = oscar_soup.find_all('a', class_ = 
                  'ipc-metadata-list-item__label ipc-metadata-list-item__label--link')[4].text
# NOTE that for some of the titles, the oscar count comes from a different position. To account for that, we scrape both positions and then join the results
        # capture the scaraped items  
        data = {'oscars': oscar,
               'oscars_alt': oscar_alt} 
        # add data to list  
        oscar_list.append(data)

    # convert to dataframe 
    df2 = pd.DataFrame(oscar_list)

    # formatting 
    df2['oscars'] = df2['oscars'].str.replace("[^0-9]+", "")
    df2['oscars_alt'] = df2['oscars_alt'].str.replace("[^0-9]+", "")
    df2['oscars'] = pd.to_numeric(df2['oscars'])
    df2['oscars_alt'] = pd.to_numeric(df2['oscars_alt'])
    df2['oscars'] = df2['oscars'].fillna(df2['oscars_alt'])
    df2['oscars'] = df2['oscars'].fillna(0) 
    df2 = df2.drop(['oscars_alt'], axis=1)



    ### get the review count

    reviews_list = []

    for i in df1['link']:
        url = 'http://www.imdb.com' + i + 'ratings'
        response = requests.get(url) # define response 
        review_soup = BeautifulSoup(response.text, 'html.parser') # parse the response
    # get the oscars text 
        rating = review_soup.find_all('div', class_ = 'smallcell')[0].text

    # create list data instance 
        data = {'ratings': rating}

    # append the list     
        reviews_list.append(data)

    # convert to dataframe 
    df3 = pd.DataFrame(reviews_list)

    # clean the reviews 
    df3['ratings'] = df3['ratings'].str.replace("[^0-9]+", "")

    # format as numeric 
    df3['ratings'] = pd.to_numeric(df3['ratings'])

    # merge data together 
    imdb_df = pd.concat([df1, df2, df3], axis=1)

    # rename columns 
    imdb_df = imdb_df.rename(columns={'movie_title': 'movie_title',
                                          'ratings': 'num_of_ratings',
                                          'rating': 'avg_rating', 
                                          'oscars': 'num_of_oscars', 
                                          'link': 'link'})

    # write data to csv
    imdb_df.to_csv(dest + 'imdb_scrape.csv')
    
    return imdb_df

In [5]:
# scrape the data 

# define the page to scrape 
url = 'http://www.imdb.com/chart/top'
# define the folder for saving the data 
destination = '/Users/steve_j/Documents/work/app/'
# define the number of titles to scarape
count = 20 

# call the function 
imdb_df = scrape_imdb_top_250(url, destination, count)

  df2['oscars'] = df2['oscars'].str.replace("[^0-9]+", "")
  df2['oscars_alt'] = df2['oscars_alt'].str.replace("[^0-9]+", "")
  df3['ratings'] = df3['ratings'].str.replace("[^0-9]+", "")
