In [5]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import clear_output
from dateutil import parser

In [6]:
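'''
LEGEND (variable-name prefixes):
r_ = raw HTML text of a requests response
s_ = BeautifulSoup object parsed from that HTML
'''

'\nLEGEND (variable-name prefixes):\nr_ = raw HTML text of a requests response\ns_ = BeautifulSoup object parsed from that HTML\n'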

In [7]:
# Scrapes the synopsis and the original title of a single movie
def scrap_synopsis(title_id, timeout=30):
    '''
    Scrape the synopsis and the original title of one movie.

    title_id: URL path of the movie. It is appended directly to the site root
        and the sub-page name is appended directly to it, so it is expected
        to look like '/title/ID/' (leading and trailing slash included).
    timeout: seconds to wait for each HTTP response before requests gives up.

    returns elements: synopsis, original_title

    Raises AttributeError when the plot page has no 'plot-synopsis-content'
    list (or the fallback title block has no <h1>), IndexError when the title
    page has neither an 'originalTitle' nor a 'title_wrapper' div, and any
    exception requests raises on network failure or timeout.
    '''
    # 1. Base link to the movie: constant site root + '/title/ID/'
    movie_url = 'https://www.imdb.com' + title_id

    # 2. The synopsis lives on the plot-summary sub-page
    plot_url = movie_url + 'plotsummary?ref_=tt_stry_pl#synopsis'
    r_synopsis = requests.get(plot_url, timeout=timeout).text
    s_synopsis = BeautifulSoup(r_synopsis, 'html.parser')
    synopsis = s_synopsis.find('ul', id='plot-synopsis-content').li.text

    # 3. The title page is fetched outside of any try block: a failed request
    #    must surface as the real network error, not as a NameError raised by
    #    a fallback branch that reads a soup object which was never created.
    r_title = requests.get(movie_url, timeout=timeout).text
    s_title = BeautifulSoup(r_title, 'html.parser')

    # 4. An 'originalTitle' div is used when present; otherwise fall back to
    #    the main heading of the page.
    original_divs = s_title.find_all('div', class_='originalTitle')
    if original_divs:
        # Drop the ' (original title)' suffix from the div text
        original_title = re.sub(r' \(original title\)', '', original_divs[0].text)
    else:
        wrappers = s_title.find_all('div', class_='title_wrapper')
        # Keep the heading text only up to the first non-breaking space
        original_title = re.search(r'[^\xa0]*', wrappers[0].h1.text).group()

    return synopsis, original_title

In [27]:
def scrap_country(title_id, timeout=30):
    '''
    Scrape the production countries of one movie.

    title_id: URL path of the movie, appended directly to the site root
        (expected form: '/title/ID/').
    timeout: seconds to wait for the HTTP response before requests gives up.

    returns list: countries if multiple or a one element list if it was one
    country production. The list is empty when the page has no
    'titleDetails' section or that section has fewer than two 'txt-block'
    divs.
    '''
    # 1. Link to the movie page: constant site root + '/title/ID/'
    movie_url = 'https://www.imdb.com' + title_id
    # 2. Download the page and parse its HTML
    r_country = requests.get(movie_url, timeout=timeout).text
    s_country = BeautifulSoup(r_country, 'html.parser')

    countries = []
    for details in s_country.find_all('div', id='titleDetails'):
        blocks = details.find_all('div', class_='txt-block')
        # NOTE(review): the second 'txt-block' is assumed to be the country
        # block of the details section - confirm against the page layout.
        if len(blocks) < 2:
            # Section too short to contain that block: skip it instead of
            # failing with an IndexError.
            continue
        # Every link text inside the block is collected as a country name
        countries.extend(a.text for a in blocks[1].find_all('a'))
    return countries

In [22]:
def scrap_release_date(title_id, country='USA', timeout=30):
    '''
    Scrape the release date of one movie in the given country.

    title_id: URL path of the movie, appended directly to the site root and
        followed directly by 'releaseinfo' (expected form: '/title/ID/').
    country: country name as shown in the release table; compared against
        the cell text with surrounding whitespace stripped.
    timeout: seconds to wait for the HTTP response before requests gives up.

    returns element: release_date - the value produced by dateutil's
    parser.parse for the matching row, or None when no row matches. When
    several rows match, the last one in the table is returned.
    '''
    release_date = None
    release_url = 'https://www.imdb.com' + title_id + 'releaseinfo'
    r_release = requests.get(release_url, timeout=timeout).text
    s_release = BeautifulSoup(r_release, 'html.parser')

    for tr in s_release.find_all('tr', class_='ipl-zebra-list__item release-date-item'):
        country_td = tr.find('td', class_='release-date-item__country-name')
        date_td = tr.find('td', class_='release-date-item__date')
        if country_td is None or date_td is None:
            # Row without a country or date cell: nothing to read from it
            continue
        if country_td.text.strip() != country:
            continue
        # Rows carrying an attributes cell are skipped; this makes sure that
        # it wasn't a premiere on a festival.
        if tr.find_all('td', class_='release-date-item__attributes'):
            continue
        release_date = parser.parse(date_td.text)
    return release_date

In [43]:
def basics(timeout=30):
    '''
    Scrape the basic fields of the top 250 movies from the search listing.

    timeout: seconds to wait for each HTTP response before requests gives up.

    returns lists: title_ids, titles, years, ratings, votes
        title_ids - href of every title link (str)
        titles    - text of every title link (str)
        years     - first four-digit number found in each year span (int)
        ratings   - first space-separated token of each rating tooltip (str)
        votes     - fourth token of the tooltip, thousands commas removed (int)

    The five lists are filled by three independent passes over each page, so
    they only line up row by row when every listed movie has exactly one
    title link, one year span and one rating element.

    Raises AttributeError when a year span contains no four-digit number and
    IndexError/ValueError when a rating tooltip does not have the expected
    token layout.
    '''
    title_ids = []
    titles = []
    years = []
    ratings = []
    votes = []

    # NOTE(review): the offsets sent as `start` are 0, 50, ..., 250. If the
    # site treats `start` as 1-based, one entry may be fetched twice -
    # confirm the row count of the result.
    for page_counter in range(0, 251, 50):  # Looping over next pages in top 250
        print(f'Page: {page_counter}')
        # 1. Download one listing page as text
        r_top250 = requests.get(
            f'https://www.imdb.com/search/title/?groups=top_250&view=simple&sort=year,desc&start={page_counter}',
            timeout=timeout,
        ).text
        # 2. Use soup to parse the html
        s_top250 = BeautifulSoup(r_top250, 'html.parser')

        # Scraping title ID and title
        for div in s_top250.find_all('div', class_='col-title'):
            for link in div.find_all('a'):
                title_ids.append(link.get('href'))
                titles.append(link.text)

        # Scraping year: raw-string pattern, first run of four digits
        for span in s_top250.find_all('span', class_='lister-item-year text-muted unbold'):
            year = re.search(r'\d{4}', span.text).group()
            years.append(int(year))

        # Scraping rating and votes from the tooltip of the rating element
        for div in s_top250.find_all('div', class_='col-imdb-rating'):
            for strong in div.find_all('strong'):
                parts = strong.get('title').split(' ')
                ratings.append(parts[0])
                # Converting comma separated string to int ('115,334' = 115334)
                votes.append(int(parts[3].replace(',', '')))
        clear_output()

    return title_ids, titles, years, ratings, votes

In [57]:
# Step 1: basic fields from the listing pages
#   title_ids, titles, years, ratings, votes
title_ids, titles, years, ratings, votes = basics()

# Step 2: per-movie details, one pass over title_ids per helper
#   scrap_release_date            -> release_date
#   scrap_country                 -> countries
#   scrap_synopsis                -> plots, original_titles

# Scraping release dates into a list
release_date = []
for index, id_ in enumerate(title_ids):
    print('Dates scrapping')
    print(f'IDs left: {len(title_ids)-index}')
    release_date.append(scrap_release_date(id_, country='USA'))
    clear_output()

# Scraping countries into a list
countries = []
for index, id_ in enumerate(title_ids):
    print('Countries scrapping')
    print(f'IDs left: {len(title_ids)-index}')
    countries.append(scrap_country(id_))
    clear_output()

# Scraping plots and original titles into a list
plots = []
original_titles = []
for index, id_ in enumerate(title_ids):
    print('Plots and original titles scrapping')
    print(f'IDs left: {len(title_ids)-index}')  # Printing counter and clearing output in the end
    plot, title = scrap_synopsis(id_)
    plots.append(plot)
    original_titles.append(title)
    clear_output()

# Step 3: build the dataframes
#   movies     - every scraped movie
#   movies2004 - movies with Year >= 2004
# 'Release Date' is appended as the last column so the scraped dates are no
# longer discarded while the existing column order stays unchanged.
movies = pd.DataFrame({
    'ID': title_ids,
    'Title': titles,
    'Original Title': original_titles,
    'Year': years,
    'Rating': ratings,
    'Votes': votes,
    'Country': countries,
    'Plot': plots,
    'Release Date': release_date,
})

# Replacing info about missing plot with None
movies['Plot'] = movies['Plot'].apply(lambda x: None if "looks like we don't have" in x else x)

movies2004 = movies[movies['Year'] >= 2004]

# Step 4: saving both dataframes to csv
print('Saving to .csv')
movies.to_csv('movies.csv', index=False)
movies2004.to_csv('movies2004.csv', index=False)
clear_output()