In [1]:
import pandas as pd   
import requests       
from bs4 import BeautifulSoup
import numpy as np 
import json
import re
import tqdm as tqdm

In [2]:
# Create a list for all pages
pages = list(range(1,7))

# Create lists for columns needed for the data frame
title = []
runtime = []
genre = []
imdb_rating = []
year = []
metascore = []
director_names = []
actors_names = []
gross = []
certifications = []
imdbID = []
number_wins = []
number_nominations = []

# Loop through all ages to get information
for page in pages:
    params = {
        'st_dt': '',
        'mode': 'detail',
        'page': page,
        'sort': 'list_order,asc'
    }

    web_page_url = 'https://www.imdb.com/list/ls062911411/?sort=release_date,desc&st_dt=&mode=detail&page=1'

    response = requests.get(web_page_url, params=params)


    # Create an instance of beautiful soup for film information
    film_soup = BeautifulSoup(response.content, 'html.parser')

    # Get the divs where the movie information is located
    film_info = film_soup.find_all('div', class_= 'lister-item-content')
    
    # Loop through film_info object to extract necessary information
    for item in film_info:
        
        # Get movie's titles
        title.append((item.a.string))

        # Get movies' runtime
        time_ = item.find('span', class_ = 'runtime')
        runtime.append(time_.string.replace("min", ""))
        
        # Get movies' IMDB id
        ID = item.find('h3', class_ = 'lister-item-header').a['href'].split('/')[2]
        imdbID.append(ID)
        
        # Call OMDBAPI to get additional information
        infos = requests.get('http://www.omdbapi.com/?i='+ ID + '&apikey=1362d863').text
        json_infos = json.loads(infos)
        
        # Get movies' number of wins and nominations
        awards_noms = json_infos['Awards'].split(".")

        # Create a regular expression to get the number of wins and nominations
        pattern = r'\d+'
        if('N/A' not in awards_noms and awards_noms):
            if(len(awards_noms) == 2):
                if ("wins" not in awards_noms[1] or "win" not in awards_noms[1]):
                    results = re.findall(pattern, awards_noms[1])
                    total_wins = 0
                    total_nominations = results[0]
                elif ("nominations" not in awards_noms[1] or "nomination" not in awards_noms[1]):
                    results = re.findall(pattern, awards_noms[1])
                    total_wins = results[0]
                    total_nominations = 0
                else:
                    results = re.findall(pattern, awards_noms[1])
                    total_wins = results[0]
                    total_nominations = results[1]
            else:
                if ("wins" not in awards_noms[0] or "win" not in awards_noms[0]):
                    results = re.findall(pattern, awards_noms[0])
                    total_wins = 0
                    total_nominations = results[0]
                elif ("nominations" not in awards_noms[0] or "nomination" not in awards_noms[0]):
                    results = re.findall(pattern, awards_noms[0])
                    total_wins = results[0]
                    total_nominations = 0
                else:
                    results = re.findall(pattern, awards_noms[0])
                    if(len(results) == 3):
                        results = results[1:3]
                        total_wins = results[0]
                        total_nominations = results[1]
                    total_wins = results[0]
                    total_nominations = results[1]
            number_wins.append(total_wins)
            number_nominations.append(total_nominations)
        else:
            number_wins.append('N/A')
            number_nominations.append('N/A')

        # Get movies' age certification
        cert= json_infos['Rated']
        certifications.append(cert)

        # Get movies' genre
        genre_= item.find('span', class_ = 'genre')
        genre.append(((genre_.string).replace('\n', '')).strip())

        # Get movies' rating
        rate = item.find('span',class_ = 'ipl-rating-star__rating')
        imdb_rating.append(float(rate.string))

        # Get movies' released year and metascore
        year_ = item.find('span', class_ = 'lister-item-year text-muted unbold').string.split()
        if len(year_) <= 1:
            year.append(int(year_[0][1:5]))
        else:
            year.append(int(year_[1][1:5]))

        if item.find('span', class_ = 'metascore favorable') == None:
            metascore.append(np.nan)
        else:
            metascore_ = item.find('span', class_ = 'metascore favorable').string.strip()
            metascore.append(int(metascore_))


    # Create an instance of beautiful soup for directors and actors information
    directors_actors_soup = BeautifulSoup(response.content, 'html.parser')
    
    # Retrieve all tags and links that have directors' and actors' information
    director_links = directors_actors_soup.find_all('p', {'class': 'text-muted text-small'})

    directors_info = []
    for links in director_links:
        directors_info.append(links.a)

    # Retreving director names. Ensuring the list is 100
    for name in directors_info:
        if name != None:
            director_names.append(name.string)

    # Retreive actors information
    actors_links = []
    for each_tag in directors_actors_soup.find_all('p', {'class': 'text-muted text-small'}):
        if each_tag.find_all('a') != []:
            actors_links.append(each_tag.findAll('a')[1:])


    for links in actors_links:
        each_name = []
        for each_link in links:
            each_name.append(each_link.string)
        actors_names.append(each_name)


    # Retrieve list for gross
    list_of_grossEarnings = []
    for i in directors_actors_soup.find_all('p', {'class': 'text-muted text-small'}):
        list_of_grossEarnings.append(i.findAll('span', {'name': 'nv' }))


    for lists in list_of_grossEarnings:
        if len(lists) > 1:
            gross.append(lists[1].string)
        elif len(lists) == 1:
            gross.append('N/A')


In [3]:
# Create a movie dataframe 
movie_Df = pd.DataFrame({'ID': imdbID, 'Movie_title': title,'Certification': certifications ,'Genre': genre, 'Duration': runtime, 'Year': year, 'IMDB Rating': imdb_rating, 'Meta Score': metascore,'Gross earnings': gross, 'Wins': number_wins, "Nominations": number_nominations}, index=range(1, len(title)+1))
movie_Df

Unnamed: 0,ID,Movie_title,Certification,Genre,Duration,Year,IMDB Rating,Meta Score,Gross earnings,Wins,Nominations
1,tt4633694,Người Nhện: Vũ Trụ Mới,PG,"Animation, Action, Adventure",117,2018,8.4,87.0,$190.24M,81,57
2,tt5083738,Sủng Ái,R,"Biography, Comedy, Drama",119,2018,7.5,91.0,$34.37M,180,345
3,tt6155172,Khu Phố Roma,R,Drama,135,2018,7.7,96.0,,251,229
4,tt4912910,Nhiệm Vụ Bất Khả Thi: Sụp Đổ,PG-13,"Action, Adventure, Thriller",147,2018,7.7,86.0,$220.16M,25,41
5,tt8075192,Kẻ Trộm Siêu Thị,R,"Crime, Drama, Thriller",121,2018,7.9,93.0,$3.31M,49,96
...,...,...,...,...,...,...,...,...,...,...,...
513,tt0015324,Sherlock Jr.,Passed,"Action, Comedy, Romance",45,1924,8.2,,$0.98M,2,0
514,tt0013442,Nosferatu,Not Rated,"Fantasy, Horror",94,1922,7.9,,,2,2
515,tt0012349,Đứa Trẻ,Passed,"Comedy, Drama, Family",68,1921,8.2,,$5.45M,2,0
516,tt0010323,Das Cabinet des Dr. Caligari,Not Rated,"Horror, Mystery, Thriller",76,1920,8.0,,,2,0


In [4]:
# Create a director dataframe
director_Df = pd.DataFrame({
    'imdbID': imdbID,
    'Directors': director_names
})

director_Df

Unnamed: 0,imdbID,Directors
0,tt4633694,Bob Persichetti
1,tt5083738,Yorgos Lanthimos
2,tt6155172,Alfonso Cuarón
3,tt4912910,Christopher McQuarrie
4,tt8075192,Hirokazu Koreeda
...,...,...
512,tt0015324,Buster Keaton
513,tt0013442,F.W. Murnau
514,tt0012349,Charles Chaplin
515,tt0010323,Robert Wiene


In [5]:
# Create a casts dataframe
casts_Df = pd.DataFrame({
    'imdb': imdbID,
    'Casts': actors_names
})
casts_Df = casts_Df.explode('Casts')
casts_Df

Unnamed: 0,imdb,Casts
0,tt4633694,Peter Ramsey
0,tt4633694,Rodney Rothman
0,tt4633694,Shameik Moore
0,tt4633694,Jake Johnson
0,tt4633694,Hailee Steinfeld
...,...,...
515,tt0010323,Lil Dagover
516,tt0006864,Lillian Gish
516,tt0006864,Robert Harron
516,tt0006864,Mae Marsh


In [6]:
# Export movie dataframe to csv file
movie_Df.to_csv("movies.csv", index = False)

In [7]:
# Export director dataframe to csv file
director_Df.to_csv("directors.csv", index = False)

In [8]:
# Export cast dataframe to csv file
casts_Df.to_csv("casts.csv", index = False)