# Web Scraping Project - Top 500 Movies IMDB

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')

# Helper functions:

In [2]:
"""
The read_html(url) return the contents of an URL hmtl file as a string.
"""
def read_html(url):
    response = requests.get(url)
    content = response.content
    return content

#===========================================================================

"""
If a string contains comma, use "" to enclose the string to write it to a comma-separated values (CSV) 
file with no issue with the comma.
"""
def process_str_with_comma(string):
    if ',' in string:
        new_string = '"' + string.strip() + '"'
    else:
        new_string = string
    return new_string


***

## read_m_from_url

For each movie, we will pull the following information from the webpage:
- movie_id
- title 
- rank
- year
- runtime
- rating
- votes 
- genres

In [65]:
def read_m_from_url(url, num_of_m=50):
    print(url)
    
    html_string = read_html(url)
    soup = BeautifulSoup(html_string, "html.parser")

    
    movie_list = soup.find('div', 'lister-list')
    
    list_movies = [] 
    
    count = 0
    

    divs=  movie_list.find_all('div','lister-item mode-advanced')
    for d in divs:
        dict_each_movie = {}
        
        # movie_id
        try:
            h = d.find('h3','lister-item-header') 
            movie_id= h.find('a').attrs['href']
            movie_id= movie_id[7:-1]
            
        except:
            movie_id=""
        finally:
            dict_each_movie["movie_id"] = movie_id
            print(movie_id)
            
        # title
        try:
            m = d.find('h3', 'lister-item-header')
            title = m.find('a')
            title = title.get_text()
        except:
            title = ""
        finally: 
            dict_each_movie["title"] = title
            print(title)

        # rank
        try:
            m = d.find('h3', 'lister-item-header')
            rank = m.find('span', 'lister-item-index unbold text-primary' )
            rank = rank.get_text()
        except:
            rank = ""
        finally: 
            dict_each_movie["rank"] = rank
            print(rank)    

        # year 
        try:
            m = d.find('h3', 'lister-item-header')
            year = m.find('span', 'lister-item-year text-muted unbold')
            year = year.get_text().replace('(', "").replace(')', "").replace('I',"").replace(" ", "")

        except:
            year = ""
        finally: 
            dict_each_movie["year"] = year
            print(year) 

        # runtime
        try:
            m = d.find('p', 'text-muted')
            rtime = m.find('span', 'runtime')
            rtime = rtime.get_text()

        except:
            rtime = ""
        finally: 
            dict_each_movie["runtime"] = rtime
            print(rtime) 

        # rating 
        try:
            m = d.find('div', 'ratings-bar')
            rating = m.find('strong')
            rating = rating.get_text()
        except:
            rating = ""
        finally: 
            dict_each_movie["rating"] = rating
            print(rating)   

        # votes
        try: 
            m = d.find('p', 'sort-num_votes-visible')
            votes = m.find_next('span').find_next('span')
            votes = votes.get_text()
        except:
            votes = ""
        finally: 
            dict_each_movie["votes"] = votes
            print(votes)   

        # genres
        try:
            m = d.find('p', 'text-muted')
            genres = m.find('span', 'genre')
            genres = genres.get_text().replace("\n", "")

        except:
            genres = ""
        finally: 
            dict_each_movie["genres"] = genres
            print(genres)
            
        list_movies.append(dict_each_movie) 

        count +=1
        print('===============================')
        print()
        if count == num_of_m:
            break 

    return list_movies


**Comments on pulling movie information**: For each variable scraped from the IMDb url, I first inspected the HTML code to find the relevant code block. Using the '.find' function, I filtered through the HTML code to retreive the relevant data. This filtered data was stored in the 'dict_each_movie' dictionary. The '.append' function added each variable to the 'list_movies' dataframe.

**Notes**: 

- Since data for 'year' was in the format '(20XX)', the data was modified to remove '( )' before it was stored in the dictionary. The 'year' entry for the film 'Joker' had an additional 'I' before it, which was also removed before being stored in the dictionary. <br><br>

- The 'find.next' function was used to pull observations for 'votes' since the number value was the second 'span' element under 'sort-num_votes-visible'. <br><br>

- After running the 'test_read_m_from_url' function, I observed that an unnecessary line break was insterted before 'genre' observations in the output. To correct this, I removed '\n' from 'genre' observations before they were added to the dictionary. 

### Test  read_m_from_url

In [66]:
def test_read_m_from_url():
    """ output:
    Movies list: [{'movie_id': 'tt7286456', 'title': 'Joker', 'year': '(2019)', 'rank': '1.', 'runtime': '122 min', 'rating': '8.4', 'votes': '1,074,230'}, {'movie_id': 'tt4154796', 'title': 'Avengers: Endgame', 'year': '(2019)', 'rank': '2.', 'runtime': '181 min', 'rating': '8.4', 'votes': '945,461'}, {'movie_id': 'tt4154756', 'title': 'Avengers: Infinity War', 'year': '(2018)', 'rank': '3.', 'runtime': '149 min', 'rating': '8.4', 'votes': '928,596'}, {'movie_id': 'tt1825683', 'title': 'Black Panther', 'year': '(2018)', 'rank': '4.', 'runtime': '134 min', 'rating': '7.3', 'votes': '678,964'}, {'movie_id': 'tt6751668', 'title': 'Parasite', 'year': '(2019)', 'rank': '5.', 'runtime': '132 min', 'rating': '8.6', 'votes': '666,646'}, {'movie_id': 'tt7131622', 'title': 'Once Upon a Time... In Hollywood', 'year': '(2019)', 'rank': '6.', 'runtime': '161 min', 'rating': '7.6', 'votes': '642,048'}, {'movie_id': 'tt8946378', 'title': 'Knives Out', 'year': '(2019)', 'rank': '7.', 'runtime': '130 min', 'rating': '7.9', 'votes': '534,299'}, {'movie_id': 'tt5463162', 'title': 'Deadpool 2', 'year': '(2018)', 'rank': '8.', 'runtime': '119 min', 'rating': '7.7', 'votes': '519,760'}, {'movie_id': 'tt8579674', 'title': '1917', 'year': '(2019)', 'rank': '9.', 'runtime': '119 min', 'rating': '8.3', 'votes': '495,380'}, {'movie_id': 'tt4154664', 'title': 'Captain Marvel', 'year': '(2019)', 'rank': '10.', 'runtime': '123 min', 'rating': '6.8', 'votes': '493,817'}, {'movie_id': 'tt1727824', 'title': 'Bohemian Rhapsody', 'year': '(2018)', 'rank': '11.', 'runtime': '134 min', 'rating': '7.9', 'votes': '489,064'}, {'movie_id': 'tt6644200', 'title': 'A Quiet Place', 'year': '(2018)', 'rank': '12.', 'runtime': '90 min', 'rating': '7.5', 'votes': '482,141'}, {'movie_id': 'tt4633694', 'title': 'Spider-Man: Into the Spider-Verse', 'year': '(2018)', 'rank': '13.', 'runtime': '117 min', 'rating': '8.4', 'votes': '430,153'}, {'movie_id': 'tt6966692', 'title': 'Green Book', 'year': '(2018)', 'rank': '14.', 'runtime': '130 min', 'rating': '8.2', 'votes': '428,762'}, {'movie_id': 'tt6723592', 'title': 'Tenet', 'year': '(2020)', 'rank': '15.', 'runtime': '150 min', 'rating': '7.4', 'votes': '426,125'}, {'movie_id': 'tt1477834', 'title': 'Aquaman', 'year': '(2018)', 'rank': '16.', 'runtime': '143 min', 'rating': '6.9', 'votes': '417,286'}, {'movie_id': 'tt1270797', 'title': 'Venom', 'year': '(2018)', 'rank': '17.', 'runtime': '112 min', 'rating': '6.7', 'votes': '410,565'}, {'movie_id': 'tt2527338', 'title': 'Star Wars: The Rise Of Skywalker', 'year': '(2019)', 'rank': '18.', 'runtime': '141 min', 'rating': '6.5', 'votes': '404,527'}, {'movie_id': 'tt1677720', 'title': 'Ready Player One', 'year': '(2018)', 'rank': '19.', 'runtime': '140 min', 'rating': '7.4', 'votes': '398,599'}, {'movie_id': 'tt6320628', 'title': 'Spider-Man: Far from Home', 'year': '(2019)', 'rank': '20.', 'runtime': '129 min', 'rating': '7.4', 'votes': '383,087'}, {'movie_id': 'tt1517451', 'title': 'A Star Is Born', 'year': '(2018)', 'rank': '21.', 'runtime': '136 min', 'rating': '7.6', 'votes': '356,745'}, {'movie_id': 'tt1302006', 'title': 'The Irishman', 'year': '(2019)', 'rank': '22.', 'runtime': '209 min', 'rating': '7.8', 'votes': '352,691'}, {'movie_id': 'tt5095030', 'title': 'Ant-Man and the Wasp', 'year': '(2018)', 'rank': '23.', 'runtime': '118 min', 'rating': '7.0', 'votes': '346,822'}, {'movie_id': 'tt2584384', 'title': 'Jojo Rabbit', 'year': '(2019)', 'rank': '24.', 'runtime': '108 min', 'rating': '7.9', 'votes': '340,733'}, {'movie_id': 'tt1950186', 'title': 'Ford v Ferrari', 'year': '(2019)', 'rank': '25.', 'runtime': '152 min', 'rating': '8.1', 'votes': '337,490'}, {'movie_id': 'tt3778644', 'title': 'Solo: A Star Wars Story', 'year': '(2018)', 'rank': '26.', 'runtime': '135 min', 'rating': '6.9', 'votes': '313,683'}, {'movie_id': 'tt4912910', 'title': 'Mission: Impossible - Fallout', 'year': '(2018)', 'rank': '27.', 'runtime': '147 min', 'rating': '7.7', 'votes': '308,761'}, {'movie_id': 'tt6146586', 'title': 'John Wick: Chapter 3 - Parabellum', 'year': '(2019)', 'rank': '28.', 'runtime': '130 min', 'rating': '7.4', 'votes': '306,871'}, {'movie_id': 'tt2737304', 'title': 'Bird Box', 'year': '(2018)', 'rank': '29.', 'runtime': '124 min', 'rating': '6.6', 'votes': '305,135'}, {'movie_id': 'tt2798920', 'title': 'Annihilation', 'year': '(I) (2018)', 'rank': '30.', 'runtime': '115 min', 'rating': '6.8', 'votes': '298,808'}, {'movie_id': 'tt0448115', 'title': 'Shazam!', 'year': '(2019)', 'rank': '31.', 'runtime': '132 min', 'rating': '7.0', 'votes': '295,848'}, {'movie_id': 'tt4881806', 'title': 'Jurassic World: Fallen Kingdom', 'year': '(2018)', 'rank': '32.', 'runtime': '128 min', 'rating': '6.2', 'votes': '283,862'}, {'movie_id': 'tt8367814', 'title': 'The Gentlemen', 'year': '(2019)', 'rank': '33.', 'runtime': '113 min', 'rating': '7.8', 'votes': '282,900'}, {'movie_id': 'tt2948372', 'title': 'Soul', 'year': '(2020)', 'rank': '34.', 'runtime': '100 min', 'rating': '8.1', 'votes': '280,267'}, {'movie_id': 'tt7653254', 'title': 'Marriage Story', 'year': '(2019)', 'rank': '35.', 'runtime': '137 min', 'rating': '7.9', 'votes': '274,033'}, {'movie_id': 'tt7784604', 'title': 'Hereditary', 'year': '(2018)', 'rank': '36.', 'runtime': '127 min', 'rating': '7.3', 'votes': '270,157'}, {'movie_id': 'tt3606756', 'title': 'Incredibles 2', 'year': '(2018)', 'rank': '37.', 'runtime': '118 min', 'rating': '7.6', 'votes': '269,810'}, {'movie_id': 'tt6857112', 'title': 'Us', 'year': '(II) (2019)', 'rank': '38.', 'runtime': '116 min', 'rating': '6.8', 'votes': '257,673'}, {'movie_id': 'tt8772262', 'title': 'Midsommar', 'year': '(2019)', 'rank': '39.', 'runtime': '148 min', 'rating': '7.1', 'votes': '255,303'}, {'movie_id': 'tt0437086', 'title': 'Alita: Battle Angel', 'year': '(2019)', 'rank': '40.', 'runtime': '122 min', 'rating': '7.3', 'votes': '247,522'}, {'movie_id': 'tt5727208', 'title': 'Uncut Gems', 'year': '(2019)', 'rank': '41.', 'runtime': '135 min', 'rating': '7.4', 'votes': '246,741'}, {'movie_id': 'tt6139732', 'title': 'Aladdin', 'year': '(2019)', 'rank': '42.', 'runtime': '128 min', 'rating': '6.9', 'votes': '245,331'}, {'movie_id': 'tt7349662', 'title': 'BlacKkKlansman', 'year': '(2018)', 'rank': '43.', 'runtime': '135 min', 'rating': '7.5', 'votes': '242,325'}, {'movie_id': 'tt4123430', 'title': 'Fantastic Beasts: The Crimes of Grindelwald', 'year': '(2018)', 'rank': '44.', 'runtime': '134 min', 'rating': '6.5', 'votes': '239,448'}, {'movie_id': 'tt7126948', 'title': 'Wonder Woman 1984', 'year': '(2020)', 'rank': '45.', 'runtime': '151 min', 'rating': '5.4', 'votes': '232,962'}, {'movie_id': 'tt7349950', 'title': 'It Chapter Two', 'year': '(2019)', 'rank': '46.', 'runtime': '169 min', 'rating': '6.5', 'votes': '230,874'}, {'movie_id': 'tt6105098', 'title': 'The Lion King', 'year': '(2019)', 'rank': '47.', 'runtime': '118 min', 'rating': '6.8', 'votes': '226,997'}, {'movie_id': 'tt6823368', 'title': 'Glass', 'year': '(2019)', 'rank': '48.', 'runtime': '129 min', 'rating': '6.6', 'votes': '224,677'}, {'movie_id': 'tt1979376', 'title': 'Toy Story 4', 'year': '(2019)', 'rank': '49.', 'runtime': '100 min', 'rating': '7.7', 'votes': '223,650'}, {'movie_id': 'tt2704998', 'title': 'Game Night', 'year': '(I) (2018)', 'rank': '50.', 'runtime': '100 min', 'rating': '6.9', 'votes': '220,159'}]    
    """
    url = "http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=2018,2020"
    print ("Movies list: ", read_m_from_url(url))

In [67]:
 test_read_m_from_url()

http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=2018,2020
tt7286456
Joker
1.
2019
122 min
8.4
1,271,181
Crime, Drama, Thriller            

tt4154796
Avengers: Endgame
2.
2019
181 min
8.4
1,121,459
Action, Adventure, Drama            

tt4154756
Avengers: Infinity War
3.
2018
149 min
8.4
1,073,326
Action, Adventure, Sci-Fi            

tt6751668
Parasite
4.
2019
132 min
8.5
794,414
Drama, Thriller            

tt1825683
Black Panther
5.
2018
134 min
7.3
767,098
Action, Adventure, Sci-Fi            

tt7131622
Once Upon a Time in Hollywood
6.
2019
161 min
7.6
731,841
Comedy, Drama            

tt8946378
Knives Out
7.
2019
130 min
7.9
625,540
Comedy, Crime, Drama            

tt8579674
1917
8.
2019
119 min
8.2
585,340
Action, Drama, War            

tt5463162
Deadpool 2
9.
2018
119 min
7.7
581,115
Action, Adventure, Comedy            

tt4154664
Captain Marvel
10.
2019
123 min
6.8
559,308
Action, Adventure, Sci-Fi            

tt1727824
Bohemian 

We can see that 50 entries from the first page of the IMDb list were successfully scraped using the 'test_read_from_url' function. 

##  read_m_by_voting

In [27]:
m_per_page = 50 # by default, imdb return 50 movies page url.
def read_m_by_voting(first_year, last_year, top_number):
    
    current_index = 1  # initialize current_index. In the first iteration, we need to have start = 1.
    
    final_list = []  # initialize the return value. This method returns a list. Each item in the list is a dictionary. 
                     # Each dictionary includes information regarding a movie.

    for i in range(int(math.ceil(top_number/50.0))):
        url= 'http://www.imdb.com/search/title/?at=0&sort=num_votes,desc&start='+str(current_index)+'&title_type=feature&year='+str(first_year)+','+ str(last_year)

        if (i+1) == len(range( int(math.ceil(top_number/50.0)))):
            lis = read_m_from_url(url, top_number - current_index + 1)
        else:
            lis = read_m_from_url(url, m_per_page)
        final_list += lis
        current_index +=50

    return final_list

### Test read_m_by_voting

In [28]:
def test_read_m_by_voting():
    """output:
    [{'movie_id': 'tt7286456', 'title': 'Joker', 'year': '(2019)', 'rank': '1.', 'genres': 'Crime, Drama, Thriller', 'runtime': '122 min', 'rating': '8.4', 'votes': '"1,074,179"'}, {'movie_id': 'tt4154796', 'title': 'Avengers: Endgame', 'year': '(2019)', 'rank': '2.', 'genres': 'Action, Adventure, Drama', 'runtime': '181 min', 'rating': '8.4', 'votes': '"945,422"'}]
    """
    print (read_m_by_voting(2018,2020,3))  # This will print a list of top three movies.

In [29]:
test_read_m_by_voting()

http://www.imdb.com/search/title/?at=0&sort=num_votes,desc&start=1&title_type=feature&year=2018,2020
tt7286456
Joker
1.
2019
122 min
8.4
1,271,181
Crime, Drama, Thriller            

tt4154796
Avengers: Endgame
2.
2019
181 min
8.4
1,121,459
Action, Adventure, Drama            

tt4154756
Avengers: Infinity War
3.
2018
149 min
8.4
1,073,326
Action, Adventure, Sci-Fi            

[{'movie_id': 'tt7286456', 'title': 'Joker', 'rank': '1.', 'year': '2019', 'runtime': '122 min', 'rating': '8.4', 'votes': '1,271,181', 'genres': 'Crime, Drama, Thriller            '}, {'movie_id': 'tt4154796', 'title': 'Avengers: Endgame', 'rank': '2.', 'year': '2019', 'runtime': '181 min', 'rating': '8.4', 'votes': '1,121,459', 'genres': 'Action, Adventure, Drama            '}, {'movie_id': 'tt4154756', 'title': 'Avengers: Infinity War', 'rank': '3.', 'year': '2018', 'runtime': '149 min', 'rating': '8.4', 'votes': '1,073,326', 'genres': 'Action, Adventure, Sci-Fi            '}]


# write_movies_csv

In [35]:
import csv
def write_movies_csv(final_list, filename):
    keys = final_list[0].keys()
    with open(filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(final_list)


In [36]:
# The output of the test_write_movies_csv method is the "IMDb_TopVoted.csv" file.
def test_write_movies_csv(): 
    li = read_m_by_voting(2018, 2020, 500) # To read the top voted 500 movies between 2018 and 2020 from imdb.
    print(li)
    print("================================================================")
    write_movies_csv(li,"IMDb_TopVoted.csv") 


In [37]:
test_write_movies_csv()

http://www.imdb.com/search/title/?at=0&sort=num_votes,desc&start=1&title_type=feature&year=2018,2020
tt7286456
Joker
1.
2019
122 min
8.4
1,271,181
Crime, Drama, Thriller            

tt4154796
Avengers: Endgame
2.
2019
181 min
8.4
1,121,459
Action, Adventure, Drama            

tt4154756
Avengers: Infinity War
3.
2018
149 min
8.4
1,073,326
Action, Adventure, Sci-Fi            

tt6751668
Parasite
4.
2019
132 min
8.5
794,414
Drama, Thriller            

tt1825683
Black Panther
5.
2018
134 min
7.3
767,098
Action, Adventure, Sci-Fi            

tt7131622
Once Upon a Time in Hollywood
6.
2019
161 min
7.6
731,841
Comedy, Drama            

tt8946378
Knives Out
7.
2019
130 min
7.9
625,540
Comedy, Crime, Drama            

tt8579674
1917
8.
2019
119 min
8.2
585,340
Action, Drama, War            

tt5463162
Deadpool 2
9.
2018
119 min
7.7
581,115
Action, Adventure, Comedy            

tt4154664
Captain Marvel
10.
2019
123 min
6.8
559,308
Action, Adventure, Sci-Fi            

tt1727824
Bohemian

Based on the the parameters provided, (2018, 2020, 500), we can see that the top 500 movies were successfully scraped from the IMDb ranking list.

# Importing the given dataset "Movies.csv" to Pandas DataFrame called df1

In [73]:
df1 = pd.read_csv(r'C:\Users\patki\Movies.csv')

# Import the scraped data from the IMDb_TopVoted.csv file to Pandas DataFrame called df2

In [38]:
df2 = pd.read_csv(r'C:\Users\patki\IMDb_TopVoted.csv', encoding = "ISO-8859-1")

# Data cleansing and transformation for df2.

### Data Exploration:

In [46]:
# Checking the data types for each column in df2:
df2.dtypes

movie_id     object
title        object
rank        float64
year         object
runtime      object
rating      float64
votes        object
genres       object
dtype: object

In [47]:
# Checking df2 for null values:
df2.isnull().sum()

movie_id    0
title       0
rank        0
year        0
runtime     0
rating      0
votes       0
genres      0
dtype: int64

We can see that there are no null values in any of the columns in df2.

In [54]:
# Using '.describe()' to count the unique values in each column. This is to ensure that all the observations in 'movie_id', 'title'  'rank' are unique, and that the 'year' column only has 3 distinct values (2018, 2019, or 2020).
# For columns with 'object' data type:
df2.describe(include=object)

Unnamed: 0,movie_id,title,year,runtime,votes,genres
count,500,500,500,500,500,500
unique,500,500,4,86,498,138
top,tt7286456,Joker,2018,90 min,106127,"Action, Adventure, Sci-Fi"
freq,1,1,202,16,2,18


In [59]:
# Counting unique values in the 'rank' column:
df2['rank'].unique().size

500

We can see that 'movie_id', 'title', and 'rank' columns correctly have 500 unique values. However, we also see that the 'year' column has **4** unique values instead of 3. I explore the 'year' column to investigate further:

In [61]:
df2.year.unique()

array(['2019', '2018', '2020', 'V2020'], dtype=object)

The 'year' column has an incorrect value of 'V2020' which will have to be replaced. 

### Data cleansing and transformation:

In order to convert 'year', 'runtime' and 'votes' into integer data type, we will first have to clean the data in order to:
- Replace 'V2020' in the 'year' column with '2020'.
- Remove the 'min' characters from each observation in the 'runtime' column. 
- Remove the commas from each observation in the 'votes' column.

In [71]:
# Removing non-numeric characters from 'year' and 'runtime' columns
df2['year'] = df2['year'].replace('V2020', '2020')
df2.runtime = pd.to_numeric(df2.runtime.str.replace("min", ""))

# Removing commas from 'votes' columns

df2.votes = df2.votes.str.replace(",", "")

# rank, year, runtime, and votes should have a numeric integer data type.

cols = ['rank', 'year', 'runtime', 'votes']
df2[cols]=df2[cols].astype(int)

df2.dtypes

movie_id     object
title        object
rank          int32
year          int32
runtime       int32
rating      float64
votes         int32
genres       object
dtype: object

# 	Enrich the given dataset (df1) by merging it to the scraped data (df2).

In [74]:
# Merge the two dataframes to one dataframe called df.
df = pd.merge(df1, df2)

# Rearrange the dataset fields to be listed in the following order: 
 movie_id, rank, votes, title, originalTitle, year, rating, titleType, isAdult, runtime,  genres

In [75]:
# Rearrange the dataset fields.

df = df[['movie_id','rank', 'votes', 'title', 'originalTitle', 'year', 'rating', 'titleType', 'isAdult', 'runtime', 'genres']]


#ordering the dataset by 'rank'

df = df.sort_values(by=['rank'])

df

Unnamed: 0,movie_id,rank,votes,title,originalTitle,year,rating,titleType,isAdult,runtime,genres
0,tt7286456,1,1271181,Joker,Joker,2019,8.4,movie,0,122,"Crime, Drama, Thriller"
1,tt4154796,2,1121459,Avengers: Endgame,Avengers: Endgame,2019,8.4,movie,0,181,"Action, Adventure, Drama"
2,tt4154756,3,1073326,Avengers: Infinity War,Avengers: Infinity War,2018,8.4,movie,0,149,"Action, Adventure, Sci-Fi"
3,tt6751668,4,794414,Parasite,Gisaengchung,2019,8.5,movie,0,132,"Drama, Thriller"
4,tt1825683,5,767098,Black Panther,Black Panther,2018,7.3,movie,0,134,"Action, Adventure, Sci-Fi"
...,...,...,...,...,...,...,...,...,...,...,...
483,tt6116856,495,26553,The Night Comes for Us,The Night Comes for Us,2018,6.9,movie,0,121,"Action, Thriller"
485,tt6348138,496,26508,Missing Link,Missing Link,2019,6.7,movie,0,93,"Animation, Adventure, Comedy"
491,tt2011311,497,26458,The Outsider,The Outsider,2018,6.2,movie,0,120,"Action, Crime, Drama"
484,tt8851668,498,26372,The Lovebirds,The Lovebirds,2020,6.1,movie,0,86,"Action, Comedy, Crime"


# Exploring enriched dataset for missing values:

In [77]:
df.isnull().sum()

movie_id         0
rank             0
votes            0
title            0
originalTitle    0
year             0
rating           0
titleType        0
isAdult          0
runtime          0
genres           0
dtype: int64

The merged dataset 'df' has **496** rows and **11** columns. **4 observations** were removed from the dataset due to a mismatch in the movies between the two original datasets. Apart from this, there are no missing values in the merged dataset. 

# Export the enriched dataset to a CSV file:

In [262]:
# Use the following naming convention: 
#  Project_3_PartA_Group#.csv

df.to_csv('Project_3_PartA_Patki_Soham.csv')
