# Web Scraping of IMDB Movies

Purpose: Being the most ardent fan of Movies in general, with India eventually warming up to winning accolades for movies such as Bahubali or Puspa at an increasing level, it seemed like an interesting option to scrape data from.

# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np

import requests
from requests import get
from bs4 import BeautifulSoup

from time import sleep
from random import randint

# Scraping from web page

In [2]:
# Creating the lists we want to write into
titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

In [3]:
# Getting English translated titles from the movies
headers = {'Accept-Language': 'en-US, en;q=0.5'}

# Creating Object of Beautiful Soup

In [4]:
pages = np.arange(1, 51, 50)
pages

array([1])

In [5]:
# Storing each of the urls of 50 movies 
for page in pages:
    # Getting the contents from the each url
    page = requests.get('https://www.imdb.com/search/title/?groups=top_1000&start=' + str(page) + '&ref_=adv_nxt', headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Aiming the part of the html we want to get the information from
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    
    # Controling the loop’s rate by pausing the execution of the loop for a specified amount of time
    # Waiting time between requests for a number between 2-10 seconds
    sleep(randint(2,10))
    
    for container in movie_div:
        # Scraping the movie's name
        name = container.h3.a.text
        titles.append(name)
        
        # Scraping the movie's year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)
        
        # Scraping the movie's length
        runtime = container.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else '-'
        time.append(runtime)
        
        # Scraping the rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        
        # Scraping the metascore
        m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
        metascores.append(m_score)
        
        # Scraping votes and gross earnings
        nv = container.find_all('span', attrs={'name':'nv'})
        vote = nv[0].text
        votes.append(vote)
        grosses = nv[1].text if len(nv) > 1 else '-'
        us_gross.append(grosses)

In [7]:
movies = pd.DataFrame({'movie':titles,
                       'year':years,
                       'time_minute':time,
                       'imdb_rating':imdb_ratings,
                       'metascore':metascores,
                       'vote':votes,
                       'gross_earning':us_gross})

movies.head()

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,Top Gun: Maverick,(2022),130 min,8.4,78,401478,#78
1,Halloween,(1978),91 min,7.7,87,275013,$47.00M
2,Everything Everywhere All at Once,(2022),139 min,8.1,81,221766,#213
3,The Lord of the Rings: The Fellowship of the Ring,(2001),178 min,8.8,92,1854300,$315.54M
4,Harry Potter and the Sorcerer's Stone,(2001),152 min,7.6,65,770346,$317.58M


In [8]:
movies.dtypes

movie             object
year              object
time_minute       object
imdb_rating      float64
metascore         object
vote              object
gross_earning     object
dtype: object

# Cleaning the dataset

In [9]:
# Cleaning 'year' column
movies['year'] = movies['year'].str.extract('(\d+)').astype(int)
movies.head(3)

Unnamed: 0,movie,year,time_minute,imdb_rating,metascore,vote,gross_earning
0,Top Gun: Maverick,2022,130 min,8.4,78,401478,#78
1,Halloween,1978,91 min,7.7,87,275013,$47.00M
2,Everything Everywhere All at Once,2022,139 min,8.1,81,221766,#213


In [None]:
# Checking for missing values
final_df.isna().sum()

In [None]:
# To find any duplicat values
final_df.duplicated()

# Saving to CSV Format

In [None]:
dataset.to_csv("Olympic Games medal Dataset(from 1896 to 2022).csv",index=False)