In [11]:
# Importing all the required libraries for web scrapping and file handling in python. 

import requests
from bs4 import BeautifulSoup
import pandas as pd 
import os 
from datetime import datetime 

In [12]:
# Initializing all the required lists such as title , rating , year , certification , genre,vote and time. 
title_list = []
rating_list = []
year_list = []
certification_list = []
genre_list = []
vote_list = []
time_list = []


In [13]:
# IMDB Scrapper function to scrape 100 pages of imdb using the required genre. 

def imdb_scraper(genre):
    for page in range(1, 101):  # Scrape 100 pages
        url = f"https://www.imdb.com/search/title/?genres={genre}&start={page * 50}&explore=title_type,genres&ref_=adv_prv"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        movie_containers = soup.find_all('div', class_='lister-item mode-advanced')

        for container in movie_containers:
            # Extract movie details
            title = container.h3.a.text.strip()
            title_list.append(title)

            rating_element = container.find('div', class_='ratings-imdb-rating')
            rating = rating_element.strong.text if rating_element else ''       
            rating_list.append(rating)

            year = container.h3.find('span', class_='lister-item-year').text.strip('()')
            year_list.append(year)

            certification = container.find('span', class_='certificate').text if container.find('span', class_='certificate') else ''
            certification_list.append(certification) 

            genres = container.find('span', class_='genre').text.strip()
            genre_list.append(genres)

            vote_element = container.find('span', attrs={'name': 'nv'})
            vote = vote_element['data-value'] if vote_element else ''
            vote_list.append(vote)

            time_element = container.find('span', class_='runtime')
            time = time_element.text.strip() if time_element else ''
            time_list.append(time)
            



In [14]:
genre = "action"  # Initialize the genre which you want to scrape. 

imdb_scraper(genre)  # Calling the scrapper function. 

In [15]:
# Creating a dictionary to store all the lists which are generated using scrapper. 

imdb_dict = {
        'Title' : title_list , 
        'Rating' : rating_list , 
        'Year' : year_list ,
        'Certification' : certification_list , 
        'Genre' : genre_list , 
        'Vote' : vote_list , 
        'Time' : time_list
        }

In [16]:
imdb_dict[:5]  # Check out if the dictionary loads properly or not. 

{'Title': ['Reacher',
  'Justified',
  'S.W.A.T.',
  'Retribution',
  'Top Gun: Maverick',
  'Kraven the Hunter',
  'One Piece',
  'Teenage Mutant Ninja Turtles: Mutant Mayhem',
  'Vikings',
  'Ant-Man and the Wasp: Quantumania',
  '9-1-1',
  'Sisu',
  'Shazam! Fury of the Gods',
  'The Wrath of Becky',
  'The Batman',
  'Everything Everywhere All at Once',
  'Vinland Saga',
  'The Dark Knight',
  'Avatar',
  'Star Trek: The Next Generation',
  'Citadel',
  'The Last Kingdom',
  'Inception',
  'Hypnotic',
  'Bullet Train',
  'Spider-Man: No Way Home',
  'Banshee',
  'Smallville',
  'Ghosted',
  'Chicago Fire',
  'Prison Break',
  'Bloodhounds',
  'Buffy the Vampire Slayer',
  'John Wick',
  'Star Trek: Picard',
  'American Gladiators',
  '65',
  'Operation Fortune: Ruse de Guerre',
  'Final Fantasy XVI',
  'Avengers: Endgame',
  'Star Trek: Discovery',
  'Andor',
  'Mission: Impossible - Fallout',
  'Heat',
  'The Lord of the Rings: The Rings of Power',
  'Arrow',
  'Gotham Knights',
 

In [17]:
imdb_df = pd.DataFrame.from_dict(imdb_dict)  # Creating a dataframe to save .csv file of imdb dictionary.
imdb_df.head()

Unnamed: 0,Title,Rating,Year,Certification,Genre,Vote,Time
0,Reacher,8.1,2022–,A,"Action, Crime, Drama",142435.0,49 min
1,Justified,8.6,2010–2015,,"Action, Crime, Drama",106772.0,44 min
2,S.W.A.T.,7.2,2017–2023,16,"Action, Adventure, Crime",28104.0,43 min
3,Retribution,,I) (2023,,"Action, Crime, Drama",,
4,Top Gun: Maverick,8.3,2022,UA,"Action, Drama",602751.0,130 min


In [19]:
# Naming the csv file and the future versions. {Naming convention} 

csv_file = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}_imdb.csv"  
imdb_df.to_csv(csv_file)  # Saving the dataframe into .csv 