In [1]:
# Import necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Request headers: ask for English-language content and present a desktop-browser User-Agent
headers = {
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/99.0.4844.51 Safari/537.36'
    ),
}


# Site root, and the listing URL prefix that a page number gets appended to
base_url = "https://www.themoviedb.org"
Page_url = base_url + "/movie?page="

# Check that the site root and the listing endpoint respond successfully before scraping.
# A bare `requests.get(...)` expression in the middle of a cell discards its response, so
# nothing was actually verified. raise_for_status() turns a 4xx/5xx answer into an
# exception, and the timeout keeps a stalled connection from hanging the cell forever.
for status_url in ("https://www.themoviedb.org", "https://www.themoviedb.org/movie?page="):
    status_response = requests.get(status_url, headers=headers, timeout=30)
    status_response.raise_for_status()
    print(status_url, status_response.status_code)

# Build the listing URLs for pages 1 through 100 (range's upper bound is exclusive)
url_lst = [f"{Page_url}{page_number}" for page_number in range(1, 101)]
    
# Initialize a list to store all movie details (one dictionary per movie)
all_movies = []

# Seconds to wait for any single HTTP response before giving up;
# without a timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Loop through each listing-page URL
for url in url_lst:
    # Fetch the listing page and parse it with BeautifulSoup
    source_code = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    converted_data = BeautifulSoup(source_code, 'lxml')

    # Find all movie card containers on the page.
    # Kept under its own name so the per-movie dictionary below does not rebind
    # the collection that is being iterated.
    movie_cards = converted_data.find_all('div', class_='card style_1')

    # Loop through each movie card
    for box in movie_cards:
        # Extract the movie name
        movie_name = box.find('h2').text.strip()

        # Extract the release date
        release_date = box.find('p').text.strip()

        # Extract the user rating (read from the chart's data-percent attribute)
        ratings = box.find('div', class_='user_score_chart')['data-percent']

        # Build the absolute link to the movie's own page
        movie_link = base_url + box.find('a')['href']

        # Fetch and parse the individual movie page
        inner_response = requests.get(movie_link, headers=headers, timeout=REQUEST_TIMEOUT).text
        inner_soup = BeautifulSoup(inner_response, 'lxml')

        # Extract duration; fall back to "NA" when the page has no runtime element.
        # The lookup result and the condition now use the same name (the condition
        # previously referenced an undefined variable and raised NameError).
        duration_elem = inner_soup.find('span', class_='runtime')
        duration = duration_elem.text.strip() if duration_elem is not None else "NA"

        # Extract genres, stripping newlines and non-breaking spaces from the text
        genres_elem = inner_soup.find('span', class_='genres')
        if genres_elem is not None:
            genres = genres_elem.text.strip().replace("\n", "").replace("\xa0", "")
        else:
            genres = "NA"

        # Extract the name from the first profile entry; either the list item
        # or the link inside it may be absent, and both cases yield "NA".
        director_elem = inner_soup.find('li', class_='profile')
        director_link = director_elem.find('a') if director_elem is not None else None
        director_name = director_link.text.strip() if director_link is not None else "NA"

        # Store the details in a dictionary and append it to the list of all movies
        movie_record = {
            "Name": movie_name,
            "Release_Date": release_date,
            "Ratings": ratings,
            "Duration": duration,
            "Genres": genres,
            "Director": director_name
        }
        all_movies.append(movie_record)

In [2]:
# Convert the list of movie details to a DataFrame.
# Column names are listed explicitly so an empty scrape still yields the expected columns.
# Ratings are read from an HTML attribute and therefore arrive as strings; convert them to
# numbers so they sort and aggregate numerically (unparseable values become NaN).
df = pd.DataFrame(
    all_movies,
    columns=["Name", "Release_Date", "Ratings", "Duration", "Genres", "Director"],
).assign(Ratings=lambda frame: pd.to_numeric(frame["Ratings"], errors="coerce"))

In [3]:
# Show the DataFrame as the cell's output (a bare last expression is rendered by the notebook)
df

Unnamed: 0,Name,Release_Date,Ratings,Duration,Genres,Director
0,Inside Out 2,"Jun 14, 2024",77,1h 37m,"Animation,Family,Adventure,Comedy,Drama",Kelsey Mann
1,Despicable Me 4,"Jul 03, 2024",78,1h 35m,"Animation,Family,Comedy,Action",Ken Daurio
2,Furiosa: A Mad Max Saga,"May 22, 2024",77,2h 29m,"Action,Adventure,Science Fiction",George Miller
3,Kingdom of the Planet of the Apes,"May 08, 2024",69,2h 25m,"Science Fiction,Adventure,Action",Amanda Silver
4,Bad Boys: Ride or Die,"Jun 14, 2024",70,1h 55m,"Action,Crime,Thriller,Comedy",George Gallo
...,...,...,...,...,...,...
1995,Jim Henson Idea Man,"May 18, 2024",80,1h 48m,Documentary,Ron Howard
1996,Old School,"Feb 21, 2003",66,1h 28m,Comedy,Todd Phillips
1997,Boyz n the Hood,"Jul 12, 1991",76,1h 52m,"Crime,Drama",John Singleton
1998,Pinocchio,"Feb 23, 1940",71,1h 28m,"Animation,Family,Fantasy",Albert Hurter


In [4]:
# Export the data to an Excel workbook.
# index=False keeps the positional row index out of the sheet; otherwise it is written as a
# header-less first column that comes back as "Unnamed: 0" when the file is re-read.
df.to_excel('Moviesdetails.xlsx', index=False)