In [9]:
# Importing all necessary libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

# Accumulator lists that parse_movies() appends the scraped values to.
# Each name is bound to its own, independent empty list.
Movie_Name, Year_list, Director_List = [], [], []
Rating, Genre = [], []
Top_5_cast, Image_links = [], []

# Build the Chrome options (headless, no sandbox, no /dev/shm usage) and
# start the WebDriver through the locally installed chromedriver binary.
chrome_options = Options()
for _chrome_flag in ("--headless", '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_chrome_flag)

service = Service(
    r'C:\Users\Rutik Retwade\Web Scrap jupyter notebook\chromedriver-win64\chromedriver.exe'
)
driver = webdriver.Chrome(options=chrome_options, service=service)

# Placeholder image URL that must never be stored as a movie's image link.
_SPACER_IMAGE_URL = "https://www.gadgets360.com/static/v1/images/spacer.png"
# Value stored for any field that is absent from a movie card.
_MISSING = "N/A"
# Label that precedes the director's name inside the "_mvdrc" list item.
_DIRECTOR_LABEL = "Director:"


def _node_text(card, tag_name, css_class):
    """Return the raw text of the first matching descendant of *card*, or None."""
    node = card.find(tag_name, class_=css_class)
    return node.text if node is not None else None


def _image_link(card):
    """Return the first usable image URL of *card* ("src", then "data-original"), or "N/A"."""
    img_tag = card.find('img')
    if img_tag is None:
        return _MISSING
    for attribute in ('src', 'data-original'):
        link = img_tag.get(attribute)
        if link and link != _SPACER_IMAGE_URL:
            return link
    return _MISSING


def parse_movies(soup):
    """Append one record per movie card found in *soup* to the module-level lists.

    For every ``div`` whose class attribute is ``"_mvbx _flx"`` exactly one value
    is appended to each of Movie_Name, Year_list, Director_List, Rating, Genre,
    Top_5_cast and Image_links, so index N of every list describes the same
    movie. A field that cannot be found is stored as "N/A".

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all``/``find`` API.

    Returns:
        None. The results are written to the module-level lists.
    """
    for card in soup.find_all('div', class_="_mvbx _flx"):
        # Movie name: anchor text inside the <h3>; fall back to the heading's
        # own text when there is no anchor instead of raising AttributeError.
        heading = card.h3
        if heading is None:
            name = _MISSING
        elif heading.a is not None:
            name = heading.a.text
        else:
            name = heading.text.strip()
        Movie_Name.append(name)

        # Release year
        year = _node_text(card, "div", "_flx")
        Year_list.append(year.strip() if year is not None else _MISSING)

        # Director: remove the label as a prefix. str.strip("Director: ") would
        # treat the argument as a character set and also eat leading/trailing
        # letters of the name itself (e.g. "David Dhawan" -> "avid Dhawan").
        director = _node_text(card, "li", "_mvdrc")
        if director is None:
            director = _MISSING
        else:
            director = director.strip()
            if director.startswith(_DIRECTOR_LABEL):
                director = director[len(_DIRECTOR_LABEL):].strip()
        Director_List.append(director)

        # Rating
        rating = _node_text(card, "span", "_revw")
        Rating.append(rating.strip() if rating is not None else _MISSING)

        # Genre, with newlines and commas removed
        genre = _node_text(card, "li", "_mvgenre")
        if genre is None:
            genre = _MISSING
        else:
            genre = genre.strip().replace("\n", "").replace(",", "")
        Genre.append(genre)

        # Top 5 cast members. The placeholder is appended as-is; joining the
        # string "N/A" character by character would produce "N, /, A".
        cast = _node_text(card, "li", "lclamp")
        if cast is None:
            Top_5_cast.append(_MISSING)
        else:
            Top_5_cast.append(", ".join(cast.split(", ")[:5]))

        # Exactly one image entry per movie keeps this column aligned with the
        # others (appending both src and data-original, or nothing, would not).
        Image_links.append(_image_link(card))

# Load the listing page, expand it via the "view more" button, parse the
# resulting HTML and export the collected columns to an Excel file.
from selenium.common.exceptions import TimeoutException

base_url = "https://www.gadgets360.com/entertainment/new-hindi-movies"

try:
    driver.get(base_url)

    # Wait for the initial content to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "_mvbx")))

    for _ in range(20):  # Adjust the range to load more pages if needed
        try:
            view_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "viewmore"))
            )
        except TimeoutException:
            # No clickable "view more" button showed up within the timeout:
            # stop paginating and keep what has been loaded so far instead of
            # aborting the whole run.
            break
        # Click through JavaScript so the click does not depend on the
        # element being reachable by a native mouse event.
        driver.execute_script("arguments[0].scrollIntoView(true);", view_more_button)
        driver.execute_script("arguments[0].click();", view_more_button)
        time.sleep(2)  # give the newly requested entries time to be added to the page

    # Snapshot the final loaded page content with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even when loading or waiting failed,
    # so no orphaned Chrome/chromedriver process is left behind.
    driver.quit()

# Parsing works on the HTML snapshot only, so the driver is no longer needed.
parse_movies(soup)

# Ensure all lists have the same length (DataFrame columns must be equally long)
min_length = min(len(Movie_Name), len(Year_list), len(Director_List), len(Rating), len(Genre), len(Top_5_cast), len(Image_links))

# Create a DataFrame with the extracted data
data = {
    'Movie_Name': Movie_Name[:min_length],
    'Release_Year': Year_list[:min_length],
    'Director': Director_List[:min_length],
    'Rating': Rating[:min_length],
    'Genre': Genre[:min_length],
    'Top_5_cast': Top_5_cast[:min_length],
    'Image_link': Image_links[:min_length],
}

df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
df.to_excel('movie_data_final.xlsx', index=False)
print("Movies data saved successfully")


Movies data saved successfully
