# Import libraries 

In [None]:
# Import required libraries for web scraping and data handling
import csv
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# SCRAPING STEP 1
# Collect all the movie links from the IMDb page

In [None]:
# Set up Chrome driver options
options = Options()
options.add_experimental_option('prefs', {'intl.accept_languages': 'en'})  # Ensure English language preference
# options.add_argument('--headless')  # Uncomment to enable headless mode (no browser window)
options.add_argument("--disable-search-engine-choice-screen")  # Disable search engine selection prompts

# IMDb URL for the movies sorted by number of votes
url = "https://www.imdb.com/search/title/?title_type=feature&count=100&sort=num_votes,desc"

# Launch the Chrome WebDriver
driver = webdriver.Chrome(options=options)
driver.get(url)

# Close the cookie consent pop-up if present. This step is optional: when the
# pop-up never becomes clickable the wait times out, and that must not abort
# the whole scrape.
try:
    cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div/div/div[2]/div/button[1]'))
    )
    cookies_button.click()
except TimeoutException:
    print("No cookie consent pop-up detected; continuing.")

# Function to scroll the page and click "Load More" buttons (if available)
def scroll_and_click_button(max_scrolls=200):
    """
    Scroll down the IMDb results page until the "Load More" button can be clicked.

    Uses the module-level ``driver``. Each attempt waits up to 2 seconds for the
    button to become clickable; on failure the page is scrolled down by 300px
    and the attempt is repeated.

    Args:
        max_scrolls: Maximum number of attempts before giving up. Bounds the
            loop so a missing button (or a changed page layout) cannot hang
            the script forever.

    Returns:
        True if the button was clicked, False if it was never clickable within
        ``max_scrolls`` attempts.
    """
    # Wait for the page's main content to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # XPath of the "Load More" button
    button_xpath = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button/span/span'

    for _ in range(max_scrolls):
        try:
            # Attempt to locate and click the button
            button = WebDriverWait(driver, 2).until(
                EC.element_to_be_clickable((By.XPATH, button_xpath))
            )
            button.click()
        except WebDriverException:
            # Covers the wait timing out as well as click failures (e.g. the
            # button being covered): scroll further down and retry.
            driver.execute_script("window.scrollBy(0, 300);")
            time.sleep(1)  # Pause briefly to prevent overloading the browser
        else:
            print("Button found and clicked.")
            return True

    print(f"'Load More' button not clickable after {max_scrolls} attempts; giving up.")
    return False

# Number of times to click "Load More". Currently a single click; raise this
# value to reveal more results before the links are extracted.
LOAD_MORE_CLICKS = 1
for _ in range(LOAD_MORE_CLICKS):
    scroll_and_click_button()

# Function to extract movie links from the currently loaded page
def extract_links():
    """
    Collect the href of every movie title link on the currently loaded page.

    Uses the module-level ``driver``. As a side effect, the current page HTML
    is written to 'page_source.html' so selector problems can be diagnosed.

    Returns:
        A list with the non-empty href values, in the order the elements were found.
    """
    # Dump the rendered HTML to disk for debugging
    page_html = driver.page_source
    with open('page_source.html', 'w', encoding='utf-8') as dump:
        dump.write(page_html)
    print("HTML of the page saved as 'page_source.html' for diagnostics.")

    # Each movie title is an <h3> sitting directly inside an <a>
    headings = driver.find_elements(By.XPATH, '//*[contains(@id, "__next")]//a/h3')
    if headings:
        print(f"Found {len(headings)} links.")
    else:
        print("No links found with the current selector.")

    # The href lives on the parent <a> of each <h3>; drop anchors without one
    hrefs = (heading.find_element(By.XPATH, '..').get_attribute('href') for heading in headings)
    return [href for href in hrefs if href]

# Gather the movie links from the loaded results page
all_links = extract_links()

# Persist the links to a text file, one URL per line
with open('imdb_links.txt', 'w', encoding='utf-8') as file:
    file.writelines(link + '\n' for link in all_links if link)

# Report how many links were collected
print(f"Total links extracted: {len(all_links)}")

# The link collection is done, so the browser can be closed
driver.quit()

# SCRAPING STEP 2
# Collect detailed movie information from each link

In [None]:
# SCRAPING STEP 2
# Collect detailed movie information from each link

# Function to get details about a movie
def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or 'N/A' when the tag was not found."""
    return tag.text.strip() if tag is not None else 'N/A'


def _join_unique_text(tags):
    """Join the stripped text of tags with ', ', dropping duplicates but keeping page order."""
    # dict.fromkeys de-duplicates like set() but preserves first-seen order,
    # so the resulting string is stable between runs.
    unique_names = dict.fromkeys(tag.text.strip() for tag in tags)
    return ', '.join(unique_names) if unique_names else 'N/A'


def _join_text_by_href(anchors, marker):
    """Join the text of anchors whose href contains marker (case-insensitive), or 'N/A' if none match."""
    matches = [
        anchor.text.strip()
        for anchor in anchors
        if marker in (anchor.get('href') or '').lower()  # tolerate anchors without href
    ]
    return ', '.join(matches) if matches else 'N/A'


def _metadata_row_value(soup, testid):
    """Return the value text of the <li> whose data-testid equals testid, or 'N/A' if absent."""
    row = soup.find('li', {'data-testid': testid})
    if row is None:
        return 'N/A'
    return _text_or_na(row.find('span', class_='ipc-metadata-list-item__list-content-item'))


def get_movie_details(driver, movie_url):
    """
    Load a movie page and extract its details from the rendered HTML.

    Args:
        driver: Selenium WebDriver used to load the page.
        movie_url: URL of the movie page to scrape.

    Returns:
        A dict with the keys Title, Rating, Directors, Writers, Actors,
        Production Company, Release Date, Genres, Parental Guide,
        Country of Origin, Languages, Runtime, Box Office, Budget and
        Plot Summary. Any field that cannot be found on the page is 'N/A',
        so one missing field no longer discards the whole record.
        Returns None only if an unexpected error occurs while parsing.
    """
    driver.get(movie_url)
    time.sleep(3)  # Wait for the page to load

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        # NOTE(review): the 'sc-...' class names below look build-generated and
        # may change between site releases — verify against the live page.

        # Links in the details section; shared by the country and language lookups
        detail_links = soup.find_all(
            'a',
            class_='ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link',
        )

        # Genre chips; the 'Back to top' chip uses the same class and is excluded
        all_genres = [
            chip_text
            for chip_text in (chip.get_text(strip=True) for chip in soup.find_all('span', class_='ipc-chip__text'))
            if chip_text != 'Back to top'
        ]

        # The runtime is the last <li> of the inline list under the title
        ul_tag = soup.find('ul', class_='ipc-inline-list ipc-inline-list--show-dividers sc-ec65ba05-2 joVhBE baseAlt')
        runtime_items = ul_tag.find_all('li') if ul_tag is not None else []
        duration_tag = runtime_items[-1] if runtime_items else None

        return {
            "Title": _text_or_na(soup.find('h1')),
            "Rating": _text_or_na(soup.find('span', class_='sc-d541859f-1 imUuxf')),
            "Directors": _join_unique_text(soup.find_all('a', href=lambda x: x and 'tt_ov_dr_' in x)),
            "Writers": _join_unique_text(soup.find_all('a', href=lambda x: x and 'tt_ov_wr_' in x)),
            # Actors are not de-duplicated, matching the original behavior
            "Actors": ', '.join(
                actor.text.strip() for actor in soup.find_all('a', class_="sc-cd7dc4b7-1 kVdWAO")
            ) or 'N/A',
            "Production Company": _join_unique_text(soup.find_all('a', href=lambda x: x and 'tt_dt_cmpy_' in x)),
            "Release Date": _text_or_na(soup.select_one('a.ipc-link[href*="releaseinfo"]')),
            "Genres": ', '.join(all_genres) if all_genres else 'N/A',
            "Parental Guide": _text_or_na(soup.find('a', href=lambda x: x and 'parentalguide' in x)),
            "Country of Origin": _join_text_by_href(detail_links, 'country_of_origin'),
            "Languages": _join_text_by_href(detail_links, 'primary_language'),
            "Runtime": _text_or_na(duration_tag),
            # Many titles have no box office / budget rows; these resolve to 'N/A'
            "Box Office": _metadata_row_value(soup, 'title-boxoffice-cumulativeworldwidegross'),
            "Budget": _metadata_row_value(soup, 'title-boxoffice-budget'),
            "Plot Summary": _text_or_na(soup.find('span', class_='sc-3ac15c8d-0 hRUoSB')),
        }

    except Exception as e:
        # Per-movie boundary: report the failure and let the caller skip this URL
        print(f"Error collecting data for '{movie_url}': {e}")
        return None

# Set up Chrome options
chrome_options = Options()
chrome_options.add_experimental_option('prefs', {'intl.accept_languages': 'en'})
chrome_options.add_argument("--disable-search-engine-choice-screen")
chrome_options.add_argument("--start-maximized")  # Maximize the window for convenience

# Set up the WebDriver
driver = webdriver.Chrome(options=chrome_options)

# List to store the scraped movie data
movies_data = []

# Loop through each movie link and scrape details.
# try/finally guarantees the browser is closed even if the loop is interrupted.
try:
    for movie_url in all_links:
        try:
            movie_details = get_movie_details(driver, movie_url)
        except WebDriverException as e:
            # A navigation failure on one URL should not throw away the
            # movies that were already scraped; report it and move on.
            print(f"Error loading '{movie_url}': {e}")
            continue
        if movie_details:
            movies_data.append(movie_details)
            print(f"Scraped: {movie_details['Title']}")  # Track progress
finally:
    # Close the Selenium WebDriver
    driver.quit()

# Save the movie data to a CSV file
csv_file = 'movies_data_1.csv'
csv_columns = ['Title', 'Rating','Directors','Release Date','Writers','Actors','Production Company','Plot Summary', 'Genres', 'Parental Guide', 'Country of Origin', 'Languages', 'Runtime', 'Box Office','Budget']

with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(movies_data)

print(f"Scraped data for {len(movies_data)} movies and saved to {csv_file}")

# SCRAPING STEP 3
# Extract and save all image links for movies to a CSV file

In [None]:
# SCRAPING STEP 3
# Scrape all image links for movies and save them to a CSV file

# Set up for Selenium and ChromeDriver
options = Options()
options.add_experimental_option('prefs', {'intl.accept_languages': 'en'})
# options.add_argument('--headless')  # Run in headless mode (no browser window)
options.add_argument("--disable-search-engine-choice-screen")
driver = webdriver.Chrome(options=options)

# XPaths used on every movie page
title_xpath = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/h1/span'
enlarge_xpath = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[1]/div[1]/div/a'
image_xpath = '/html/body/div[2]/main/div[2]/div[3]/div[4]/img'

# The cookie pop-up is only looked for on the first page: waiting up to 10s for
# it on every URL would add that delay to each movie once it is gone.
# NOTE(review): assumes the consent choice persists for the browser session — confirm.
cookies_checked = False

# try/finally guarantees the browser is closed even if the run is interrupted
try:
    # Open a CSV to write data
    with open('movie_images_1.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Title', 'Image Link']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for url in all_links:
            try:
                # Inside the try so a failed page load skips this URL instead
                # of aborting the whole run
                driver.get(url)

                # Close cookies (first page only)
                if not cookies_checked:
                    cookies_checked = True
                    try:
                        cookies_button = WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div/div/div[2]/div/button[1]'))
                        )
                        cookies_button.click()
                    except Exception as e:
                        print(f"Non è stato possibile chiudere i cookies per {url}. Errore: {e}")

                # Find title
                movie_title = driver.find_element(By.XPATH, title_xpath).text.strip()

                # Find the button that opens the enlarged image and click it
                driver.find_element(By.XPATH, enlarge_xpath).click()

                # Wait (up to 10s) until the enlarged image exists and has a
                # non-empty src, instead of sleeping for a fixed time.
                # until() retries while the element is not found and returns
                # the first truthy value, i.e. the src itself.
                img_url = WebDriverWait(driver, 10).until(
                    lambda d: d.find_element(By.XPATH, image_xpath).get_attribute("src")
                )

                # Write title and url to CSV
                writer.writerow({'Title': movie_title, 'Image Link': img_url})
                print(f"Titolo: {movie_title} - Link dell'immagine: {img_url}")

            except Exception as e:
                # Per-URL boundary: report the failure and continue with the next movie
                print(f"Errore durante l'elaborazione di {url}: {e}")
finally:
    # Close the driver
    driver.quit()