In [53]:
# imdb_scraper.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

def scrape_imdb_2024_movies():
    """Scrape IMDB 2024 movies with storylines"""
    
    # Setup Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    driver.maximize_window()
    
    all_movies = []
    max_movies = 250
    
    try:
        print("Starting to scrape 250 movies from IMDb 2024...")
        
        # Navigate to IMDB page with 250 results per page
        url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&sort=num_votes,desc&count=250"
        print(f"\nURL: {url}")
        driver.get(url)
        time.sleep(5)
        
        # Find all movie cards
        movie_cards = driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")
        print(f"Found {len(movie_cards)} movies on page\n")
        
        if len(movie_cards) == 0:
            print("No movies found!")
            return pd.DataFrame()
        
        # Go through each movie card
        for idx, card in enumerate(movie_cards[:250], 1):
            try:
                # Get movie name
                name_element = card.find_element(By.CLASS_NAME, "ipc-title__text")
                movie_name = name_element.text
                
                # Remove numbering
                if '. ' in movie_name:
                    movie_name = movie_name.split('. ', 1)[1]
                
                # Get storyline - try multiple selectors
                storyline = "No storyline available"
                
                # Try method 1: data-testid
                try:
                    storyline_element = card.find_element(By.CSS_SELECTOR, 'span[data-testid="plot"]')
                    storyline = storyline_element.text
                except:
                    pass
                
                # Try method 2: class name
                if storyline == "No storyline available":
                    try:
                        storyline_element = card.find_element(By.CLASS_NAME, "ipc-html-content-inner-div")
                        storyline = storyline_element.text
                    except:
                        pass
                
                # Try method 3: by finding div with plot text
                if storyline == "No storyline available":
                    try:
                        all_text_divs = card.find_elements(By.TAG_NAME, "div")
                        for div in all_text_divs:
                            text = div.text
                            if len(text) > 50 and not text.startswith(movie_name):
                                storyline = text
                                break
                    except:
                        pass
                
                # Add to list
                all_movies.append({
                    'Movie Name': movie_name,
                    'Storyline': storyline
                })
                
                status = "✓" if storyline != "No storyline available" else "○"
                print(f"  {status} [{idx}] {movie_name}")
                
            except Exception as e:
                print(f"  ✗ Error on movie {idx}: {str(e)}")
                continue
        
        # Remove duplicates if any
        df = pd.DataFrame(all_movies)
        df = df.drop_duplicates(subset=['Movie Name'], keep='first')
        
        # Save to CSV
        df.to_csv('imdb_2024_movies.csv', index=False, encoding='utf-8')
        
        print(f"\n{'='*60}")
        print(f"✓ Successfully scraped {len(df)} unique movies!")
        print(f"✓ Saved to 'imdb_2024_movies.csv'")
        print(f"{'='*60}")
        
        return df
        
    except Exception as e:
        print(f"\n✗ Error during scraping: {str(e)}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()
    
    finally:
        driver.quit()

if __name__ == "__main__":
    df = scrape_imdb_2024_movies()
    
    if len(df) > 0:
        print(f"\nDataset shape: {df.shape}")
        print(f"\nFirst 5 movies:")
        print(df.head())
        print(f"\nMovies with storylines: {df[df['Storyline'] != 'No storyline available'].shape[0]}")
        print(f"Movies without storylines: {df[df['Storyline'] == 'No storyline available'].shape[0]}")
    else:
        print("\n✗ Scraping failed. No data collected.")

Starting to scrape 250 movies from IMDb 2024...

URL: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&sort=num_votes,desc&count=250
Found 250 movies on page

  ✓ [1] Dune: Part Two
  ✓ [2] Deadpool & Wolverine
  ✓ [3] The Substance
  ✓ [4] Furiosa: A Mad Max Saga
  ✓ [5] Gladiator II
  ✓ [6] Alien: Romulus
  ✓ [7] Civil War
  ✓ [8] Anora
  ✓ [9] Nosferatu
  ✓ [10] The Fall Guy
  ✓ [11] Conclave
  ✓ [12] Inside Out 2
  ✓ [13] Wicked
  ✓ [14] Longlegs
  ✓ [15] Heretic
  ✓ [16] The Wild Robot
  ✓ [17] Twisters
  ✓ [18] Carry-On
  ✓ [19] Road House
  ✓ [20] Joker: Folie à Deux
  ✓ [21] The Beekeeper
  ✓ [22] Challengers
  ✓ [23] A Quiet Place: Day One
  ✓ [24] Beetlejuice Beetlejuice
  ✓ [25] Kingdom of the Planet of the Apes
  ✓ [26] Trap
  ✓ [27] Red One
  ✓ [28] The Ministry of Ungentlemanly Warfare
  ✓ [29] Venom: The Last Dance
  ✓ [30] I'm Still Here
  ✓ [31] A Real Pain
  ✓ [32] Smile 2
  ✓ [33] Godzilla x Kong: The New Empire
  ✓ [34] The Br

In [55]:
import os
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "imdb_2024_movies.csv")
df.to_csv(desktop_path, index=False)
print(f"File saved to: {desktop_path}")

File saved to: C:\Users\ACER\Desktop\imdb_2024_movies.csv


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
import pandas as pd
from nltk.corpus import stopwords
import re
movie=pd.read_csv("C:\\Users\\ACER\\Desktop\\imdb_2024_movies.csv")
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text=str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split() 
    filtered_words = [word for word in words if word not in stop_words] # Tokenize (split into words) and filter out stop words
    return ' '.join(filtered_words)# Rejoin the filtered words into a single string
movie['clean_storyline'] = movie['Storyline'].apply(remove_stopwords)  # Apply to storyline


In [11]:
movie.to_csv('cleaned_imdb_2024.csv', index=False)
print(movie['clean_storyline'].head())

0               paul atreides unites fremen warpath revenge conspirators destroyed family facing choice love life fate universe endeavors prevent terrible future
1                            deadpool offered place marvel cinematic universe time variance authority instead recruits variant wolverine save universe extinction
2                                                           fading celebrity takes blackmarket drug cellreplicating substance helps create younger better version
3    snatched green place many mothers tyrants dementus immortan joe fight power control young furiosa must survive many trials puts together means find way home
4                                home conquered tyrannical emperors lead rome lucius forced enter colosseum must look past find strength return glory rome people
Name: clean_storyline, dtype: object


In [12]:
pd.set_option('display.max_colwidth', None)
print(movie[['Storyline', 'clean_storyline']].head(10))

                                                                                                                                                                                                                                     Storyline  \
0  Paul Atreides unites with the Fremen while on a warpath of revenge against the conspirators who destroyed his family. Facing a choice between the love of his life and the fate of the universe, he endeavors to prevent a terrible future.   
1                                                               Deadpool is offered a place in the Marvel Cinematic Universe by the Time Variance Authority, but instead recruits a variant of Wolverine to save his universe from extinction.   
2                                                                                                       A fading celebrity takes a black-market drug: a cell-replicating substance that helps her create a younger, better version of herself.   
3          After being snatched 

In [41]:
import os
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "cleaned_imdb_2024.csv")
movie.to_csv(desktop_path, index=False)
print(f"File saved to: {desktop_path}")

File saved to: C:\Users\ACER\Desktop\cleaned_imdb_2024.csv


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
movies = pd.read_csv('cleaned_imdb_2024.csv')
corpus = movies['clean_storyline'].dropna().tolist() 
vectorizer = TfidfVectorizer(max_features=5000, min_df=2, stop_words='english')
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['1950s' '1970s' '1985' '80s' 'abilities' 'able' 'accident' 'accidentally'
 'account' 'act' 'adapt' 'adolescence' 'adventure' 'adversary' 'age'
 'agent' 'ago' 'ahead' 'alien' 'alliance' 'alongside' 'altering'
 'ambitious' 'american' 'ancient' 'architect' 'army' 'arrives' 'art'
 'arthur' 'artist' 'aside' 'asked' 'aspiring' 'assassin' 'assignment'
 'attempts' 'author' 'backdrop' 'balance' 'bank' 'based' 'begin' 'begins'
 'behavior' 'beloved' 'best' 'betrayed' 'better' 'big' 'black' 'bob'
 'body' 'bond' 'book' 'bounty' 'boy' 'break' 'breaks' 'breakup'
 'brilliant' 'bring' 'brings' 'british' 'brother' 'brothers' 'brutal'
 'brutally' 'buried' 'business' 'called' 'campaign' 'car' 'care' 'career'
 'case' 'cat' 'chance' 'changed' 'childhood' 'children' 'chilling'
 'choice' 'christmas' 'church' 'circumstances' 'city' 'clear' 'client'
 'coach' 'come' 'comes' 'community' 'complex' 'conflict' 'confronts'
 'connection' 'consequences' 'conspiracy' 'convent' 'cops' 'corpse'
 'corrupt' 'corruption' 'c

In [None]:
print("✅ Shape:", X.shape)
print("✅ Non-zeros:", X.nnz)
print("✅ Density:", X.nnz/(X.shape[0]*X.shape[1]))  
print("✅ Sample row sum:", X[0].sum())  


✅ Shape: (250, 627)
✅ Non-zeros: 2187
✅ Density: 0.013952153110047848
✅ Sample row sum: 2.9373378718580914


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Calculate cosine similarity
similarity_score = cosine_similarity(X, X)  # Compare each row with the first row

print(f"Cosine Similarity: {similarity_score[0][0]:.4f}")

Cosine Similarity: 1.0000
