In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install wikipedia-api tqdm wikipedia

Collecting wikipedia-api
  Downloading wikipedia_api-0.7.1.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api, wikipedia
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.7.1-py3-none-any.whl size=14347 sha256=8d8ea809454605cab07745f48a8e1f1d9a3eedb68a6edc8be04cab647285a1f8
  Stored in directory: /root/.cache/pip/wheels/4c/96/18/b9201cc3e8b47b02b510460210cfd832ccf10c0c4dd0522962
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=29b60b142c68388c1971a9e124c49cab91806af032b8e13d780a62bc35616cac
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipe

In [3]:
import requests
from bs4 import BeautifulSoup
import wikipediaapi
import time
import re
from urllib.parse import unquote
from tqdm import tqdm
from datetime import datetime
from urllib.parse import quote
import concurrent.futures
import aiohttp
import asyncio
import wikipedia
import wikipedia.wikipedia

wikipedia.wikipedia.BeautifulSoup = lambda html: BeautifulSoup(html, 'lxml')

In [4]:
tmdb_df = pd.read_csv('../datasets/TMDB_movie_dataset_v11.csv')
tmdb_df.shape

(1154777, 24)

In [45]:
tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'])

movies_2000_plus = tmdb_df[tmdb_df['release_date'].dt.year >= 2000].copy()

popular_movies = tmdb_df[tmdb_df['popularity'] >= 100]

# Use .loc to safely modify the 'weighted_rating' column
movies_2000_plus.loc[:, 'weighted_rating'] = (movies_2000_plus['vote_average'] * movies_2000_plus['vote_count']) / movies_2000_plus['vote_count'].max()

# Filter the DataFrame further
filtered_movies_df = movies_2000_plus[
    (movies_2000_plus['release_date'].dt.year >= 2000) & 
    (movies_2000_plus['vote_count'] >= 69) #cursed numbers rule
].sort_values('weighted_rating', ascending=False)

# Output the results
print(f"Total movies: {len(tmdb_df)}")
print(f"Movies from 2000 onwards: {len(movies_2000_plus)}")
print(f"Percentage: {(len(movies_2000_plus)/len(tmdb_df)*100):.2f}%")
print(f"Movies with enough popularity: {len(popular_movies)}")
print(f"Movies with enough calculated popularity: {len(filtered_movies_df)}")


Total movies: 1154777
Movies from 2000 onwards: 645199
Percentage: 55.87%
Movies with enough popularity: 313
Movies with enough calculated popularity: 15801


In [5]:
def clean_title_for_wiki(title):
    """Clean movie title for Wikipedia search"""
    # Remove special characters except for spaces, hyphens, and colons
    clean_title = re.sub(r'[^\w\s\-_:]', '', title)
    # Replace '...' with '_'
    clean_title = clean_title.replace("...", "_")
    # Replace spaces with underscores
    clean_title = clean_title.replace(' ', '_')
    return clean_title

In [6]:
async def async_head_request(session, url):
    """
    Perform an async HEAD request and check both 200 and redirect responses.
    Also try a GET request if HEAD fails, as some servers handle HEAD differently.
    """
    try:
        # First try HEAD request
        async with session.head(url, follow_redirects=True) as response:
            if response.status_code in [200, 301, 302, 307, 308]:
                return url, True
                
        # If HEAD fails, try GET request
        async with session.get(url, follow_redirects=True) as response:
            return url, response.status_code in [200, 301, 302, 307, 308]
    except:
        return url, False

In [7]:
async def get_wiki_url_async(client, title, year):
    """Asynchronously generate and check Wikipedia URLs for the movie"""
    title = clean_title_for_wiki(title)
    
    # Add more common Wikipedia URL patterns
    url_variants = [
        f"{title}_({year}_film)",
        f"{title}_(film)",  # This pattern often works even without the year
        f"{title}_{year}_film",
        f"{title}_({year})",
        f"{title}_{year}",
        f"The_{title}_({year}_film)",
        f"The_{title}_{year}_film",
        f"{title}_(movie)",
        f"{title}"  # Last resort
    ]
    
    # Special cases for "The" titles
    if title.lower().startswith("the_"):
        base_title = title[4:]  # Remove "The_"
        url_variants.extend([
            f"The_{base_title}_({year}_film)",
            f"{base_title},_The_({year}_film)",
            f"{base_title}_({year}_film)",
            f"{base_title}_(film)"
        ])
    
    # Convert variants to full URLs and properly encode them
    urls = []
    for variant in url_variants:
        # Try both with and without percent encoding for certain characters
        urls.append(f"https://en.wikipedia.org/wiki/{quote(variant)}")
        urls.append(f"https://en.wikipedia.org/wiki/{variant}")
    
    # Remove duplicates while preserving order
    urls = list(dict.fromkeys(urls))
    
    # Create tasks for all URL checks
    tasks = [async_head_request(client, url) for url in urls]
    results = await asyncio.gather(*tasks)
    
    # Check results and return first successful URL
    for url, exists in results:
        if exists:
            return url
            
    # Fallback to search
    try:
        search_results = wikipedia.search(f"{title} {year} film")
        if search_results:
            page = wikipedia.page(search_results[0], auto_suggest=False)
            return page.url
    except:
        pass
    
    return None

In [8]:
def clean_plot_text(plot_text):
    """Clean and fix plot text (e.g., handling joined words like 'TheGuardians')"""
    # Fix spaces between joined words (e.g., "TheGuardians" should be "The Guardians")
    plot_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', plot_text) 
    plot_text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', plot_text)  
    plot_text = re.sub(r'\s([,\.!?;])', r'\1', plot_text)
    
    plot_text = re.sub(r'([\.!?;])([^\s])', r'\1 \2', plot_text)
    plot_text = re.sub(r'\s\'\s', r"'", plot_text)
    
    plot_text = re.sub(r'(\d)([^\w\s])', r'\1 \2', plot_text)
    return plot_text.strip()

In [9]:
def get_wiki_movie_plot(wiki, movie_url):
    """Extract plot from movie page using multiple methods"""
    if not movie_url:
        return None
    
    try:
        # Get page title from URL
        page_title = movie_url.split("/wiki/")[-1]
        page_title = unquote(page_title)
        
        # Try WikipediaAPI first
        page = wiki.page(page_title)
        if page.exists():
            plot_sections = ['Plot', 'Plot summary', 'Subject', 'Synopsis', 'Description', 'Story']
            for section in page.sections:
                if section.title in plot_sections:
                    plot = section.text
                    if plot and len(plot.strip()) > 100:  # Ensure we have meaningful content
                        return clean_plot_text(plot)
            
            # Fallback: If no plot section found and page is short
            if len(page.text) < 5000:
                response = requests.get(movie_url)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    content_div = soup.find('div', {'id': 'mw-content-text'})
                    
                    if content_div:
                        # Find all <p> tags within the div
                        p_tags = content_div.find_all('p')
                        
                        # Filter out empty paragraphs and those with very short text
                        valid_p_tags = [p for p in p_tags if len(p.get_text(strip=True)) > 50]
                        
                        # If we have at least two valid paragraphs
                        if len(valid_p_tags) >= 2:
                            plot_text = " ".join([p.get_text(strip=True) for p in valid_p_tags[:2]])
                            return clean_plot_text(plot_text)
                        # If only one valid paragraph is found
                        elif len(valid_p_tags) > 0:
                            return clean_plot_text(valid_p_tags[0].get_text(strip=True))
        
        # Try standard Wikipedia API as last resort
        try:
            wiki_page = wikipedia.page(page_title, auto_suggest=False)
            content = wiki_page.content
            sections = content.split('\n== ')
            
            for section in sections:
                if any(section.lower().startswith(header.lower()) for header in plot_sections):
                    plot = section.split('\n==')[0]
                    plot = re.sub(r'^(Plot|Subject|Synopsis|Description|Story)\s*=*\s*', '', plot, flags=re.IGNORECASE)
                    if len(plot.strip()) > 100:
                        return clean_plot_text(plot.strip())
        except:
            pass
                    
    except Exception as e:
        print(f"Error fetching plot for {movie_url}: {e}")
    
    return None

In [10]:
async def process_movie_batch(batch_df, wiki, session, progress_bar):
    plots_data = []
    failed_urls = []
    
    async def process_single_movie(row):
        title = row['title']
        year = row['release_date'].year
        
        wiki_url = await get_wiki_url_async(session, title, year)
        if wiki_url:
            plot = await asyncio.get_event_loop().run_in_executor(
                None, get_wiki_movie_plot, wiki, wiki_url
            )
            if plot:
                return {'id': row['id'], 'plot': plot}, None
        return None, {'title': title, 'year': year}
    
    tasks = [process_single_movie(row) for _, row in batch_df.iterrows()]
    results = await asyncio.gather(*tasks)
    
    for plot_data, failed_data in results:
        if plot_data:
            plots_data.append(plot_data)
        if failed_data:
            failed_urls.append(failed_data)
        progress_bar.update(1)
        progress_bar.set_postfix_str(f"Processing: {batch_df.iloc[0]['title']}")
    
    return plots_data, failed_urls

In [11]:
async def process_tmdb_movie_data_async(df, batch_size=10):
    df['release_date'] = pd.to_datetime(df['release_date'])
    filtered_df = df[
        (df['release_date'].dt.year >= 2000) &
        (df['vote_count'] >= 80)
    ].copy()
    
    filtered_df.loc[:, 'weighted_rating'] = (
        filtered_df['vote_average'] * filtered_df['vote_count']
    ) / filtered_df['vote_count'].max()
    
    filtered_df = filtered_df.sort_values('weighted_rating', ascending=False)
    
    metadata_columns = [
        'id', 'title', 'vote_count', 'vote_average', 'runtime',
        'adult', 'original_language', 'popularity', 'poster_path',
        'backdrop_path', 'genres', 'production_countries',
        'release_date', 'spoken_languages', 'keywords'
    ]
    metadata_df = filtered_df[metadata_columns].copy()
    
    wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent='MoviePlotExtractor/1.0'
    )
    
    all_plots_data = []
    all_failed_urls = []
    
    progress_bar = tqdm(total=len(filtered_df), desc="Processing Movies", unit="movie")
    
    async with aiohttp.ClientSession() as session:
        batches = [filtered_df[i:i + batch_size] for i in range(0, len(filtered_df), batch_size)]
        
        for batch in batches:
            plots_data, failed_urls = await process_movie_batch(batch, wiki, session, progress_bar)
            all_plots_data.extend(plots_data)
            all_failed_urls.extend(failed_urls)
            await asyncio.sleep(0.2)
    
    progress_bar.close()
    plots_df = pd.DataFrame(all_plots_data)
    
    print(f"\nTotal failed URLs: {len(all_failed_urls)}")
    print("\nMovies without Wikipedia pages:")
    for movie in all_failed_urls:
        print(f"{movie['title']} ({movie['year']})")
    
    return metadata_df, plots_df

In [12]:
tmdb_metadata_df, tmdb_plots_df = await process_tmdb_movie_data_async(tmdb_df)

Processing Movies:  46%|████▌     | 6610/14500 [1:09:03<1:22:02,  1.60movie/s, Processing: Cold Blood]                                                              

Error fetching plot for https://en.wikipedia.org/wiki/Doctor_Sleep_(2019_film): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Processing Movies:  86%|████████▌ | 12470/14500 [2:11:03<22:31,  1.50movie/s, Processing: Open 24 Hours]                                                                    

Error fetching plot for https://en.wikipedia.org/wiki/Code_Name_Banshee: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Processing Movies: 100%|██████████| 14500/14500 [2:32:27<00:00,  1.59movie/s, Processing: I Spit on Your Grave: Déjà Vu]                          


Total failed URLs: 987

Movies without Wikipedia pages:
Jurassic World (2015)
Star Wars: Episode III - Revenge of the Sith (2005)
Sin City (2005)
The Lego Movie (2014)
Moonlight (2016)
Don't Breathe (2016)
The Croods (2013)
The Handmaiden (2016)
The Ballad of Buster Scruggs (2018)
Black Mirror: Bandersnatch (2018)
Sin City: A Dame to Kill For (2014)
He's Just Not That Into You (2009)
The Tree of Life (2011)
Self/less (2015)
Bridget Jones's Baby (2016)
Don't Breathe 2 (2021)
Tinker Tailor Soldier Spy (2011)
You're Next (2013)
Traffic (2000)
Guy Ritchie's The Covenant (2023)
The Dead Don't Die (2019)
What If (2013)
Clifford the Big Red Dog (2021)
Triangle of Sadness (2022)
I'm Thinking of Ending Things (2020)
Batman: The Dark Knight Returns, Part 1 (2012)
Star Wars: The Clone Wars (2008)
To Rome with Love (2012)
The Man Who Wasn't There (2001)
Fear Street: 1666 (2021)
Batman: The Dark Knight Returns, Part 2 (2013)
Grindhouse (2007)
Bowling for Columbine (2002)
The Animatrix (2003)
What 




In [13]:
tmdb_plots_df.head(50)

Unnamed: 0,id,plot
0,27205,"Dom Cobb and Arthur are ""extractors"" who perfo..."
1,157336,"In the mid-21st century, humanity faces extinc..."
2,155,A gang of masked criminals rob a mafia-owned b...
3,299536,Having acquired the Power Stone—one of six Inf...
4,19995,"In 2154 , Earth suffers from resource exhausti..."
5,24428,"The Asgardian Loki encounters the Other, the r..."
6,293660,"As the vigilante Deadpool, Wade Wilson ambushe..."
7,118340,"In 1988 , following his mother's death, the ch..."
8,68718,"In 1858 Texas, brothers Ace and Dicky Speck dr..."
9,671,"Professors Albus Dumbledore, Minerva Mc Gonnag..."


In [14]:
tmdb_plots_df.to_csv('tmdb_movie_plots_v1.csv', index=False)
tmdb_metadata_df.to_csv('tmdb_movie_metadata_v1.csv', index=False)