In [6]:
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup



In [7]:
def get_url():
    """Mengembalikan URL halaman IMDb Top 250."""
    return "https://www.imdb.com/chart/top/"

def save_to_csv(movies_data):
    """Menyimpan list of dictionaries ke file CSV."""
    df = pd.DataFrame(movies_data)
    # Menyesuaikan nama file agar lebih deskriptif
    df.to_csv('imdb_top_250_movies_with_ratings.csv', index=False, encoding='utf-8-sig')
    print(f'‚úÖ Sukses menyimpan {len(df)} data film ke imdb_top_250_movies_with_ratings.csv')

In [8]:
def scrape_movies():
    url = get_url()
    
    options = Options()
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36')
    options.add_argument("lang=en-US")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=options)
    movies_data = []
    
    try:
        print(f'Membuka halaman: {url}')
        driver.get(url)
        
        # Wait for page to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'ul.ipc-metadata-list'))
        )
        
        # Scroll to load all content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        movie_items = driver.find_elements(By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item')
        print(f'‚úÖ Ditemukan {len(movie_items)} film. Memulai proses scraping...')

        for idx in range(len(movie_items)):
            try:
                # Re-fetch elements to avoid stale reference
                all_items = driver.find_elements(By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item')
                if idx >= len(all_items):
                    break
                    
                item = all_items[idx]
                
                # Get basic info from list page
                try:
                    title_elem = item.find_element(By.CLASS_NAME, 'ipc-title__text')
                    title_text = title_elem.text
                    title = title_text.split('. ', 1)[1] if '. ' in title_text else title_text
                except:
                    title = "N/A"
                
                try:
                    metadata = item.find_elements(By.CLASS_NAME, 'cli-title-metadata-item')
                    year = metadata[0].text if len(metadata) > 0 else "N/A"
                    duration = metadata[1].text if len(metadata) > 1 else "N/A"
                except:
                    year = "N/A"
                    duration = "N/A"
                
                try:
                    rating = item.find_element(By.CLASS_NAME, 'ipc-rating-star--rating').text
                except:
                    rating = "N/A"
                
                # Get link and navigate to detail page
                directors = "N/A"
                stars = "N/A"
                genres = "N/A"
                
                try:
                    link = item.find_element(By.CSS_SELECTOR, 'a.ipc-title-link-wrapper')
                    movie_url = link.get_attribute('href')
                    
                    # Open in new tab
                    driver.execute_script("window.open(arguments[0], '_blank');", movie_url)
                    driver.switch_to.window(driver.window_handles[1])
                    
                    # Wait for detail page
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, 'body'))
                    )
                    time.sleep(1)
                    
                    # Get all genres (no limit)
                    try:
                        genre_elements = driver.find_elements(By.CSS_SELECTOR, 'a.ipc-chip--on-baseAlt span.ipc-chip__text')
                        if not genre_elements:
                            # Try alternative selector
                            genre_elements = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="genres"] a span')
                        
                        if genre_elements:
                            genre_list = [g.text for g in genre_elements if g.text]  # Get all genres
                            genres = ', '.join(genre_list) if genre_list else "N/A"
                    except Exception as e:
                        pass
                    
                    # Get directors from principal credits
                    try:
                        credit_items = driver.find_elements(By.CSS_SELECTOR, 'li[data-testid="title-pc-principal-credit"]')
                        
                        for credit in credit_items:
                            try:
                                label = credit.find_element(By.CSS_SELECTOR, 'span.ipc-metadata-list-item__label').text
                                
                                if 'Director' in label:
                                    links = credit.find_elements(By.CSS_SELECTOR, 'a.ipc-metadata-list-item__list-content-item')
                                    names = [link.text for link in links if link.text]
                                    if names:
                                        directors = ', '.join(names)
                            except:
                                continue
                    except:
                        pass
                    
                    # Get stars from cast section
                    try:
                        cast_section = driver.find_element(By.CSS_SELECTOR, '[data-testid="title-cast"]')
                        cast_items = cast_section.find_elements(By.CSS_SELECTOR, '[data-testid="title-cast-item"]')
                        
                        star_names = []
                        for cast_item in cast_items[:3]:  # Get top 3 stars
                            try:
                                name = cast_item.find_element(By.CSS_SELECTOR, 'a[data-testid="title-cast-item__actor"]').text
                                if name:
                                    star_names.append(name)
                            except:
                                continue
                        
                        if star_names:
                            stars = ', '.join(star_names)
                    except Exception as e:
                        pass
                    
                    # Close detail tab
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    
                except Exception as e:
                    print(f"    ‚ö†Ô∏è Error getting details: {str(e)[:50]}")
                    # Make sure we're back on main window
                    while len(driver.window_handles) > 1:
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                
                print(f"-> Scraped ({idx+1}/{len(all_items)}): {title}")
                movies_data.append({
                    'title': title,
                    'year': year,
                    'duration': duration,
                    'rating': rating,
                    'genres': genres,
                    'directors': directors,
                    'stars': stars
                })
                
                time.sleep(0.3)  # Small delay between requests
                
            except Exception as e:
                print(f"  ‚ö†Ô∏è Error scraping movie {idx+1}: {str(e)[:80]}")
                # Ensure we're on main window
                try:
                    while len(driver.window_handles) > 1:
                        driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                except:
                    pass
                continue
            
    except Exception as e:
        print(f"‚ö†Ô∏è Terjadi kesalahan fatal: {e}")
    finally:
        driver.quit()
        print("‚úÖ Proses scraping selesai.")

    return movies_data


#### **RUN SCRAPING AND STORE DATA**

In [9]:
if __name__ == "__main__":
    hasil_scraping = scrape_movies()
    if hasil_scraping:
        save_to_csv(hasil_scraping)
    else:
        print("Tidak ada data untuk disimpan. Proses scraping gagal.")

Membuka halaman: https://www.imdb.com/chart/top/
‚úÖ Ditemukan 250 film. Memulai proses scraping...
‚úÖ Ditemukan 250 film. Memulai proses scraping...
-> Scraped (1/250): The Shawshank Redemption
-> Scraped (1/250): The Shawshank Redemption
-> Scraped (2/250): The Godfather
-> Scraped (2/250): The Godfather
-> Scraped (3/250): The Dark Knight
-> Scraped (3/250): The Dark Knight
-> Scraped (4/250): The Godfather Part II
-> Scraped (4/250): The Godfather Part II
-> Scraped (5/250): 12 Angry Men
-> Scraped (5/250): 12 Angry Men
-> Scraped (6/250): The Lord of the Rings: The Return of the King
-> Scraped (6/250): The Lord of the Rings: The Return of the King
-> Scraped (7/250): Schindler's List
-> Scraped (7/250): Schindler's List
-> Scraped (8/250): The Lord of the Rings: The Fellowship of the Ring
-> Scraped (8/250): The Lord of the Rings: The Fellowship of the Ring
-> Scraped (9/250): Pulp Fiction
-> Scraped (9/250): Pulp Fiction
-> Scraped (10/250): The Good, the Bad and the Ugly
-> Sc

In [10]:
# Check the scraped data
df = pd.read_csv('imdb_top_250_movies_with_ratings.csv')
print(f"Total movies scraped: {len(df)}")
print("\nFirst few rows:")
display(df.head(10))
print("\nDataFrame info:")
df.info()


Total movies scraped: 250

First few rows:


Unnamed: 0,title,year,duration,rating,genres,directors,stars
0,The Shawshank Redemption,1994,2h 22m,9.3,"Epic, Period Drama, Prison Drama, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton"
1,The Godfather,1972,2h 55m,9.2,"Epic, Gangster, Tragedy, Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan"
2,The Dark Knight,2008,2h 32m,9.1,"Action Epic, Epic, Psychological Drama, Psycho...",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart"
3,The Godfather Part II,1974,3h 22m,9.0,"Epic, Gangster, Tragedy, Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall"
4,12 Angry Men,1957,1h 36m,9.0,"Legal Drama, Psychological Drama, Crime, Drama",Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam"
5,The Lord of the Rings: The Return of the King,2003,3h 21m,9.0,"Action Epic, Adventure Epic, Epic, Fantasy Epi...",Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen"
6,Schindler's List,1993,3h 15m,9.0,"Docudrama, Epic, Historical Epic, Period Drama...",Steven Spielberg,"Liam Neeson, Ralph Fiennes, Ben Kingsley"
7,The Lord of the Rings: The Fellowship of the Ring,2001,2h 58m,8.9,"Action Epic, Adventure Epic, Dark Fantasy, Epi...",Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom"
8,Pulp Fiction,1994,2h 34m,8.8,"Dark Comedy, Drug Crime, Gangster, Crime, Drama",Quentin Tarantino,"John Travolta, Uma Thurman, Samuel L. Jackson"
9,"The Good, the Bad and the Ugly",1966,2h 58m,8.8,"Action Epic, Adventure Epic, Dark Comedy, Dese...",Sergio Leone,"Clint Eastwood, Eli Wallach, Lee Van Cleef"



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      250 non-null    object 
 1   year       250 non-null    int64  
 2   duration   250 non-null    object 
 3   rating     250 non-null    float64
 4   genres     250 non-null    object 
 5   directors  249 non-null    object 
 6   stars      250 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 13.8+ KB


In [11]:
# Check the genres column
print("\nüìä Sample with genres:")
display(df[['title', 'year', 'rating', 'genres']].head(10))

print("\nüé≠ Check genre coverage:")
print(f"Movies with genres: {df['genres'].notna().sum()}/{len(df)}")
print(f"Movies with 'N/A' genres: {(df['genres'] == 'N/A').sum()}")



üìä Sample with genres:


Unnamed: 0,title,year,rating,genres
0,The Shawshank Redemption,1994,9.3,"Epic, Period Drama, Prison Drama, Drama"
1,The Godfather,1972,9.2,"Epic, Gangster, Tragedy, Crime, Drama"
2,The Dark Knight,2008,9.1,"Action Epic, Epic, Psychological Drama, Psycho..."
3,The Godfather Part II,1974,9.0,"Epic, Gangster, Tragedy, Crime, Drama"
4,12 Angry Men,1957,9.0,"Legal Drama, Psychological Drama, Crime, Drama"
5,The Lord of the Rings: The Return of the King,2003,9.0,"Action Epic, Adventure Epic, Epic, Fantasy Epi..."
6,Schindler's List,1993,9.0,"Docudrama, Epic, Historical Epic, Period Drama..."
7,The Lord of the Rings: The Fellowship of the Ring,2001,8.9,"Action Epic, Adventure Epic, Dark Fantasy, Epi..."
8,Pulp Fiction,1994,8.8,"Dark Comedy, Drug Crime, Gangster, Crime, Drama"
9,"The Good, the Bad and the Ugly",1966,8.8,"Action Epic, Adventure Epic, Dark Comedy, Dese..."



üé≠ Check genre coverage:
Movies with genres: 250/250
Movies with 'N/A' genres: 0


In [18]:
# Check sample of data
print("Sample of scraped data:")
print(df[['title', 'directors', 'stars']].head(10))
print("\n")
print(f"Movies with directors: {df['directors'].notna().sum()}")
print(f"Movies with stars: {df['stars'].notna().sum()}")


Sample of scraped data:
                                               title             directors  \
0                           The Shawshank Redemption        Frank Darabont   
1                                      The Godfather  Francis Ford Coppola   
2                                    The Dark Knight     Christopher Nolan   
3                              The Godfather Part II  Francis Ford Coppola   
4                                       12 Angry Men          Sidney Lumet   
5      The Lord of the Rings: The Return of the King         Peter Jackson   
6                                   Schindler's List      Steven Spielberg   
7  The Lord of the Rings: The Fellowship of the Ring         Peter Jackson   
8                                       Pulp Fiction     Quentin Tarantino   
9                     The Good, the Bad and the Ugly          Sergio Leone   

                                           stars  
0        Tim Robbins, Morgan Freeman, Bob Gunton  
1           Mar

In [14]:
# Test scraping one movie detail page to debug stars extraction
def test_movie_detail():
    options = Options()
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    options.add_argument("--window-size=1920,1080")
    
    driver = webdriver.Chrome(options=options)
    
    try:
        # Test with The Shawshank Redemption
        test_url = "https://www.imdb.com/title/tt0111161/"
        print(f"Testing URL: {test_url}")
        driver.get(test_url)
        
        time.sleep(3)
        
        # Try different selectors for stars
        print("\n=== Testing different selectors ===")
        
        # Method 1: Principal credits
        try:
            credits = driver.find_elements(By.CSS_SELECTOR, 'li[data-testid="title-pc-principal-credit"]')
            print(f"Found {len(credits)} credit sections")
            
            for i, credit in enumerate(credits):
                try:
                    label = credit.find_element(By.CSS_SELECTOR, 'span.ipc-metadata-list-item__label').text
                    print(f"  Credit {i}: {label}")
                    
                    items = credit.find_elements(By.CSS_SELECTOR, 'a')
                    names = [item.text for item in items if item.text]
                    print(f"    Names: {', '.join(names[:5])}")
                except Exception as e:
                    print(f"    Error: {e}")
        except Exception as e:
            print(f"Method 1 failed: {e}")
        
        # Method 2: All cast
        print("\n=== Method 2: Cast section ===")
        try:
            cast_section = driver.find_element(By.CSS_SELECTOR, '[data-testid="title-cast"]')
            cast_items = cast_section.find_elements(By.CSS_SELECTOR, '[data-testid="title-cast-item"]')
            print(f"Found {len(cast_items)} cast items")
            for i, item in enumerate(cast_items[:5]):
                try:
                    name = item.find_element(By.CSS_SELECTOR, 'a[data-testid="title-cast-item__actor"]').text
                    print(f"  {i+1}. {name}")
                except:
                    pass
        except Exception as e:
            print(f"Method 2 failed: {e}")
            
    finally:
        driver.quit()

# Run test
test_movie_detail()


Testing URL: https://www.imdb.com/title/tt0111161/

=== Testing different selectors ===

=== Testing different selectors ===
Found 6 credit sections
  Credit 0: Director
    Names: Frank Darabont
  Credit 1: Writers
    Names: Stephen King, Frank Darabont
    Error: Message: no such element: Unable to locate element: {"method":"css selector","selector":"span.ipc-metadata-list-item__label"}
  (Session info: chrome=142.0.7444.176); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff7a6f9a235
	0x7ff7a6cf2630
	0x7ff7a6a816dd
	0x7ff7a6ada27e
	0x7ff7a6ada58c
	0x7ff7a6accd7c
	0x7ff7a6accc36
	0x7ff7a6b2baba
	0x7ff7a6acb0ed
	0x7ff7a6acbf63
	0x7ff7a6fc5d60
	0x7ff7a6fbfe8a
	0x7ff7a6fe1005
	0x7ff7a6d0d71e
	0x7ff7a6d14e1f
	0x7ff7a6cfb7c4
	0x7ff7a6cfb97f
	0x7ff7a6ce18e8
	0x7fff37c97374
	0x7fff395fcc91

  Credit 3: 
    Names: 
  Credit 4: 


### Test Genre Extraction
Test scraping genres from a single movie

In [None]:
# Test genre extraction on a single movie
def test_genre_extraction():
    options = Options()
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    options.add_argument("--window-size=1920,1080")
    
    driver = webdriver.Chrome(options=options)
    
    try:
        # Test with The Shawshank Redemption
        test_url = "https://www.imdb.com/title/tt0111161/"
        print(f"Testing URL: {test_url}")
        driver.get(test_url)
        
        time.sleep(3)
        
        print("\n=== Testing Genre Extraction ===")
        
        # Method 1: Genre chips
        try:
            genre_elements = driver.find_elements(By.CSS_SELECTOR, 'a.ipc-chip--on-baseAlt span.ipc-chip__text')
            print(f"Method 1 - Found {len(genre_elements)} genre elements")
            genres = [g.text for g in genre_elements if g.text]  # Get all genres
            print(f"All Genres: {', '.join(genres)}")
        except Exception as e:
            print(f"Method 1 failed: {e}")
        
        # Method 2: Alternative selector
        try:
            genre_elements = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="genres"] a span')
            print(f"\nMethod 2 - Found {len(genre_elements)} genre elements")
            genres = [g.text for g in genre_elements if g.text]  # Get all genres
            print(f"All Genres: {', '.join(genres)}")
        except Exception as e:
            print(f"Method 2 failed: {e}")
            
    finally:
        driver.quit()
        print("\n‚úÖ Test completed")

# Run test
test_genre_extraction()


Testing URL: https://www.imdb.com/title/tt0111161/

=== Testing Genre Extraction ===

=== Testing Genre Extraction ===
Method 1 - Found 4 genre elements
Genres (max 3): Epic, Period Drama, Prison Drama

Method 2 - Found 0 genre elements
Genres (max 3): 
Method 1 - Found 4 genre elements
Genres (max 3): Epic, Period Drama, Prison Drama

Method 2 - Found 0 genre elements
Genres (max 3): 

‚úÖ Test completed

‚úÖ Test completed


## ‚úÖ Scraping Summary

Successfully scraped **IMDB Top 250 Movies** with the following information:
- **Title**: Movie name
- **Year**: Release year
- **Duration**: Movie runtime
- **Rating**: IMDB rating
- **Genres**: All genres for each movie
- **Directors**: Movie director(s)
- **Stars**: Top 3 cast members

**Output file**: `imdb_top_250_movies_with_ratings.csv`


In [12]:
# Final verification
df_final = pd.read_csv('imdb_top_250_movies_with_ratings.csv')

print("=" * 60)
print("SCRAPING RESULTS - IMDB TOP 250 MOVIES")
print("=" * 60)
print(f"\nüìä Total movies scraped: {len(df_final)}")
print(f"üìÅ File saved: imdb_top_250_movies_with_ratings.csv")
print(f"üì¶ File size: {df_final.memory_usage(deep=True).sum() / 1024:.2f} KB")

print("\nüìã Columns:")
for col in df_final.columns:
    non_null = df_final[col].notna().sum()
    print(f"  ‚Ä¢ {col}: {non_null}/{len(df_final)} filled ({non_null/len(df_final)*100:.1f}%)")

print("\nüé¨ Sample movies:")
display(df_final[['title', 'year', 'rating', 'directors']].head(10))

print("\n‚≠ê Top 10 rated movies:")
top_10 = df_final.nlargest(10, 'rating')[['title', 'year', 'rating', 'directors']]
display(top_10)


SCRAPING RESULTS - IMDB TOP 250 MOVIES

üìä Total movies scraped: 250
üìÅ File saved: imdb_top_250_movies_with_ratings.csv
üì¶ File size: 114.18 KB

üìã Columns:
  ‚Ä¢ title: 250/250 filled (100.0%)
  ‚Ä¢ year: 250/250 filled (100.0%)
  ‚Ä¢ duration: 250/250 filled (100.0%)
  ‚Ä¢ rating: 250/250 filled (100.0%)
  ‚Ä¢ genres: 250/250 filled (100.0%)
  ‚Ä¢ directors: 249/250 filled (99.6%)
  ‚Ä¢ stars: 250/250 filled (100.0%)

üé¨ Sample movies:


Unnamed: 0,title,year,rating,directors
0,The Shawshank Redemption,1994,9.3,Frank Darabont
1,The Godfather,1972,9.2,Francis Ford Coppola
2,The Dark Knight,2008,9.1,Christopher Nolan
3,The Godfather Part II,1974,9.0,Francis Ford Coppola
4,12 Angry Men,1957,9.0,Sidney Lumet
5,The Lord of the Rings: The Return of the King,2003,9.0,Peter Jackson
6,Schindler's List,1993,9.0,Steven Spielberg
7,The Lord of the Rings: The Fellowship of the Ring,2001,8.9,Peter Jackson
8,Pulp Fiction,1994,8.8,Quentin Tarantino
9,"The Good, the Bad and the Ugly",1966,8.8,Sergio Leone



‚≠ê Top 10 rated movies:


Unnamed: 0,title,year,rating,directors
0,The Shawshank Redemption,1994,9.3,Frank Darabont
1,The Godfather,1972,9.2,Francis Ford Coppola
2,The Dark Knight,2008,9.1,Christopher Nolan
3,The Godfather Part II,1974,9.0,Francis Ford Coppola
4,12 Angry Men,1957,9.0,Sidney Lumet
5,The Lord of the Rings: The Return of the King,2003,9.0,Peter Jackson
6,Schindler's List,1993,9.0,Steven Spielberg
7,The Lord of the Rings: The Fellowship of the Ring,2001,8.9,Peter Jackson
8,Pulp Fiction,1994,8.8,Quentin Tarantino
9,"The Good, the Bad and the Ugly",1966,8.8,Sergio Leone


In [13]:
# Display full sample with all columns
print("\nüé≠ Complete sample (with stars):")
display(df_final.head(10))



üé≠ Complete sample (with stars):


Unnamed: 0,title,year,duration,rating,genres,directors,stars
0,The Shawshank Redemption,1994,2h 22m,9.3,"Epic, Period Drama, Prison Drama, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton"
1,The Godfather,1972,2h 55m,9.2,"Epic, Gangster, Tragedy, Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan"
2,The Dark Knight,2008,2h 32m,9.1,"Action Epic, Epic, Psychological Drama, Psycho...",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart"
3,The Godfather Part II,1974,3h 22m,9.0,"Epic, Gangster, Tragedy, Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall"
4,12 Angry Men,1957,1h 36m,9.0,"Legal Drama, Psychological Drama, Crime, Drama",Sidney Lumet,"Henry Fonda, Lee J. Cobb, Martin Balsam"
5,The Lord of the Rings: The Return of the King,2003,3h 21m,9.0,"Action Epic, Adventure Epic, Epic, Fantasy Epi...",Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen"
6,Schindler's List,1993,3h 15m,9.0,"Docudrama, Epic, Historical Epic, Period Drama...",Steven Spielberg,"Liam Neeson, Ralph Fiennes, Ben Kingsley"
7,The Lord of the Rings: The Fellowship of the Ring,2001,2h 58m,8.9,"Action Epic, Adventure Epic, Dark Fantasy, Epi...",Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom"
8,Pulp Fiction,1994,2h 34m,8.8,"Dark Comedy, Drug Crime, Gangster, Crime, Drama",Quentin Tarantino,"John Travolta, Uma Thurman, Samuel L. Jackson"
9,"The Good, the Bad and the Ugly",1966,2h 58m,8.8,"Action Epic, Adventure Epic, Dark Comedy, Dese...",Sergio Leone,"Clint Eastwood, Eli Wallach, Lee Van Cleef"


In [3]:
# Check which movie is missing director
df = pd.read_csv('imdb_top_250_movies_with_ratings.csv')

# Find movies with NULL directors
missing_directors = df[df['directors'].isnull()]

if len(missing_directors) > 0:
    print(f"Found {len(missing_directors)} movie(s) with missing director (NULL):\n")
    display(missing_directors[['title', 'year', 'duration', 'rating', 'directors', 'stars']])
else:
    # Try N/A as string
    missing_directors = df[df['directors'] == 'N/A']
    if len(missing_directors) > 0:
        print(f"Found {len(missing_directors)} movie(s) with missing director (N/A):\n")
        display(missing_directors[['title', 'year', 'directors', 'stars']])
    else:
        print("No movies with missing directors found!")

# Also check for any other potential missing values
print("\n\nMissing values summary:")
print(df.isnull().sum())


Found 1 movie(s) with missing director (NULL):



Unnamed: 0,title,year,duration,rating,directors,stars
234,The Wizard of Oz,1939,1h 42m,8.1,,"Judy Garland, Frank Morgan, Ray Bolger"




Missing values summary:
title        0
year         0
duration     0
rating       0
directors    1
stars        0
dtype: int64


In [4]:
movielist = pd.read_csv('movielists.csv')
display(movielist.head())

movielist.info()

Unnamed: 0,judul_film,tahun_rilis,durasi,rating
0,The Shawshank Redemption,1994,2h 22m,9.3
1,The Godfather,1972,2h 55m,9.2
2,The Dark Knight,2008,2h 32m,9.1
3,The Godfather Part II,1974,3h 22m,9.0
4,12 Angry Men,1957,1h 36m,9.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   judul_film   250 non-null    object 
 1   tahun_rilis  250 non-null    int64  
 2   durasi       250 non-null    object 
 3   rating       250 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 7.9+ KB
