In [120]:
# Run once if the dependencies are missing: %pip install selenium beautifulsoup4 pandas

In [121]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd

In [122]:
# Setup: launch Chrome, open the IMDb Top 250 chart and scroll until it is fully loaded

In [123]:
# Path to the downloaded ChromeDriver binary, resolved against the notebook's
# working directory.
driver_path = 'chromedriver.exe'  # Replace with the actual path

# Upper bound on scroll attempts so that a page which keeps growing cannot
# hang this cell forever.
MAX_SCROLLS = 50

# Set up the ChromeDriver service
service = Service(driver_path)

# Default options open a visible (non-headless) browser window. The old
# `Options.headless` setter was deprecated and later removed from Selenium,
# so it is intentionally not used here.
chrome_options = Options()

# Start the browser session
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the IMDb Top 250 chart
driver.get("https://www.imdb.com/chart/top/")

# Wait for a few seconds to ensure the page loads
time.sleep(5)

# Scroll to the bottom repeatedly until the document height stops changing,
# which is taken to mean that every lazily loaded item has been rendered.
last_height = driver.execute_script("return document.body.scrollHeight")

for _ in range(MAX_SCROLLS):
    # Scroll down
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for new content to load
    time.sleep(3)

    # Check new page height after scrolling
    new_height = driver.execute_script("return document.body.scrollHeight")

    # Height unchanged -> nothing more was loaded, we are at the bottom
    if new_height == last_height:
        break

    last_height = new_height
else:
    # The loop ran out of attempts without the height settling.
    print(f"Warning: page height still changing after {MAX_SCROLLS} scrolls; the list may be incomplete")

last_height : 32959


In [124]:
# Movie data: scrape title, URL, year, duration, MPAA rating, rating and vote count from the chart

In [125]:
# Extract one record per chart entry from the fully loaded chart page and
# build `df` from them.
#
# NOTE(review): several XPaths below match the full class attribute, including
# tokens that look auto-generated (e.g. "sc-e22973a9-0 khSCXM"). They will stop
# matching if the site changes its markup -- confirm against the live page
# before re-running.
movies_data = []

try:
    # The <ul> that holds the chart, then every <li> entry inside it
    ul_tag = driver.find_element(By.XPATH, '//ul[@class="ipc-metadata-list ipc-metadata-list--dividers-between sc-e22973a9-0 khSCXM compact-list-view ipc-metadata-list--base"]')
    li_tags = ul_tag.find_elements(By.XPATH, './/li[@class="ipc-metadata-list-summary-item"]')

    for movie in li_tags:
        # The <a> tag wraps the title and links to the movie page
        a_tag = movie.find_element(By.XPATH, './/a[@class="ipc-title-link-wrapper"]')

        # Movie title (inside the <h3> tag)
        title = a_tag.find_element(By.XPATH, './/h3[@class="ipc-title__text"]').text

        # Selenium returns the resolved href, i.e. an absolute URL
        movie_url = a_tag.get_attribute('href')

        # <div> holding the year / duration / MPAA rating spans
        metadata_div = movie.find_element(By.XPATH, './/div[@class="sc-d5ea4b9d-6 hBxwRe cli-title-metadata"]')

        print(f"{title}")

        # Query the metadata spans once. The values are positional
        # (year, duration, MPAA rating); any that are absent become None
        # instead of aborting the whole scrape.
        metadata_items = [
            span.text
            for span in metadata_div.find_elements(By.XPATH, './/span[@class="sc-d5ea4b9d-7 URyjV cli-title-metadata-item"]')
        ]
        year, duration, mpaa_rating = (metadata_items + [None] * 3)[:3]

        # <div> holding the star rating and the vote count
        rating_div = movie.find_element(By.XPATH, './/div[@class="sc-e2dbc1a3-0 jeHPdh sc-d5ea4b9d-3 eunRcy cli-ratings-container"]')

        rating = rating_div.find_element(By.XPATH, './/span[@class="ipc-rating-star--rating"]').text
        vote_count = rating_div.find_element(By.XPATH, './/span[@class="ipc-rating-star--voteCount"]').text.strip()

        # Append the data to the list
        movies_data.append({
            'Title': title,
            'URL': movie_url,
            'Year': year,
            'Duration': duration,
            'MPAA Rating': mpaa_rating,
            'Rating': rating,
            'Vote Count': vote_count
        })
finally:
    # Always release the browser, even if a lookup above raised.
    driver.quit()

# Create a DataFrame from the list of movie data
df = pd.DataFrame(movies_data)

1. The Shawshank Redemption
2. The Godfather
3. The Dark Knight
4. The Godfather Part II
5. 12 Angry Men
6. The Lord of the Rings: The Return of the King
7. Schindler's List
8. Pulp Fiction
9. The Lord of the Rings: The Fellowship of the Ring
10. The Good, the Bad and the Ugly
11. Forrest Gump
12. The Lord of the Rings: The Two Towers
13. Fight Club
14. Inception
15. Star Wars: Episode V - The Empire Strikes Back
16. The Matrix
17. Goodfellas
18. One Flew Over the Cuckoo's Nest
19. Interstellar
20. Se7en
21. It's a Wonderful Life
22. Seven Samurai
23. The Silence of the Lambs
24. Saving Private Ryan
25. City of God
26. The Green Mile
27. Life Is Beautiful
28. Terminator 2: Judgment Day
29. Star Wars: Episode IV - A New Hope
30. Back to the Future
31. Spirited Away
32. The Pianist
33. Gladiator
34. Parasite
35. Psycho
36. The Lion King
37. Grave of the Fireflies
38. The Departed
39. Whiplash
40. Harakiri
41. American History X
42. The Prestige
43. Léon: The Professional
44. Spider-Man: 

In [126]:
# Preview the first five rows of the scraped chart table.
df.head(n=5)

Unnamed: 0,Title,URL,Year,Duration,MPAA Rating,Rating,Vote Count
0,1. The Shawshank Redemption,https://www.imdb.com/title/tt0111161/?ref_=cht...,1994,2h 22m,R,9.3,(3M)
1,2. The Godfather,https://www.imdb.com/title/tt0068646/?ref_=cht...,1972,2h 55m,R,9.2,(2.1M)
2,3. The Dark Knight,https://www.imdb.com/title/tt0468569/?ref_=cht...,2008,2h 32m,PG-13,9.0,(3M)
3,4. The Godfather Part II,https://www.imdb.com/title/tt0071562/?ref_=cht...,1974,3h 22m,R,9.0,(1.4M)
4,5. 12 Angry Men,https://www.imdb.com/title/tt0050083/?ref_=cht...,1957,1h 36m,Approved,9.0,(909K)


In [127]:
# Show the column labels of the scraped chart table.
df.columns

Index(['Title', 'URL', 'Year', 'Duration', 'MPAA Rating', 'Rating',
       'Vote Count'],
      dtype='object')

In [128]:
# Setup: start a new Chrome session for the per-movie detail pages

In [129]:
# Start a new browser session for the per-movie detail pages; the driver used
# for the chart page was already closed with driver.quit().
driver_path = 'chromedriver.exe'  # Replace with actual path to chromedriver
service = Service(driver_path)

# Default options open a visible browser window. The old `Options.headless`
# setter was deprecated and later removed from Selenium, so it is not used.
chrome_options = Options()
driver = webdriver.Chrome(service=service, options=chrome_options)

In [130]:
# Other details: visit each movie page and collect genres, directors, writers and stars

In [131]:
def _linked_names(ul_tags, position):
    """Return the link texts found in the ``position``-th inline credit list.

    Parameters
    ----------
    ul_tags : list
        The inline <ul> elements found inside the title's credit block.
    position : int
        Index of the list to read.

    Returns
    -------
    list of str
        Stripped innerHTML of every credit link in that list, or an empty
        list when the page has fewer than ``position + 1`` lists.
    """
    if position >= len(ul_tags):
        return []
    a_tags = ul_tags[position].find_elements(By.XPATH, './/a[@class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link"]')
    return [a_tag.get_attribute('innerHTML').strip() for a_tag in a_tags]


# Column order of the resulting detail table.
detail_columns = ['Title', 'Genres', 'Directors', 'Writers', 'Stars']

# One dict per movie; converted to a DataFrame once, after the loop.
movie_records = []

try:
    # Visit every movie page listed in `df` (needs the 'Title' and 'URL' columns)
    for _, row in df.iterrows():
        movie_title = row['Title']
        movie_url = row['URL']

        # Progress output. This must come after the assignment above;
        # printing first raised NameError on a fresh kernel.
        print(movie_title)

        # Open the movie page
        driver.get(movie_url)

        # Wait for the page to load
        time.sleep(3)

        # Genre chips (example: Epic, Period Drama, etc.)
        genre_tags = driver.find_elements(By.XPATH, '//div[@class="ipc-chip-list__scroller"]/a/span')
        genres = [genre.text for genre in genre_tags]

        # Credit block, then the inline <ul> lists inside it
        metadata_list = driver.find_element(By.XPATH, '//ul[@class="ipc-metadata-list ipc-metadata-list--dividers-all title-pc-list ipc-metadata-list--baseAlt"]')
        ul_tags = metadata_list.find_elements(By.XPATH, './/ul[@class="ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--inline ipc-metadata-list-item__list-content baseAlt"]')

        # The lists are read by position: 0 = directors, 1 = writers, 2 = stars.
        # NOTE(review): this assumes every page shows all three credit rows in
        # that order -- confirm for titles that lack one of them.
        directors = _linked_names(ul_tags, 0)
        writers = _linked_names(ul_tags, 1)
        stars = _linked_names(ul_tags, 2)

        # Store multi-valued fields as ' | ' separated strings
        movie_records.append({
            'Title': movie_title,
            'Genres': ' | '.join(genres),
            'Directors': ' | '.join(directors),
            'Writers': ' | '.join(writers),
            'Stars': ' | '.join(stars)
        })
finally:
    # Close the browser after finishing, including when a page lookup raised.
    driver.quit()

# Passing the columns explicitly keeps the schema stable even if `df` was empty.
final_df = pd.DataFrame(movie_records, columns=detail_columns)

8. Pulp Fiction
1. The Shawshank Redemption
2. The Godfather
3. The Dark Knight
4. The Godfather Part II
5. 12 Angry Men
6. The Lord of the Rings: The Return of the King
7. Schindler's List
8. Pulp Fiction
9. The Lord of the Rings: The Fellowship of the Ring
10. The Good, the Bad and the Ugly
11. Forrest Gump
12. The Lord of the Rings: The Two Towers
13. Fight Club
14. Inception
15. Star Wars: Episode V - The Empire Strikes Back
16. The Matrix
17. Goodfellas
18. One Flew Over the Cuckoo's Nest
19. Interstellar
20. Se7en
21. It's a Wonderful Life
22. Seven Samurai
23. The Silence of the Lambs
24. Saving Private Ryan
25. City of God
26. The Green Mile
27. Life Is Beautiful
28. Terminator 2: Judgment Day
29. Star Wars: Episode IV - A New Hope
30. Back to the Future
31. Spirited Away
32. The Pianist
33. Gladiator
34. Parasite
35. Psycho
36. The Lion King
37. Grave of the Fireflies
38. The Departed
39. Whiplash
40. Harakiri
41. American History X
42. The Prestige
43. Léon: The Professional


In [132]:
# Preview the first five rows of the per-movie detail table.
final_df.head(n=5)

Unnamed: 0,Title,Genres,Directors,Writers,Stars
0,1. The Shawshank Redemption,Epic | Period Drama | Prison Drama | Drama,Frank Darabont,Stephen King | Frank Darabont,Tim Robbins | Morgan Freeman | Bob Gunton
1,2. The Godfather,Epic | Gangster | Tragedy | Crime | Drama,Francis Ford Coppola,Mario Puzo | Francis Ford Coppola,Marlon Brando | Al Pacino | James Caan
2,3. The Dark Knight,Action Epic | Epic | Superhero | Action | Crim...,Christopher Nolan,Jonathan Nolan | Christopher Nolan | David S. ...,Christian Bale | Heath Ledger | Aaron Eckhart
3,4. The Godfather Part II,Epic | Gangster | Tragedy | Crime | Drama,Francis Ford Coppola,Francis Ford Coppola | Mario Puzo,Al Pacino | Robert De Niro | Robert Duvall
4,5. 12 Angry Men,Legal Drama | Psychological Drama | Crime | Drama,Sidney Lumet,Reginald Rose,Henry Fonda | Lee J. Cobb | Martin Balsam


In [133]:
# Show the chart table again, for side-by-side comparison with final_df before merging.
df.head(n=5)

Unnamed: 0,Title,URL,Year,Duration,MPAA Rating,Rating,Vote Count
0,1. The Shawshank Redemption,https://www.imdb.com/title/tt0111161/?ref_=cht...,1994,2h 22m,R,9.3,(3M)
1,2. The Godfather,https://www.imdb.com/title/tt0068646/?ref_=cht...,1972,2h 55m,R,9.2,(2.1M)
2,3. The Dark Knight,https://www.imdb.com/title/tt0468569/?ref_=cht...,2008,2h 32m,PG-13,9.0,(3M)
3,4. The Godfather Part II,https://www.imdb.com/title/tt0071562/?ref_=cht...,1974,3h 22m,R,9.0,(1.4M)
4,5. 12 Angry Men,https://www.imdb.com/title/tt0050083/?ref_=cht...,1957,1h 36m,Approved,9.0,(909K)


In [134]:
# Both tables should have the same number of rows before they are merged.
(final_df.shape, df.shape)

((250, 5), (250, 7))

In [136]:
# Attach the detail columns to the chart table. The join key is 'Title', which
# includes the chart rank prefix (e.g. "1. The Shawshank Redemption").
# validate="one_to_one" makes pandas raise MergeError if a title is duplicated
# on either side, instead of silently multiplying rows.
final_res = pd.merge(
    left=df,
    right=final_df,
    how="left",
    on='Title',
    validate="one_to_one",
)

final_res.head()

Unnamed: 0,Title,URL,Year,Duration,MPAA Rating,Rating,Vote Count,Genres,Directors,Writers,Stars
0,1. The Shawshank Redemption,https://www.imdb.com/title/tt0111161/?ref_=cht...,1994,2h 22m,R,9.3,(3M),Epic | Period Drama | Prison Drama | Drama,Frank Darabont,Stephen King | Frank Darabont,Tim Robbins | Morgan Freeman | Bob Gunton
1,2. The Godfather,https://www.imdb.com/title/tt0068646/?ref_=cht...,1972,2h 55m,R,9.2,(2.1M),Epic | Gangster | Tragedy | Crime | Drama,Francis Ford Coppola,Mario Puzo | Francis Ford Coppola,Marlon Brando | Al Pacino | James Caan
2,3. The Dark Knight,https://www.imdb.com/title/tt0468569/?ref_=cht...,2008,2h 32m,PG-13,9.0,(3M),Action Epic | Epic | Superhero | Action | Crim...,Christopher Nolan,Jonathan Nolan | Christopher Nolan | David S. ...,Christian Bale | Heath Ledger | Aaron Eckhart
3,4. The Godfather Part II,https://www.imdb.com/title/tt0071562/?ref_=cht...,1974,3h 22m,R,9.0,(1.4M),Epic | Gangster | Tragedy | Crime | Drama,Francis Ford Coppola,Francis Ford Coppola | Mario Puzo,Al Pacino | Robert De Niro | Robert Duvall
4,5. 12 Angry Men,https://www.imdb.com/title/tt0050083/?ref_=cht...,1957,1h 36m,Approved,9.0,(909K),Legal Drama | Psychological Drama | Crime | Drama,Sidney Lumet,Reginald Rose,Henry Fonda | Lee J. Cobb | Martin Balsam


In [137]:
# Row and column count of the merged table.
final_res.shape

(250, 11)

In [138]:
# Write the merged table to disk; the DataFrame index is not written.
final_res.to_csv(path_or_buf="movie_final_data.csv", index=False)