In [26]:
# Import required libraries
from gazpacho import Soup
from requests import get
import pandas as pd

def scrape_imdb_top_100(url, timeout=30):
    """Scrape titles, release years and metascores from an IMDb list page.

    Args:
        url: Address of the IMDb search/list page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A ``pandas.DataFrame`` with the string columns ``title``, ``year``
        and ``score`` (one row per list entry), or ``None`` when the server
        answers with a status code other than 200.

    Raises:
        ValueError: If the page yields a different number of titles, years
            and metascores (e.g. an entry has no metascore badge), because
            the columns could then no longer be matched up row by row.

    Note:
        Values are returned as scraped: the header text of an entry holds
        the rank and the year in addition to the name (for example
        ``"1. The Shawshank Redemption (1994)"``), and the year keeps its
        parentheses.
    """
    # Send GET request; the timeout keeps a stalled connection from hanging.
    response = get(url, timeout=timeout)

    # Check if request was successful
    if response.status_code != 200:
        return None

    # Parse response text
    imdb = Soup(response.text)

    def _texts(tag, css_class):
        # mode="all" always yields a list. The default automatic mode returns
        # None for zero matches and a bare Soup for exactly one match, both
        # of which would break the iteration below.
        matches = imdb.find(tag, {"class": css_class}, mode="all")
        return [match.strip() for match in matches]

    # Find and clean movie titles, release years and metascores
    titles = _texts("h3", "lister-item-header")
    years = _texts("span", "lister-item-year")
    metascores = _texts("span", "metascore")

    # Fail with an explicit message instead of pandas' generic length error.
    if not (len(titles) == len(years) == len(metascores)):
        raise ValueError(
            "Scraped columns differ in length "
            f"(titles={len(titles)}, years={len(years)}, "
            f"metascores={len(metascores)}); cannot align rows."
        )

    # Create and return the dataframe
    return pd.DataFrame({
        "title": titles,
        "year": years,
        "score": metascores,
    })

# Call the function
url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"
df = scrape_imdb_top_100(url)

# scrape_imdb_top_100 returns None when the request did not succeed, so
# guard before touching DataFrame methods.
if df is None:
    print("Request failed; no data was scraped.")
else:
    # Show the first few rows of the dataframe
    print(df.head())

    # Save the dataframe to a CSV file
    df.to_csv("movies_imdb.csv")


                                           title    year score
0             1. The Shawshank Redemption (1994)  (1994)    82
1                        2. The Godfather (1972)  (1972)   100
2  3. Spider-Man: Across the Spider-Verse (2023)  (2023)    86
3                      4. The Dark Knight (2008)  (2008)    84
4                     5. Schindler's List (1993)  (1993)    95
