In [38]:
!pip3 install lxml
!pip3 install tmdbv3api
import os

import numpy as np
import pandas as pd



In [39]:
from tmdbv3api import TMDb, Movie
import requests

In [None]:
# Configure the TMDB client from the API_KEY environment variable.
# Fail fast with a clear message: without a key, the TMDB lookups issued
# later in the notebook cannot succeed.
if not os.getenv("API_KEY"):
    raise RuntimeError(
        "The API_KEY environment variable is not set; "
        "export your TMDB API key before running this notebook."
    )
tmdb = TMDb()
tmdb.api_key = os.getenv("API_KEY")
tmdb_movie = Movie()

In [41]:
def get_genre(x):
    """Look up a movie title on TMDB and return its genres as one string.

    Parameters
    ----------
    x : str
        Movie title passed to ``tmdb_movie.search``.

    Returns
    -------
    str or float
        The genre names of the first search hit joined by single spaces
        (e.g. ``"Action Comedy"``), or ``np.nan`` when there is no first hit
        to read an id from, or when the details payload carries no genres.

    Notes
    -----
    Errors raised by the search call, by ``requests.get`` or by
    ``response.json()`` are not caught here and propagate to the caller.
    """
    result = tmdb_movie.search(x)
    try:
        movie_id = result[0].id
    except (IndexError, KeyError, TypeError, AttributeError):
        # No usable first hit. NOTE(review): which exception an empty search
        # result raises presumably depends on the tmdbv3api version (plain
        # list vs. wrapper object) -- confirm against the installed version.
        return np.nan

    # Let requests build the query string instead of formatting it by hand.
    response = requests.get(
        "https://api.themoviedb.org/3/movie/{}".format(movie_id),
        params={"api_key": tmdb.api_key},
        timeout=20,
    )
    data_json = response.json()

    # .get(): a payload without a "genres" key is treated like an empty list
    # rather than raising KeyError.
    genre_entries = data_json.get("genres")
    if not genre_entries:
        return np.nan
    return " ".join(entry["name"] for entry in genre_entries)

In [None]:
def get_director(x):
    """Return the part of ``x`` that precedes its director credit marker.

    The markers ``" (director)"`` and ``" (directors)"`` are tried in that
    order; the first one found in ``x`` decides where the string is cut.
    If neither is present, ``x`` is cut at ``" (director/screenplay)"``,
    which leaves ``x`` unchanged when that marker is absent as well.
    """
    for marker in (" (director)", " (directors)"):
        if marker in x:
            return x.split(marker)[0]
    return x.split(" (director/screenplay)")[0]

In [43]:
def get_actor1(x):
    """Return the first ``", "``-separated name after the last ``"screenplay); "``.

    When ``"screenplay); "`` does not occur in ``x``, the whole string is
    treated as the name list.
    """
    _, _, cast = x.rpartition("screenplay); ")
    first_name, _, _ = cast.partition(", ")
    return first_name

In [44]:
def get_actor2(x):
    """Return the second ``", "``-separated name after the last ``"screenplay); "``.

    Falls back to ``"unknown"`` when fewer than two names are listed.
    """
    cast_members = x.rpartition("screenplay); ")[2].split(", ")
    if len(cast_members) < 2:
        return "unknown"
    return cast_members[1]

In [45]:
def get_actor3(x):
    """Return the third ``", "``-separated name after the last ``"screenplay); "``.

    Falls back to ``"unknown"`` when fewer than three names are listed.
    """
    cast_members = x.rpartition("screenplay); ")[2].split(", ")
    if len(cast_members) < 3:
        return "unknown"
    return cast_members[2]

In [None]:
def process_movies(year):
    """Scrape Wikipedia's list of American films for ``year`` and enrich it.

    The page is parsed with ``pd.read_html``; tables 2 through 5 are stacked,
    each title gets a genre string from ``get_genre``, and the
    ``"Cast and crew"`` text is split into director and actor columns.

    Parameters
    ----------
    year : int or str
        Year interpolated into the Wikipedia page URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``director_name``, ``actor_1_name``, ``actor_2_name``,
        ``actor_3_name``, ``genres``, ``movie_title`` (lower-cased) and
        ``comb`` (the actor, director and genre fields joined by spaces).
        If any ``Exception`` is raised along the way, the error is printed
        and an empty DataFrame is returned instead.
    """
    try:
        link = f"https://en.wikipedia.org/wiki/List_of_American_films_of_{year}"
        # Download and parse the page once, then pick the tables from the
        # parsed result (the page used to be fetched once per table).
        tables = pd.read_html(link, header=0)
        # NOTE(review): assumes tables 2-5 are the film listings -- confirm
        # for each year's page layout.
        df = pd.concat([tables[i] for i in range(2, 6)], ignore_index=True)

        df["genres"] = df["Title"].map(lambda x: get_genre(str(x)))
        # .copy() makes the column subset an independent frame, so the
        # assignments below do not write to a slice of the original.
        df = df[["Title", "Cast and crew", "genres"]].copy()
        df["director_name"] = df["Cast and crew"].map(lambda x: get_director(str(x)))
        df["actor_1_name"] = df["Cast and crew"].map(lambda x: get_actor1(str(x)))
        df["actor_2_name"] = df["Cast and crew"].map(lambda x: get_actor2(str(x)))
        df["actor_3_name"] = df["Cast and crew"].map(lambda x: get_actor3(str(x)))

        df = df.rename(columns={"Title": "movie_title"})
        df["actor_2_name"] = df["actor_2_name"].replace(np.nan, "unknown")
        df["actor_3_name"] = df["actor_3_name"].replace(np.nan, "unknown")
        df["movie_title"] = df["movie_title"].str.lower()
        df["comb"] = (
            df["actor_1_name"]
            + " "
            + df["actor_2_name"]
            + " "
            + df["actor_3_name"]
            + " "
            + df["director_name"]
            + " "
            + df["genres"]
        )
        return df[
            [
                "director_name",
                "actor_1_name",
                "actor_2_name",
                "actor_3_name",
                "genres",
                "movie_title",
                "comb",
            ]
        ]
    except Exception as e:
        # Best effort: report the failure and hand back an empty frame so the
        # caller can still concatenate the remaining years.
        print(f"Error processing movies for {year}: {e}")
        return pd.DataFrame()

In [47]:
# Scrape and enrich the 2018 film list (an empty DataFrame comes back if processing fails).
movies_2018 = process_movies(year=2018)

Error processing movies for 2018: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


In [48]:
# Scrape and enrich the 2019 film list (an empty DataFrame comes back if processing fails).
movies_2019 = process_movies(year=2019)

KeyboardInterrupt: 

In [None]:
# Scrape and enrich the 2021 film list (an empty DataFrame comes back if processing fails).
movies_2021 = process_movies(year=2021)

In [None]:
# Scrape and enrich the 2022 film list (an empty DataFrame comes back if processing fails).
movies_2022 = process_movies(year=2022)

In [15]:
# Scrape and enrich the 2023 film list (an empty DataFrame comes back if processing fails).
movies_2023 = process_movies(year=2023)

In [16]:
# Scrape and enrich the 2024 film list (an empty DataFrame comes back if processing fails).
movies_2024 = process_movies(year=2024)

In [17]:
# Stack the per-year frames into one table, renumbering the rows from 0.
combined_movies = pd.concat(
    objs=(
        movies_2018,
        movies_2019,
        movies_2021,
        movies_2022,
        movies_2023,
        movies_2024,
    ),
    ignore_index=True,
)

In [19]:
# Read the previously saved dataset and append the newly scraped rows to it.
old_df = pd.read_csv(filepath_or_buffer="./datasets/new_data.csv")
final_df = pd.concat(objs=(old_df, combined_movies), ignore_index=True)

In [20]:
# Keep only the rows in which every column holds a value.
final_df = final_df.dropna(axis="index", how="any")

In [None]:
# Write the cleaned dataset to disk without the row index column.
final_df.to_csv(path_or_buf="./datasets/final_data.csv", index=False)

In [None]:
# Report the number of missing values remaining in each column.
print(final_df.isnull().sum())