In [2]:
import pandas as pd

In [3]:
# Load the movie table and print its column summary (non-null counts and dtypes).
df_movies = pd.read_csv(filepath_or_buffer="Movies_all.csv")
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86493 entries, 0 to 86492
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    86493 non-null  int64  
 1   title                 86493 non-null  object 
 2   vote_average          86493 non-null  float64
 3   vote_count            86493 non-null  int64  
 4   status                86493 non-null  object 
 5   release_date          86427 non-null  object 
 6   revenue               86493 non-null  int64  
 7   runtime               86493 non-null  int64  
 8   adult                 86493 non-null  bool   
 9   backdrop_path         70442 non-null  object 
 10  budget                86493 non-null  int64  
 11  homepage              17774 non-null  object 
 12  imdb_id               86467 non-null  object 
 13  original_language     86493 non-null  object 
 14  original_title        86493 non-null  object 
 15  overview           

In [4]:
# Tab-separated crew file; the literal "\N" is read as a missing value.
# Only the title id and its comma-separated director ids are loaded.
df_crew = pd.read_csv("Crew_dataset.gz", sep="\t", usecols=["tconst", "directors"], na_values="\\N")

In [5]:
# Person lookup table (nconst -> primaryName); tab-separated, "\N" read as missing.
df_name = pd.read_csv("Basic_names_crew_dataset.gz", sep="\t", usecols=["nconst", "primaryName"], na_values="\\N")

In [6]:
# Per-title credits; tab-separated, "\N" read as missing.
# Loaded columns: title id, credit position, person id, and credit category.
principal_columns = ["tconst", "ordering", "nconst", "category"]
df_principals = pd.read_csv("title.principals.tsv.gz", sep="\t", usecols=principal_columns, na_values="\\N")

In [7]:
# Build one row per (title, director id) and attach the director's name.
# df_crew itself is deliberately left untouched so this cell can be re-run:
# calling .str.split on a column that already holds lists returns NaN, which
# would silently empty the director table on a second execution.
df_directors = (
    df_crew
    .dropna(subset=["directors"])
    # "directors" holds comma-separated person ids; split them into lists.
    .assign(nconst=lambda crew: crew["directors"].str.split(","))
    .drop(columns="directors")
    # One row per director id.
    .explode("nconst")
    # Inner join: director ids with no row in df_name are discarded.
    .merge(df_name, on="nconst", how="inner")
)

# Combine the directors of each title into a single ", "-separated string of
# distinct, non-null names (order of first appearance is kept).
df_directors_grouped = (
    df_directors
    .groupby("tconst", as_index=False)
    .agg(Directors=("primaryName", lambda names: ", ".join(names.dropna().unique())))
)

In [8]:
# Cast = credits whose category is "actor" or "actress", joined to the person
# table to obtain names. The inner join drops credits whose nconst has no
# match in df_name.
df_cast = (
    df_principals
    .loc[lambda principals: principals["category"].isin(["actor", "actress"])]
    .copy()
    .merge(df_name, how="inner", on="nconst")
)

# One row per title: distinct, non-null cast names joined with ", ".
df_cast_grouped = df_cast.groupby("tconst", as_index=False).agg(
    Cast=("primaryName", lambda names: ", ".join(names.dropna().unique()))
)

In [9]:
# Make "ordering" numeric (unparseable values become NaN) and order the cast
# rows by title, then by credit position.
df_cast = (
    df_cast
    .assign(ordering=pd.to_numeric(df_cast["ordering"], errors="coerce"))
    .sort_values(by=["tconst", "ordering"])
)

# Keep the first three cast rows of every title; these are treated as the
# film's stars, with the first-listed performer leading.
df_top3 = df_cast.groupby("tconst").head(3).copy()

# One row per title: distinct, non-null star names joined with ", ".
df_star_grouped = df_top3.groupby("tconst", as_index=False).agg(
    StarActors=("primaryName", lambda names: ", ".join(names.dropna().unique()))
)

In [10]:
# Combine directors, full cast and star actors into one row per title.
# Outer joins keep a title that appears in any of the three tables; the missing
# columns are simply NaN. With inner joins a title that has directors but no
# actor/actress credits (or the reverse) would be removed here and lose the
# information it does have once this table is joined onto the movie list.
df_dir_cast = df_directors_grouped.merge(df_cast_grouped, on="tconst", how="outer")
df_dir_cast_star = df_dir_cast.merge(df_star_grouped, on="tconst", how="outer")

In [11]:
# Attach the people columns to the movie table. Left join: every row of
# df_movies is kept, and movies whose imdb_id has no matching tconst get NaN
# in the joined columns.
df_final = pd.merge(
    df_movies,
    df_dir_cast_star,
    how="left",
    left_on="imdb_id",
    right_on="tconst",
)

In [12]:
# Remove the join-key / helper columns from the final table.
# errors="ignore" skips any listed column that is not present, so no
# per-column membership check is needed.
columns_to_drop = ["tconst", "nconst", "category", "ordering"]
df_final = df_final.drop(columns=columns_to_drop, errors="ignore")

In [13]:
# Write the enriched table to disk (without the index column), then end the
# cell with head() so the preview is actually rendered: a notebook only
# displays the value of a cell's last expression.
df_final.to_csv("Movies_final.csv", index=False)
df_final.head()
