In [21]:
import pandas as pd

In [22]:
# Load the movie table from CSV and print its column/dtype/null-count summary.
df_movies = pd.read_csv(filepath_or_buffer="Movies_all_ML.csv")
df_movies.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               87585 non-null  int64  
 1   imdbId                87585 non-null  object 
 2   vote_average          86493 non-null  float64
 3   vote_count            86493 non-null  float64
 4   status                86493 non-null  object 
 5   release_date          87455 non-null  object 
 6   revenue               86493 non-null  float64
 7   runtime               86493 non-null  float64
 8   adult                 86493 non-null  object 
 9   backdrop_path         70442 non-null  object 
 10  budget                86493 non-null  float64
 11  homepage              17774 non-null  object 
 12  original_language     86493 non-null  object 
 13  original_title        86493 non-null  object 
 14  overview              85667 non-null  object 
 15  popularity         

In [23]:
# Read the tab-separated crew file, keeping only the title id and the
# directors field; the literal token \N is parsed as a missing value.
df_crew = pd.read_csv(
    "Crew_dataset.gz",
    usecols=["tconst", "directors"],
    na_values="\\N",
    sep="\t",
)

In [24]:
# Read the tab-separated names file, keeping only the person id and the
# primary name; the literal token \N is parsed as a missing value.
df_name = pd.read_csv(
    "Basic_names_crew_dataset.gz",
    usecols=["nconst", "primaryName"],
    na_values="\\N",
    sep="\t",
)

In [25]:
# Read the tab-separated principals file, restricted to the four columns used
# below (title id, credit ordering, person id, credit category); the literal
# token \N is parsed as a missing value.
df_principals = pd.read_csv(
    "title.principals.tsv.gz",
    usecols=["tconst", "ordering", "nconst", "category"],
    na_values="\\N",
    sep="\t",
)

In [26]:
# Build one row per (title, director) and attach the director's name.
# The chain derives everything from `df_crew` without reassigning it, so the
# cell can be re-run safely: the previous version replaced the comma-separated
# `directors` strings in `df_crew` with lists, and a second run would then
# call `.str.split` on lists instead of strings.
df_directors = (
    df_crew
    .dropna(subset=["directors"])                      # titles without directors are skipped
    .assign(directors=lambda crew: crew["directors"].str.split(","))
    .explode("directors")                              # one row per director id
    .rename(columns={"directors": "nconst"})
    .merge(df_name, on="nconst", how="inner")          # keep only ids that have a name row
)

# Collapse back to one row per title: distinct, non-null director names joined
# with ", " in order of first appearance.
df_directors_grouped = (
    df_directors
    .groupby("tconst", as_index=False)
    .agg({"primaryName": lambda x: ", ".join(x.dropna().unique())})
    .rename(columns={"primaryName": "Directors"})
)

In [27]:
# Select the actor and actress credits and attach each person's name
# (inner join: credits whose person id has no name row are dropped).
df_cast = (
    df_principals
    .loc[lambda credits: credits["category"].isin(["actor", "actress"])]
    .merge(df_name, how="inner", on="nconst")
)

# One row per title: distinct, non-null cast names joined with ", ".
df_cast_grouped = df_cast.groupby("tconst", as_index=False).agg(
    Cast=("primaryName", lambda names: ", ".join(names.dropna().unique()))
)

In [28]:
# Make `ordering` numeric (values that cannot be parsed become NaN) and sort
# the cast rows by title, then by that ordering.
df_cast = (
    df_cast
    .assign(ordering=pd.to_numeric(df_cast["ordering"], errors="coerce"))
    .sort_values(by=["tconst", "ordering"])
)

# Top 3 actors or actresses: the first three rows of each title after sorting.
df_top3 = df_cast.groupby("tconst").head(3).copy()

# One row per title: distinct, non-null names of those rows joined with ", ".
df_star_grouped = df_top3.groupby("tconst", as_index=False).agg(
    StarActors=("primaryName", lambda names: ", ".join(names.dropna().unique()))
)

In [29]:
# Combine the per-title credit tables into one frame keyed by `tconst`.
# An outer join keeps titles that appear in only one of the two tables; with
# the previous inner join, a title that had directors but no actor/actress
# rows was dropped here and its Directors value was lost.
# Missing sides are left as NaN.
df_dir_cast = df_directors_grouped.merge(df_cast_grouped, on="tconst", how="outer")

# StarActors is derived from the same cast rows as Cast, so a left join simply
# attaches it where cast data exists and keeps the director-only titles.
df_dir_cast_star = df_dir_cast.merge(df_star_grouped, on="tconst", how="left")

In [30]:
# Attach the credit columns to the movie table by matching `imdbId` against
# `tconst`. Left join: every movie row is kept, and movies without a match get
# NaN in the credit columns.
df_final = pd.merge(
    df_movies,
    df_dir_cast_star,
    how="left",
    left_on="imdbId",
    right_on="tconst",
)

In [31]:
# Remove the join-helper columns from the final table. `errors="ignore"` skips
# any listed column that is not present, replacing the manual membership check
# and per-column in-place drop.
columns_to_drop = ["tconst", "nconst", "category", "ordering"]
df_final = df_final.drop(columns=columns_to_drop, errors="ignore")

In [32]:
# Write the enriched table to CSV without the index column, then preview it.
# The preview must be the last expression in the cell to be displayed; placed
# before `to_csv`, its result was discarded and the cell showed nothing.
df_final.to_csv("Movies_final_ML.csv", index=False)
df_final.head()


In [33]:
# Print the column / non-null count / dtype summary of the merged table so it
# can be compared with the summary of the input movie table.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               87585 non-null  int64  
 1   imdbId                87585 non-null  object 
 2   vote_average          86493 non-null  float64
 3   vote_count            86493 non-null  float64
 4   status                86493 non-null  object 
 5   release_date          87455 non-null  object 
 6   revenue               86493 non-null  float64
 7   runtime               86493 non-null  float64
 8   adult                 86493 non-null  object 
 9   backdrop_path         70442 non-null  object 
 10  budget                86493 non-null  float64
 11  homepage              17774 non-null  object 
 12  original_language     86493 non-null  object 
 13  original_title        86493 non-null  object 
 14  overview              85667 non-null  object 
 15  popularity         