In [1]:
import os
import re
import numpy as np
import pandas as pd

# Show up to 200 columns and use a 160-character line width when rendering frames.
pd.options.display.max_columns = 200
pd.options.display.width = 160

In [None]:
# Input locations; both are relative paths, so they resolve against the working directory.
ML_DIR = "../Data/ml-32m"  # folder containing links.csv, movies.csv, tags.csv, ratings.csv
TMDB_CSV = "../Data/TMDB_movie_dataset_v11.csv"  # full Kaggle TMDB dataset

# Destination folder for the CSVs written further down; created up front, and
# exist_ok=True keeps re-running this cell harmless.
OUT_DIR = "../Data/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

In [3]:
# Helper functions 

def imdb_numeric_to_tt(x):
    """Convert a MovieLens numeric imdbId into IMDb's 'tt#######' form.

    Parameters
    ----------
    x : int, float, str or missing
        The raw id. Integral floats (e.g. 114709.0, which is what pandas
        produces when the column contains NaN) and strings ending in ".0"
        are treated as the integer they represent.

    Returns
    -------
    str or float
        "tt" followed by the digits, left-padded with zeros to at least
        7 characters. np.nan when the input is missing, contains no digits,
        or is a non-integral / infinite float.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (float, np.floating)):
        # str(114709.0) is "114709.0"; stripping non-digits from that would
        # silently append a bogus trailing zero to the id.
        if not float(x).is_integer():
            return np.nan
        x = int(x)
    # Same hazard when the float was already stringified upstream.
    text = re.sub(r"\.0+$", "", str(x).strip())
    digits = re.sub(r"[^0-9]", "", text)
    if not digits:
        return np.nan
    return "tt" + digits.zfill(7)

def extract_year_from_title(title):
    """Return the trailing '(YYYY)' of a MovieLens title as an int, or NaN if absent."""
    if pd.isna(title):
        return np.nan
    # Only a four-digit parenthesised group at the very end of the title counts.
    year_match = re.search(r"\((\d{4})\)\s*$", str(title))
    if year_match is None:
        return np.nan
    return int(year_match.group(1))

def strip_year_from_title(title):
    """Return the title without its trailing '(YYYY)' suffix; missing titles become ''."""
    if pd.isna(title):
        return ""
    trailing_year = re.compile(r"\s*\(\d{4}\)\s*$")
    without_year = trailing_year.sub("", str(title))
    return without_year.strip()

def clean_cols(df):
    """Strip BOM characters and surrounding whitespace from column names.

    The frame is modified in place and also returned, so the call can be
    used either as a statement or in an assignment.
    """
    names = df.columns.astype(str)
    names = names.str.replace("\ufeff", "", regex=False)
    df.columns = names.str.strip()
    return df

In [4]:
# Load TMDB dataset

# Read the full export in one pass (low_memory=False) and normalise the header names.
tmdb = clean_cols(pd.read_csv(TMDB_CSV, low_memory=False))

print("TMDB shape:", tmdb.shape)
print("TMDB columns:", list(tmdb.columns))
tmdb.head(3)

TMDB shape: (1368726, 24)
TMDB columns: ['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."


In [5]:
# Build TMDB JOIN INDEX

# Slim lookup table holding only the columns used to map MovieLens rows onto TMDB rows.
tmdb_join_index = tmdb[["id", "imdb_id", "title", "original_language", "release_date", "adult"]].copy()

# Ids that cannot be parsed become NaN instead of raising.
tmdb_join_index["id"] = pd.to_numeric(tmdb_join_index["id"], errors="coerce")
# Stringify IMDb ids while keeping missing ones as real NaN (astype(str) alone would give "nan").
tmdb_join_index["imdb_id"] = tmdb_join_index["imdb_id"].astype(str).where(tmdb_join_index["imdb_id"].notna(), np.nan)

# Only a case-insensitive "true" marks a title as adult; anything else (including missing
# values) is False. Comparing directly yields a real bool column, whereas
# .map({...}).fillna(False) goes through object dtype, which triggers pandas'
# downcasting FutureWarning and leaves the final dtype version-dependent.
tmdb_join_index["adult"] = tmdb_join_index["adult"].astype(str).str.lower().eq("true")

# One row per TMDB id; the first occurrence wins.
tmdb_join_index = tmdb_join_index.drop_duplicates(subset=["id"])

tmdb_join_index.head(3)

Unnamed: 0,id,imdb_id,title,original_language,release_date,adult
0,27205,tt1375666,Inception,en,2010-07-15,False
1,157336,tt0816692,Interstellar,en,2014-11-05,False
2,155,tt0468569,The Dark Knight,en,2008-07-16,False


In [None]:
# Build TMDB SEMANTIC CATALOG (MovieDoc for all movies)

# Work on a copy so the raw `tmdb` frame is left untouched.
tmdb_sem = tmdb.copy()
tmdb_sem = clean_cols(tmdb_sem)

tmdb_sem["id"] = pd.to_numeric(tmdb_sem["id"], errors="coerce")
# Only a case-insensitive "true" marks a title as adult; anything else (including missing
# values) is False. Comparing directly yields a real bool column, whereas
# .map({...}).fillna(False) goes through object dtype, which triggers pandas'
# downcasting FutureWarning and leaves the final dtype version-dependent.
tmdb_sem["adult"] = tmdb_sem["adult"].astype(str).str.lower().eq("true")

# parse year from release_date (unparseable dates become NaT, hence a missing year)
tmdb_sem["release_date_parsed"] = pd.to_datetime(tmdb_sem["release_date"], errors="coerce")
tmdb_sem["year"] = tmdb_sem["release_date_parsed"].dt.year

# Text fields: missing -> "", collapse every whitespace run to one space, trim the ends.
for col in ["title", "original_title", "overview", "tagline", "genres", "keywords", "spoken_languages", "original_language"]:
    if col in tmdb_sem.columns:
        tmdb_sem[col] = tmdb_sem[col].fillna("").astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

# Build MovieDoc: one labelled line per field, joined with newlines.
# The year is held as a float, so the trailing ".0" is stripped after stringifying.
tmdb_sem["movieDoc"] = (
    "Title: " + tmdb_sem["title"].astype(str) + "\n" +
    "Year: " + tmdb_sem["year"].fillna("").astype(str).str.replace(r"\.0$", "", regex=True) + "\n" +
    "Original language: " + tmdb_sem["original_language"].astype(str) + "\n" +
    "Vote average: " + tmdb_sem["vote_average"].fillna("").astype(str) + "\n" +
    "Vote count: " + tmdb_sem["vote_count"].fillna("").astype(str) + "\n" +
    "Popularity: " + tmdb_sem["popularity"].fillna("").astype(str) + "\n" +
    "Genres: " + tmdb_sem["genres"].astype(str) + "\n" +
    "Keywords: " + tmdb_sem["keywords"].astype(str) + "\n" +
    "Tagline: " + tmdb_sem["tagline"].astype(str) + "\n" +
    "Plot: " + tmdb_sem["overview"].astype(str)
)

# The parsed date was only a stepping stone to `year`; keep one row per TMDB id (first wins).
tmdb_sem = tmdb_sem.drop(columns=["release_date_parsed"]).drop_duplicates(subset=["id"])

tmdb_sem_path = os.path.join(OUT_DIR, "tmdb_semantic_catalog_alllangs.csv")
tmdb_sem.to_csv(tmdb_sem_path, index=False)

print("Saved:", tmdb_sem_path, "rows:", len(tmdb_sem))
print("Columns:", len(tmdb_sem.columns))
tmdb_sem.head(2)

Saved: Data/outputs/tmdb_semantic_catalog_alllangs.csv rows: 1367545
Columns: 26


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,year,movieDoc
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",2010.0,Title: Inception\nYear: 2010\nOriginal languag...
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",2014.0,Title: Interstellar\nYear: 2014\nOriginal lang...


In [7]:
# List the catalog's column names (the build cell adds `year` and `movieDoc` to the TMDB columns).
print(tmdb_sem.columns.tolist())

['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'year', 'movieDoc']


In [None]:
# Print the movieDoc of the first catalog row in full as a sanity check of the template.
print(tmdb_sem["movieDoc"].iloc[0])

Title: Inception
Year: 2010
Original language: en
Vote average: 8.364
Vote count: 34495
Popularity: 83.952
Genres: Action, Science Fiction, Adventure
Keywords: rescue, mission, dream, airplane, paris, france, virtual reality, kidnapping, philosophy, spy, allegory, manipulation, car crash, heist, memory, architecture, los angeles, california, dream world, subconscious
Tagline: Your mind is the scene of the crime.
Plot: Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious.


In [9]:
# Load MovieLens (links/movies/tags)

# The three tables sit side by side in ML_DIR; read them in a fixed order.
links, movies, tags = (
    pd.read_csv(os.path.join(ML_DIR, filename))
    for filename in ("links.csv", "movies.csv", "tags.csv")
)

print("links:", links.shape, "movies:", movies.shape, "tags:", tags.shape)
links.head()

links: (87585, 3) movies: (87585, 3) tags: (2000072, 4)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [10]:
# Prepare MovieLens fields

# links: numeric TMDB id (unparseable -> NaN) plus the IMDb id in 'tt' form.
links["tmdbId"] = pd.to_numeric(links["tmdbId"], errors="coerce")
links["imdb_tt"] = links["imdbId"].map(imdb_numeric_to_tt)

# movies: split "Title (YYYY)" into year and bare title, and turn the
# pipe-separated genre string into a list.
movies["year"] = movies["title"].map(extract_year_from_title)
movies["clean_title"] = movies["title"].map(strip_year_from_title)
movies["genres_list"] = (
    movies["genres"]
    .fillna("(no genres listed)")
    .map(lambda genre_str: str(genre_str).split("|"))
)

movies.head()

Unnamed: 0,movieId,title,genres,year,clean_title,genres_list
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,Jumanji,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,Grumpier Old Men,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,1995.0,Father of the Bride Part II,[Comedy]


In [11]:
# Aggregate MovieLens tags per movie (Top 20)

# Count how often each distinct tag was applied to each movie, then order the
# rows by movie and, within a movie, by descending count.
pair_sizes = tags.groupby(["movieId", "tag"]).size()
tag_counts = pair_sizes.reset_index(name="count").sort_values(
    by=["movieId", "count"], ascending=[True, False]
)

# Keep at most 20 rows per movie (already ordered by count) and collapse them into one list.
top20_rows = tag_counts.groupby("movieId").head(20)
top_tags = (
    top20_rows.groupby("movieId")["tag"]
              .apply(list)
              .reset_index()
              .rename(columns={"tag": "tags_agg"})
)

print("top_tags:", top_tags.shape)
top_tags.head()

top_tags: (51323, 2)


Unnamed: 0,movieId,tags_agg
0,1,"[Pixar, animation, Disney, funny, Tom Hanks, p..."
1,2,"[Robin Williams, fantasy, time travel, board g..."
2,3,"[CLV, Funniest Movies, Jack Lemmon, Minnesota,..."
3,4,"[chick flick, characters, CLV, based on novel ..."
4,5,"[Steve Martin, family, pregnancy, Diane Keaton..."


In [12]:
# Merge MovieLens and TMDB (JOIN INDEX) with fallback IMDb

# Primary join on tmdbId -> id
# pandas matches null join keys to each other, so a links row without a tmdbId would
# otherwise inherit the metadata of any TMDB row whose id failed numeric parsing.
# Stripping null ids from the right-hand side rules that out.
ml_map = links.merge(
    tmdb_join_index.dropna(subset=["id"]),
    left_on="tmdbId",
    right_on="id",
    how="left"
)

missing = ml_map["id"].isna()
print("Missing after tmdbId join:", missing.sum())

# Fallback join on imdb_tt -> imdb_id
if missing.any():
    # Candidates need both keys: an imdb_id to match on and a TMDB id to hand back.
    # One row per imdb_id keeps the fallback merge strictly one-to-one per movie.
    tmdb_by_imdb = (
        tmdb_join_index.dropna(subset=["id", "imdb_id"])
                       .drop_duplicates(subset=["imdb_id"])
    )
    fb = ml_map.loc[missing, ["movieId", "imdb_tt"]].merge(
        tmdb_by_imdb,
        left_on="imdb_tt",
        right_on="imdb_id",
        how="left"
    ).set_index("movieId")

    # Align on movieId and fill only the cells the primary join left empty.
    ml_map = ml_map.set_index("movieId")
    for col in ["id","imdb_id","title","original_language","release_date","adult"]:
        ml_map.loc[fb.index, col] = ml_map.loc[fb.index, col].fillna(fb[col])
    ml_map = ml_map.reset_index()

print("Missing after imdb fallback:", ml_map["id"].isna().sum())
ml_map.head()

Missing after tmdbId join: 1092
Missing after imdb fallback: 568


Unnamed: 0,movieId,imdbId,tmdbId,imdb_tt,id,imdb_id,title,original_language,release_date,adult
0,1,114709,862.0,tt0114709,862.0,tt0114709,Toy Story,en,1995-10-30,False
1,2,113497,8844.0,tt0113497,8844.0,tt0113497,Jumanji,en,1995-12-15,False
2,3,113228,15602.0,tt0113228,15602.0,tt0113228,Grumpier Old Men,en,1995-12-22,False
3,4,114885,31357.0,tt0114885,31357.0,tt0114885,Waiting to Exhale,en,1995-12-22,False
4,5,113041,11862.0,tt0113041,11862.0,tt0113041,Father of the Bride Part II,en,1995-12-08,False


In [13]:
# Attach TMDB Semantic Catalog fields (movieDoc, overview, etc.)

tmdb_sem_small = tmdb_sem[["id", "movieDoc", "overview", "genres", "keywords", "popularity", "vote_average", "vote_count"]].copy()
tmdb_sem_small["id"] = pd.to_numeric(tmdb_sem_small["id"], errors="coerce")
# MovieLens rows that could not be mapped carry a NaN id, and pandas matches null
# join keys to each other. Drop null-id catalog rows so an unmapped movie can never
# pick up another title's movieDoc.
tmdb_sem_small = tmdb_sem_small.dropna(subset=["id"])

ml_enriched = ml_map.merge(
    tmdb_sem_small,
    on="id",
    how="left",
    suffixes=("", "_sem")
)

print("Rows with mapping (have TMDB id):", ml_enriched["id"].notna().sum())
print("Rows with movieDoc available:", ml_enriched["movieDoc"].notna().sum())
ml_enriched.head()

Rows with mapping (have TMDB id): 87017
Rows with movieDoc available: 87017


Unnamed: 0,movieId,imdbId,tmdbId,imdb_tt,id,imdb_id,title,original_language,release_date,adult,movieDoc,overview,genres,keywords,popularity,vote_average,vote_count
0,1,114709,862.0,tt0114709,862.0,tt0114709,Toy Story,en,1995-10-30,False,Title: Toy Story\nYear: 1995\nOriginal languag...,"Led by Woody, Andy's toys live happily in his ...","Animation, Adventure, Family, Comedy","rescue, friendship, mission, martial arts, jea...",78.404,7.971,17152.0
1,2,113497,8844.0,tt0113497,8844.0,tt0113497,Jumanji,en,1995-12-15,False,Title: Jumanji\nYear: 1995\nOriginal language:...,When siblings Judy and Peter discover an encha...,"Adventure, Fantasy, Family","giant insect, board game, disappearance, jungl...",13.444,7.239,9833.0
2,3,113228,15602.0,tt0113228,15602.0,tt0113228,Grumpier Old Men,en,1995-12-22,False,Title: Grumpier Old Men\nYear: 1995\nOriginal ...,A family wedding reignites the ancient feud be...,"Romance, Comedy","fishing, sequel, old man, best friend, wedding...",14.815,6.476,347.0
3,4,114885,31357.0,tt0114885,31357.0,tt0114885,Waiting to Exhale,en,1995-12-22,False,Title: Waiting to Exhale\nYear: 1995\nOriginal...,"Cheated on, mistreated and stepped on, the wom...","Comedy, Drama, Romance","based on novel or book, interracial relationsh...",14.451,6.183,142.0
4,5,113041,11862.0,tt0113041,11862.0,tt0113041,Father of the Bride Part II,en,1995-12-08,False,Title: Father of the Bride Part II\nYear: 1995...,Just when George Banks has recovered from his ...,"Comedy, Family","daughter, baby, parent child relationship, mid...",14.537,6.228,659.0


In [14]:
# Add MovieLens movie metadata, tags, build movieDoc_full

# This cell reassigns ml_enriched from itself. Dropping the columns it adds before
# merging keeps it re-runnable: without this, a second run would produce
# clean_title_x / clean_title_y style duplicates and break the cells below.
# errors="ignore" makes the drop a no-op on the first run.
ml_enriched = ml_enriched.drop(
    columns=["clean_title", "year", "genres_list", "tags_agg"],
    errors="ignore"
).merge(
    movies[["movieId", "clean_title", "year", "genres_list"]],
    on="movieId",
    how="left"
).merge(
    top_tags,
    on="movieId",
    how="left"
)
def add_movielens_tags_to_doc(row, max_tags=10):
    """Append a 'User tags:' line to a row's movieDoc.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Read through ``row.get``: "movieDoc" (text or missing) and
        "tags_agg" (list of tags or missing).
    max_tags : int, default 10
        Maximum number of leading tags to include.

    Returns
    -------
    The movieDoc value unchanged when there are no tags (this may be a
    missing value); otherwise a string ending in "User tags: t1, t2, ...".
    When the row has tags but no usable movieDoc text, the result is just
    the tag line, with no leading newline.
    """
    base = row.get("movieDoc", "")
    tags_list = row.get("tags_agg", [])
    # A missing cell arrives as NaN rather than a list; treat it like "no tags".
    if not isinstance(tags_list, list) or not tags_list:
        return base
    # str() guards against tags that were parsed as numbers (e.g. 2001).
    tag_line = "User tags: " + ", ".join(str(tag) for tag in tags_list[:max_tags])
    if isinstance(base, str) and base:
        return base + "\n" + tag_line
    return tag_line

# Row-wise pass: combine each row's movieDoc with its aggregated user tags.
ml_enriched["movieDoc_full"] = ml_enriched.apply(add_movielens_tags_to_doc, axis=1)

# Preview the identifier columns from both sources side by side.
ml_enriched.head(10)[
    ["movieId", "tmdbId", "id", "imdb_tt", "imdb_id", "clean_title", "original_language", "adult"]
]

Unnamed: 0,movieId,tmdbId,id,imdb_tt,imdb_id,clean_title,original_language,adult
0,1,862.0,862.0,tt0114709,tt0114709,Toy Story,en,False
1,2,8844.0,8844.0,tt0113497,tt0113497,Jumanji,en,False
2,3,15602.0,15602.0,tt0113228,tt0113228,Grumpier Old Men,en,False
3,4,31357.0,31357.0,tt0114885,tt0114885,Waiting to Exhale,en,False
4,5,11862.0,11862.0,tt0113041,tt0113041,Father of the Bride Part II,en,False
5,6,949.0,949.0,tt0113277,tt0113277,Heat,en,False
6,7,11860.0,11860.0,tt0114319,tt0114319,Sabrina,en,False
7,8,45325.0,45325.0,tt0112302,tt0112302,Tom and Huck,en,False
8,9,9091.0,9091.0,tt0114576,tt0114576,Sudden Death,en,False
9,10,710.0,710.0,tt0113189,tt0113189,GoldenEye,en,False


In [15]:
# Persist the merged table, then report how much of it is populated.
movielens_tmdb_out = os.path.join(OUT_DIR, "movielens_tmdb_merged.csv")
ml_enriched.to_csv(movielens_tmdb_out, index=False)

print("Saved:", movielens_tmdb_out)
for summary_label, summary_value in (
    ("Final merged rows:", len(ml_enriched)),
    ("Mapped rows:", ml_enriched["id"].notna().sum()),
    ("MovieDoc_full available:", ml_enriched["movieDoc_full"].notna().sum()),
):
    print(summary_label, summary_value)

Saved: Data/outputs/movielens_tmdb_merged.csv
Final merged rows: 87585
Mapped rows: 87017
MovieDoc_full available: 87281


In [16]:
# Preview the first rows of the final merged frame (all columns).
ml_enriched.head()

Unnamed: 0,movieId,imdbId,tmdbId,imdb_tt,id,imdb_id,title,original_language,release_date,adult,movieDoc,overview,genres,keywords,popularity,vote_average,vote_count,clean_title,year,genres_list,tags_agg,movieDoc_full
0,1,114709,862.0,tt0114709,862.0,tt0114709,Toy Story,en,1995-10-30,False,Title: Toy Story\nYear: 1995\nOriginal languag...,"Led by Woody, Andy's toys live happily in his ...","Animation, Adventure, Family, Comedy","rescue, friendship, mission, martial arts, jea...",78.404,7.971,17152.0,Toy Story,1995.0,"[Adventure, Animation, Children, Comedy, Fantasy]","[Pixar, animation, Disney, funny, Tom Hanks, p...",Title: Toy Story\nYear: 1995\nOriginal languag...
1,2,113497,8844.0,tt0113497,8844.0,tt0113497,Jumanji,en,1995-12-15,False,Title: Jumanji\nYear: 1995\nOriginal language:...,When siblings Judy and Peter discover an encha...,"Adventure, Fantasy, Family","giant insect, board game, disappearance, jungl...",13.444,7.239,9833.0,Jumanji,1995.0,"[Adventure, Children, Fantasy]","[Robin Williams, fantasy, time travel, board g...",Title: Jumanji\nYear: 1995\nOriginal language:...
2,3,113228,15602.0,tt0113228,15602.0,tt0113228,Grumpier Old Men,en,1995-12-22,False,Title: Grumpier Old Men\nYear: 1995\nOriginal ...,A family wedding reignites the ancient feud be...,"Romance, Comedy","fishing, sequel, old man, best friend, wedding...",14.815,6.476,347.0,Grumpier Old Men,1995.0,"[Comedy, Romance]","[CLV, Funniest Movies, Jack Lemmon, Minnesota,...",Title: Grumpier Old Men\nYear: 1995\nOriginal ...
3,4,114885,31357.0,tt0114885,31357.0,tt0114885,Waiting to Exhale,en,1995-12-22,False,Title: Waiting to Exhale\nYear: 1995\nOriginal...,"Cheated on, mistreated and stepped on, the wom...","Comedy, Drama, Romance","based on novel or book, interracial relationsh...",14.451,6.183,142.0,Waiting to Exhale,1995.0,"[Comedy, Drama, Romance]","[chick flick, characters, CLV, based on novel ...",Title: Waiting to Exhale\nYear: 1995\nOriginal...
4,5,113041,11862.0,tt0113041,11862.0,tt0113041,Father of the Bride Part II,en,1995-12-08,False,Title: Father of the Bride Part II\nYear: 1995...,Just when George Banks has recovered from his ...,"Comedy, Family","daughter, baby, parent child relationship, mid...",14.537,6.228,659.0,Father of the Bride Part II,1995.0,[Comedy],"[Steve Martin, family, pregnancy, Diane Keaton...",Title: Father of the Bride Part II\nYear: 1995...


In [17]:
# Column inventory of the frame that was written to disk.
print("Columns in final enriched dataset:")
print(list(ml_enriched.columns))

Columns in final enriched dataset:
['movieId', 'imdbId', 'tmdbId', 'imdb_tt', 'id', 'imdb_id', 'title', 'original_language', 'release_date', 'adult', 'movieDoc', 'overview', 'genres', 'keywords', 'popularity', 'vote_average', 'vote_count', 'clean_title', 'year', 'genres_list', 'tags_agg', 'movieDoc_full']


In [18]:
# Print ten movieDoc_full documents, drawn with a fixed seed so the same ones appear on every run.

print("\nSample movieDoc_full:\n")
sample = ml_enriched["movieDoc_full"].dropna().sample(n=10, random_state=42)
for i, doc in enumerate(sample, start=1):
    # Header, document and one blank separator line per sample.
    print(f"--- MovieDoc_full Sample {i} ---", doc, "", sep="\n")



Sample movieDoc_full:

--- MovieDoc_full Sample 1 ---
Title: The Goddess of Spring
Year: 1934
Original language: en
Vote average: 6.065
Vote count: 46
Popularity: 3.424
Genres: Animation
Keywords: cartoon, short film
Tagline: 
Plot: The goddess is greeted by dancing flowers and fairies. The devil comes and takes her away to be his queen. She's despondent, as winter settles in above ground. But the devil isn't happy either, and offers anything to make her happy. They reach an agreement: she'll spend six months above ground and six below. Thus we have seasons.
User tags: short, silly symphony

--- MovieDoc_full Sample 2 ---
Title: Daffodils
Year: 2019
Original language: en
Vote average: 5.4
Vote count: 14
Popularity: 2.394
Genres: Drama, Romance, Music
Keywords: 
Tagline: A love story unlike any you've heard before.
Plot: A bittersweet love story told through enchanting re-imaginings of popular and iconic New Zealand songs.

--- MovieDoc_full Sample 3 ---
Title: Judy
Year: 2019
Original

In [19]:
# Spot check: full document of the first row whose TMDB title is exactly "Inception".

movieDocfull = ml_enriched.loc[ml_enriched["title"].eq("Inception")]
print("\nMovieDoc_full for Inception:\n")
print(movieDocfull["movieDoc_full"].iloc[0])


MovieDoc_full for Inception:

Title: Inception
Year: 2010
Original language: en
Vote average: 8.364
Vote count: 34495
Popularity: 83.952
Genres: Action, Science Fiction, Adventure
Keywords: rescue, mission, dream, airplane, paris, france, virtual reality, kidnapping, philosophy, spy, allegory, manipulation, car crash, heist, memory, architecture, los angeles, california, dream world, subconscious
Tagline: Your mind is the scene of the crime.
Plot: Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious.
User tags: alternate reality, thought-provoking, visually appealing, mindfuck, surreal, Leonardo DiCaprio, intellectual, sci-fi, complicated, twist ending
