In [42]:
import pandas as pd
import ast

In [44]:
ml = pd.read_csv("../data/processed/movies.csv")
print(len(ml))
ml['CleanTitle'] = ml["Title"].str.extract(r"^(.*)\s\(\d{4}\)", expand=False).str.strip()
ml.dropna(subset=['CleanTitle'], inplace=True)
ml['Year'] = ml['Title'].str.extract(r"\((\d{4})\)", expand=False)
ml['Genres'] = ml['Genres'].str.replace('|', ', ', regex=False)

print(len(ml))
ml.head()

3883
3882


Unnamed: 0,MovieID,Title,Genres,CleanTitle,Year
0,1,Toy Story (1995),"Animation, Children's, Comedy",Toy Story,1995
1,2,Jumanji (1995),"Adventure, Children's, Fantasy",Jumanji,1995
2,3,Grumpier Old Men (1995),"Comedy, Romance",Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),"Comedy, Drama",Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [45]:
ml['CleanTitle'][ml.CleanTitle.str.contains(r"[()]", na=False)]

29      Shanghai Triad (Yao a yao yao dao waipo qiao)
46                                      Seven (Se7en)
57                          Postino, Il (The Postman)
58               Confessional, The (Le Confessionnal)
67                        French Twist (Gazon maudit)
                            ...                      
3794          Godzilla 2000 (Gojira ni-sen mireniamu)
3797              All the Rage (a.k.a. It's the Rage)
3822                               Anatomy (Anatomie)
3832                Goya in Bordeaux (Goya en Bodeos)
3850     Faraway, So Close (In Weiter Ferne, So Nah!)
Name: CleanTitle, Length: 268, dtype: object

In [46]:
ml['CleanTitle'] = ml['CleanTitle'].str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()

In [47]:
ml['CleanTitle'][ml.CleanTitle.str.contains(r"[()]", na=False)]

Series([], Name: CleanTitle, dtype: object)

In [48]:
ml['Year'].astype(int).isna().value_counts()

Year
False    3882
Name: count, dtype: int64

In [49]:
ml

Unnamed: 0,MovieID,Title,Genres,CleanTitle,Year
0,1,Toy Story (1995),"Animation, Children's, Comedy",Toy Story,1995
1,2,Jumanji (1995),"Adventure, Children's, Fantasy",Jumanji,1995
2,3,Grumpier Old Men (1995),"Comedy, Romance",Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),"Comedy, Drama",Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000
3880,3950,Tigerland (2000),Drama,Tigerland,2000
3881,3951,Two Family House (2000),Drama,Two Family House,2000


In [50]:
tmdb = pd.read_csv('../data/raw/tmdb/movies_metadata.csv', low_memory=False, 
                   usecols=["original_title", "title", "overview", "genres", "release_date", "poster_path"])
print(len(tmdb))
tmdb.dropna(subset=['title', 'overview'], inplace=True)
tmdb["title_clean"] = tmdb["title"].str.lower().str.strip()
tmdb['release_year'] = tmdb['release_date'].str[:4]
tmdb['genres'] = tmdb['genres'].apply(
    lambda x: ', '.join(d['name'] for d in ast.literal_eval(x) if 'name' in d)
)
# tmdb.dtypes
print(len(tmdb))

(tmdb['title'] == tmdb['original_title']).value_counts()

45466
44506


True     33333
False    11173
Name: count, dtype: int64

In [51]:
'les miserables' in tmdb['title_clean'][tmdb['release_date'].isna()].tolist()

False

In [52]:
tmdb[tmdb['title_clean']=='les miserables']

Unnamed: 0,genres,original_title,overview,poster_path,release_date,title,title_clean,release_year
72,"Drama, History",Les misérables,"In France during World War II, a poor and illi...",/ePJCk8puXx28CtvpSpmezpwfVt8.jpg,1995-03-22,Les Miserables,les miserables,1995
20157,"Drama, Romance",Les Miserables,"Jean Valjean, a Frenchman of good character, h...",/koNAHstzDDN8Sw4ojX8f2FcN225.jpg,1952-08-14,Les Miserables,les miserables,1952


In [53]:
tmdb[['original_title', 'title']][~(tmdb['title'] == tmdb['original_title'])].head()

Unnamed: 0,original_title,title
28,La Cité des Enfants Perdus,The City of Lost Children
29,摇啊摇，摇到外婆桥,Shanghai Triad
57,Il postino,The Postman
58,Le confessionnal,The Confessional
67,Gazon maudit,French Twist


In [54]:
from fuzzywuzzy import fuzz
from tqdm import tqdm

tqdm.pandas()

In [55]:
# Function to find best fuzzy match
def get_best_match(title, year):
    global no_good_match_counter, no_good_match, empty_match
    no_good_match_counter = 0 
    no_good_match = []
    empty_match = []
    choices = tmdb["title_clean"][tmdb["release_year"].astype(str)==year].tolist()
    # print(year, ' : ', len(choices), 'choices')
    scores = [(choice, fuzz.token_sort_ratio(str(title).lower(), choice)) for choice in choices]
    best = max(scores, key=lambda x: x[1])
    if best[1] > 80:
        if best[0] is None or best[0] == '' or best[0].lower() == 'nan' or best[0].lower() == 'none':
            empty_match.append((title, year, best))
        return best
    else:
        no_good_match_counter += 1
        no_good_match.append((title, year, best))
        # print((title, year, best))
        return (None, 0)
    return best if best[1] > 80 else (None, 0)  # threshold

In [56]:
# x = tmdb["title_clean"][tmdb["release_year"].astype(str)=='1995'].tolist()
# x_scores = [(choice, fuzz.token_sort_ratio('les miserables', choice)) for choice in x]
# max(x_scores, key=lambda x: x[1])
str(None)

'None'

In [31]:
matches = ml[["CleanTitle", "Year"]].progress_apply(
    lambda x: get_best_match(
        x['CleanTitle'], 
        x['Year']
        ), 
    axis=1)
print(f"No good matches found: {no_good_match_counter}")

100%|██████████| 3882/3882 [00:35<00:00, 108.86it/s]

No good matches found: 0





In [32]:
no_good_match

[]

In [33]:
empty_match

[]

In [57]:
x = matches.apply(lambda x: x[0] if x[0] is not None else 0)
pd.DataFrame(x, columns=['x'])['x'].value_counts()

x
0                     475
hamlet                  5
the mummy               3
the scarlet letter      2
mighty joe young        2
                     ... 
burnt offerings         1
candyman                1
carrie                  1
cat people              1
the contender           1
Name: count, Length: 3375, dtype: int64

In [58]:
ml["matched_title"] = matches.apply(lambda x: x[0])
ml["match_score"] = matches.apply(lambda x: x[1])

In [59]:
ml["matched_title"].isna().value_counts()

matched_title
False    3407
True      475
Name: count, dtype: int64

In [60]:
ml[ml["matched_title"].isna()]

Unnamed: 0,MovieID,Title,Genres,CleanTitle,Year,matched_title,match_score
32,33,Wings of Courage (1995),"Adventure, Romance",Wings of Courage,1995,,0
46,47,Seven (Se7en) (1995),"Crime, Thriller",Seven,1995,,0
55,56,Kids of the Round Table (1995),"Adventure, Children's, Fantasy",Kids of the Round Table,1995,,0
57,58,"Postino, Il (The Postman) (1994)","Drama, Romance","Postino, Il",1994,,0
82,83,Once Upon a Time... When We Were Colored (1995),Drama,Once Upon a Time... When We Were Colored,1995,,0
...,...,...,...,...,...,...,...
3824,3894,Solas (1999),Drama,Solas,1999,,0
3834,3904,"Uninvited Guest, An (2000)",Drama,"Uninvited Guest, An",2000,,0
3837,3907,"Prince of Central Park, The (1999)",Drama,"Prince of Central Park, The",1999,,0
3844,3914,"Broken Hearts Club, The (2000)",Drama,"Broken Hearts Club, The",2000,,0


In [61]:
ml.dropna(subset=['matched_title'], inplace=True)
ml["matched_title"].isna().value_counts()

matched_title
False    3407
Name: count, dtype: int64

In [62]:
merged = ml.merge(tmdb, left_on="matched_title", right_on="title_clean", how="left")
merged.head()

Unnamed: 0,MovieID,Title,Genres,CleanTitle,Year,matched_title,match_score,genres,original_title,overview,poster_path,release_date,title,title_clean,release_year
0,1,Toy Story (1995),"Animation, Children's, Comedy",Toy Story,1995,toy story,100,"Animation, Comedy, Family",Toy Story,"Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,Toy Story,toy story,1995
1,2,Jumanji (1995),"Adventure, Children's, Fantasy",Jumanji,1995,jumanji,100,"Adventure, Fantasy, Family",Jumanji,When siblings Judy and Peter discover an encha...,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,Jumanji,jumanji,1995
2,3,Grumpier Old Men (1995),"Comedy, Romance",Grumpier Old Men,1995,grumpier old men,100,"Romance, Comedy",Grumpier Old Men,A family wedding reignites the ancient feud be...,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,Grumpier Old Men,grumpier old men,1995
3,4,Waiting to Exhale (1995),"Comedy, Drama",Waiting to Exhale,1995,waiting to exhale,100,"Comedy, Drama, Romance",Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,Waiting to Exhale,waiting to exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,father of the bride part ii,100,Comedy,Father of the Bride Part II,Just when George Banks has recovered from his ...,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,Father of the Bride Part II,father of the bride part ii,1995


In [63]:
merged[['CleanTitle', 'Genres', 'Year', 'matched_title', 'genres', 'release_year', 'match_score']][merged["match_score"] != 100]

Unnamed: 0,CleanTitle,Genres,Year,matched_title,genres,release_year,match_score
91,"Misérables, Les","Drama, Musical",1995,les miserables,"Drama, History",1995,96
92,"Misérables, Les","Drama, Musical",1995,les miserables,"Drama, Romance",1952,96
146,"Silence of the Palace, The",Drama,1994,the silences of the palace,Drama,1994,98
160,Up Close and Personal,"Drama, Romance",1996,up close & personal,"Drama, Romance",1996,89
263,Dumb & Dumber,Comedy,1994,dumb and dumber,Comedy,1994,85
...,...,...,...,...,...,...,...
4150,Crime and Punishment in Suburbia,"Comedy, Drama",2000,crime + punishment in suburbia,Drama,2000,93
4180,"Creature From the Black Lagoon, The",Horror,1954,creature from the black lagoon,"Adventure, Horror, Science Fiction",1954,94
4186,"Phantom of the Opera, The","Drama, Thriller",1943,phantom of the opera,"Music, Horror, Romance",1943,91
4191,"Slumber Party Massacre II, The",Horror,1987,slumber party massacre ii,"Horror, Comedy",1987,93


In [64]:
merged[merged['matched_title']=='les miserables']

Unnamed: 0,MovieID,Title,Genres,CleanTitle,Year,matched_title,match_score,genres,original_title,overview,poster_path,release_date,title,title_clean,release_year
91,73,"Misérables, Les (1995)","Drama, Musical","Misérables, Les",1995,les miserables,96,"Drama, History",Les misérables,"In France during World War II, a poor and illi...",/ePJCk8puXx28CtvpSpmezpwfVt8.jpg,1995-03-22,Les Miserables,les miserables,1995
92,73,"Misérables, Les (1995)","Drama, Musical","Misérables, Les",1995,les miserables,96,"Drama, Romance",Les Miserables,"Jean Valjean, a Frenchman of good character, h...",/koNAHstzDDN8Sw4ojX8f2FcN225.jpg,1952-08-14,Les Miserables,les miserables,1952


In [68]:
merged[['Genres', 'genres']]

Unnamed: 0,Genres,genres
0,"Animation, Children's, Comedy","Animation, Comedy, Family"
1,"Adventure, Children's, Fantasy","Adventure, Fantasy, Family"
2,"Comedy, Romance","Romance, Comedy"
3,"Comedy, Drama","Comedy, Drama, Romance"
4,Comedy,Comedy
...,...,...
4202,Comedy,"Comedy, Romance"
4203,Drama,"Crime, Drama"
4204,Drama,"Drama, War"
4205,Drama,"Drama, Romance"


In [69]:
def merge_unique_tags(col1, col2):
    # Split by comma and strip whitespace
    list1 = [x.strip() for x in col1.split(',')] if pd.notna(col1) else []
    list2 = [x.strip() for x in col2.split(',')] if pd.notna(col2) else []

    # Combine while preserving order and removing duplicates
    seen = set()
    merged = []
    for item in list1 + list2:
        if item not in seen:
            seen.add(item)
            merged.append(item)
    return ', '.join(merged)


In [71]:
merged['genres_merged'] = merged.apply(lambda row: merge_unique_tags(row['Genres'], row['genres']), axis=1)

In [72]:
merged

Unnamed: 0,MovieID,Title,Genres,CleanTitle,Year,matched_title,match_score,genres,original_title,overview,poster_path,release_date,title,title_clean,release_year,genres_merged
0,1,Toy Story (1995),"Animation, Children's, Comedy",Toy Story,1995,toy story,100,"Animation, Comedy, Family",Toy Story,"Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,Toy Story,toy story,1995,"Animation, Children's, Comedy, Family"
1,2,Jumanji (1995),"Adventure, Children's, Fantasy",Jumanji,1995,jumanji,100,"Adventure, Fantasy, Family",Jumanji,When siblings Judy and Peter discover an encha...,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,Jumanji,jumanji,1995,"Adventure, Children's, Fantasy, Family"
2,3,Grumpier Old Men (1995),"Comedy, Romance",Grumpier Old Men,1995,grumpier old men,100,"Romance, Comedy",Grumpier Old Men,A family wedding reignites the ancient feud be...,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,Grumpier Old Men,grumpier old men,1995,"Comedy, Romance"
3,4,Waiting to Exhale (1995),"Comedy, Drama",Waiting to Exhale,1995,waiting to exhale,100,"Comedy, Drama, Romance",Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,Waiting to Exhale,waiting to exhale,1995,"Comedy, Drama, Romance"
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,father of the bride part ii,100,Comedy,Father of the Bride Part II,Just when George Banks has recovered from his ...,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,Father of the Bride Part II,father of the bride part ii,1995,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4202,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000,meet the parents,100,"Comedy, Romance",Meet the Parents,"Greg Focker is ready to marry his girlfriend, ...",/wVjtQtzv9IcNRGnOOdcK797Sdxx.jpg,2000-10-06,Meet the Parents,meet the parents,2000,"Comedy, Romance"
4203,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000,requiem for a dream,100,"Crime, Drama",Requiem for a Dream,The hopes and dreams of four ambitious people ...,/muym4jTjdLx7E6as09d1wlC3sOB.jpg,2000-10-27,Requiem for a Dream,requiem for a dream,2000,"Drama, Crime"
4204,3950,Tigerland (2000),Drama,Tigerland,2000,tigerland,100,"Drama, War",Tigerland,A group of recruits go through Advanced Infant...,/yyW54WcjMFX4NTRy1NnKIeaWhLS.jpg,2000-09-22,Tigerland,tigerland,2000,"Drama, War"
4205,3951,Two Family House (2000),Drama,Two Family House,2000,two family house,100,"Drama, Romance",Two Family House,Buddy Visalo (Michael Rispoli) is a factory wo...,/bTtPrdB25Tidhl7Z04Xi3qLj2co.jpg,2000-01-21,Two Family House,two family house,2000,"Drama, Romance"
