In [47]:
import pandas as pd

In [48]:
df = pd.read_csv("top10K-TMDB-movies.csv")

In [49]:
# display first 15 movies
df.head(15)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
5,667257,Impossible Things,"Family,Drama",es,"Matilde is a woman who, after the death of her...",14.358,2021-06-17,8.6,255
6,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...",92.056,2001-07-20,8.5,13093
7,730154,Your Eyes Tell,"Romance,Drama",ja,"A tragic accident lead to Kaori's blindness, b...",51.345,2020-10-23,8.5,339
8,372754,Dou kyu sei – Classmates,"Romance,Animation",ja,"Rihito Sajo, an honor student with a perfect s...",14.285,2016-02-20,8.5,239
9,372058,Your Name.,"Romance,Animation,Drama",ja,High schoolers Mitsuha and Taki are complete s...,158.27,2016-08-26,8.5,8895


In [50]:
# print the columns
print(df.columns)

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')


In [51]:
# check for missing value in eacch column
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,3
original_language,0
overview,13
popularity,0
release_date,0
vote_average,0
vote_count,0


In [52]:
# remove rows with missing values
df.dropna(subset=['genre', 'overview'], inplace=True)
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,0
original_language,0
overview,0
popularity,0
release_date,0
vote_average,0
vote_count,0


In [53]:
df['combined'] = df['title'].astype(str) + ' ' + df['genre'].astype(str) + ' ' + df['overview'].astype(str) # added a new feature
df = df[['id','title','genre','original_language','overview','combined','vote_average']] # droped unused columns
df.head()

Unnamed: 0,id,title,genre,original_language,overview,combined,vote_average
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,"The Shawshank Redemption Drama,Crime Framed in...",8.7
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...","Dilwale Dulhania Le Jayenge Comedy,Drama,Roman...",8.7
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...","The Godfather Drama,Crime Spanning the years 1...",8.7
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,"Schindler's List Drama,History,War The true st...",8.6
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,"The Godfather: Part II Drama,Crime In the cont...",8.6


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer # to vectorize the feature
tf_idf_vectorizer = TfidfVectorizer(stop_words = 'english') # initialize the vectorizer

# get tf-idf matrix
tf_matrix = tf_idf_vectorizer.fit_transform(df['combined'])

from sklearn.metrics.pairwise import cosine_similarity # calculating similarities between movies
cosine_simil = cosine_similarity(tf_matrix,tf_matrix)



In [55]:
def get_recom_ids(title, cosine_sim=cosine_simil):
    try:
        idx = df.index[df['title'] == title].tolist()[0]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
        return [df.iloc[i[0]]['id'] for i in sim_scores]
    except IndexError:
        return [None]*10

def get_recom_by_lang_ids(title, cosine_sim=cosine_simil):
    try:
        idx = df.index[df['title'] == title].tolist()[0]
        input_language = df.iloc[idx]['original_language']
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        movie_ids = []
        for i in sim_scores[1:]:
            movie_idx = i[0]
            if df.iloc[movie_idx]['original_language'] == input_language:
                movie_ids.append(df.iloc[movie_idx]['id'])
            if len(movie_ids) == 10:
                break

        # If less than 10 found, pad with None
        while len(movie_ids) < 10:
            movie_ids.append(None)

        return movie_ids
    except IndexError:
        return [None]*10


In [56]:
# Add columns r1 to r10 and rl1 to rl10
for i in range(1, 11):
    df[f'r{i}'] = None
    df[f'rl{i}'] = None


In [58]:
# Fill in recommendations
for i, row in df.iterrows():
    recs = get_recom_ids(row['title'])
    recs_lang = get_recom_by_lang_ids(row['title'])

    for j in range(10):
        df.at[i, f'r{j+1}'] = recs[j]
        df.at[i, f'rl{j+1}'] = recs_lang[j]


In [59]:
df.head(15)

Unnamed: 0,id,title,genre,original_language,overview,combined,vote_average,r1,rl1,r2,...,r6,rl6,r7,rl7,r8,rl8,r9,rl9,r10,rl10
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,"The Shawshank Redemption Drama,Crime Framed in...",8.7,19277,19277,9809,...,992,992,411405,411405,303991,303991,5528,6957,6957,811592
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...","Dilwale Dulhania Le Jayenge Comedy,Drama,Roman...",8.7,15927,370665,370665,...,3549,4251,432527,205022,11359,15774,8390,118628,4538,20453
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...","The Godfather Drama,Crime Spanning the years 1...",8.7,240,240,242,...,442064,27579,27579,157847,157847,8292,8292,11702,11702,339095
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,"Schindler's List Drama,History,War The true st...",8.6,491926,491926,340,...,1716,5925,5925,736069,736069,304357,7862,127560,304357,11422
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,"The Godfather: Part II Drama,Crime In the cont...",8.6,238,238,242,...,11419,11419,241239,241239,299,299,10990,10990,356296,167073
5,667257,Impossible Things,"Family,Drama",es,"Matilde is a woman who, after the death of her...","Impossible Things Family,Drama Matilde is a wo...",8.6,283726,429191,354912,...,3877,542921,40720,726208,30127,52274,339994,219,157351,811072
6,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...","Spirited Away Animation,Family,Fantasy A young...",8.5,2114,8392,843906,...,560057,15916,252512,10515,465086,242828,14830,10228,406785,475215
7,730154,Your Eyes Tell,"Romance,Drama",ja,"A tragic accident lead to Kaori's blindness, b...","Your Eyes Tell Romance,Drama A tragic accident...",8.5,582014,110420,8338,...,122906,315011,167581,7500,10885,513434,70577,9323,23049,11953
8,372754,Dou kyu sei – Classmates,"Romance,Animation",ja,"Rihito Sajo, an honor student with a perfect s...","Dou kyu sei – Classmates Romance,Animation Rih...",8.5,13505,241863,241863,...,51481,776305,11824,7500,5528,198375,606876,12720,73108,364111
9,372058,Your Name.,"Romance,Animation,Drama",ja,High schoolers Mitsuha and Taki are complete s...,"Your Name. Romance,Animation,Drama High school...",8.5,111083,4241,556803,...,586333,652837,322240,242828,352492,507569,11058,37933,19186,492719


In [61]:
df = df[['id','title','genre','original_language','overview','vote_average','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','rl1','rl2','rl3','rl4','rl5','rl6','rl7','rl8','rl9','rl10']]
df.head(15)

Unnamed: 0,id,title,genre,original_language,overview,vote_average,r1,r2,r3,r4,...,rl1,rl2,rl3,rl4,rl5,rl6,rl7,rl8,rl9,rl10
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,8.7,19277,9809,10398,1623,...,19277,9809,10398,1623,107846,992,411405,303991,6957,811592
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",8.7,15927,370665,42433,19666,...,370665,19666,14163,432527,348892,4251,205022,15774,118628,20453
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",8.7,240,242,190955,11659,...,240,242,190955,92060,442064,27579,157847,8292,11702,339095
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,8.6,491926,340,13701,403300,...,491926,340,13701,403300,13813,5925,736069,304357,127560,11422
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,8.6,238,242,451,137093,...,238,242,451,137093,265208,11419,241239,299,10990,167073
5,667257,Impossible Things,"Family,Drama",es,"Matilde is a woman who, after the death of her...",8.6,283726,354912,362057,7442,...,429191,437311,818647,6537,513223,542921,726208,52274,219,811072
6,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...",8.5,2114,843906,8392,322240,...,8392,92321,823,430447,100271,15916,10515,242828,10228,475215
7,730154,Your Eyes Tell,"Romance,Drama",ja,"A tragic accident lead to Kaori's blindness, b...",8.5,582014,8338,575604,8270,...,110420,37910,672322,15080,21506,315011,7500,513434,9323,11953
8,372754,Dou kyu sei – Classmates,"Romance,Animation",ja,"Rihito Sajo, an honor student with a perfect s...",8.5,13505,241863,87368,20139,...,241863,18861,15283,919,431819,776305,7500,198375,12720,364111
9,372058,Your Name.,"Romance,Animation,Drama",ja,High schoolers Mitsuha and Taki are complete s...,8.5,111083,556803,11660,10875,...,4241,420426,594188,64246,776305,652837,242828,507569,37933,492719


In [65]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genre,0
original_language,0
overview,0
vote_average,0
r1,11
r2,11
r3,11
r4,11


In [64]:
import json
import math

# Make sure rating is float
df['vote_average'] = df['vote_average'].astype(float)
df['id'] = df['id'].astype(float)

for i in range(1, 11):
    df[f'r{i}'] = df[f'r{i}'].astype(float)
    df[f'rl{i}'] = df[f'rl{i}'].astype(float)

# Convert to list of dicts
movies = df.to_dict(orient='records')

# Clean NaNs from r and rl keys
cleaned_movies = []
for movie in movies:
    movie_clean = movie.copy()
    for i in range(1, 11):
        r_key = f'r{i}'
        rl_key = f'rl{i}'

        # Remove the key if value is NaN
        if r_key in movie_clean and (movie_clean[r_key] is None or (isinstance(movie_clean[r_key], float) and math.isnan(movie_clean[r_key]))):
            del movie_clean[r_key]
        if rl_key in movie_clean and (movie_clean[rl_key] is None or (isinstance(movie_clean[rl_key], float) and math.isnan(movie_clean[rl_key]))):
            del movie_clean[rl_key]

    cleaned_movies.append(movie_clean)

# Write cleaned data to JSON
with open("movies.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_movies, f, indent=2, ensure_ascii=False)

print("movies.json has been created successfully with NaNs discarded!")


movies.json has been created successfully with NaNs discarded!
