## ETL Process for obtaining a clean database

In [1]:
import pandas as pd
import numpy as np

import re
import unicodedata

In [2]:
TMDB_Data = pd.read_csv('TMDB_movie_dataset_v11.csv')

In [3]:
TMDB_Data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [4]:
# Null data in title, release date, backdrop path, homepage, imdb_id, original_title, overview, poster_path, tagline, genres, production companies, production countries, spoken language, keywords
TMDB_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289061 entries, 0 to 1289060
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1289061 non-null  int64  
 1   title                 1289046 non-null  object 
 2   vote_average          1289061 non-null  float64
 3   vote_count            1289061 non-null  int64  
 4   status                1289061 non-null  object 
 5   release_date          1035149 non-null  object 
 6   revenue               1289061 non-null  int64  
 7   runtime               1289061 non-null  int64  
 8   adult                 1289061 non-null  bool   
 9   backdrop_path         330482 non-null   object 
 10  budget                1289061 non-null  int64  
 11  homepage              134319 non-null   object 
 12  imdb_id               638723 non-null   object 
 13  original_language     1289061 non-null  object 
 14  original_title        1289046 non-

In [5]:
# Checking the titles, as there are no info about title or original title, the rows are useless and are deleted.  
# #TMDB_Data[TMDB_Data['title'].isnull()]

TMDB_Data = TMDB_Data.dropna(subset = ['title'])
TMDB_Data = TMDB_Data.dropna(subset = ['original_title'] )
TMDB_Data = TMDB_Data.dropna(subset = ['release_date'] )

In [6]:
TMDB_Data["release_date"] = TMDB_Data["release_date"].str[:4]

In [7]:
TMDB_Data["release_date"]= TMDB_Data["release_date"].astype(int)
TMDB_Data = TMDB_Data[TMDB_Data['release_date'] <= 2025]

In [8]:
# The recommendation system will be based on the movies currently released. The ones in production, planned, rumored or cancelled will be excluded.
print(TMDB_Data['status'].unique())
TMDB_Data = TMDB_Data[TMDB_Data['status'] == 'Released']

['Released' 'Planned' 'In Production' 'Post Production' 'Rumored'
 'Canceled']


In [9]:
# Poster path is irrelevant to the recommendation system. Same for backdrop path and others. That columns are deleted.
columns_to_remove = ['poster_path', 'backdrop_path', 'id','status', 'homepage', 'title', 'tagline']

for column in columns_to_remove:
    try:
        TMDB_Data = TMDB_Data.drop(columns=column)
    except:
        pass

In [10]:
TMDB_Data

Unnamed: 0,vote_average,vote_count,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,overview,popularity,genres,production_companies,production_countries,spoken_languages,keywords
0,8.364,34495,2010,825532764,148,False,160000000,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,8.417,32571,2014,701729206,169,False,165000000,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,8.512,30619,2008,1004558444,152,False,185000000,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,7.573,29815,2009,2923706026,162,False,237000000,tt0499549,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,7.710,29166,2012,1518815515,143,False,220000000,tt0848228,en,The Avengers,When an unexpected enemy emerges and threatens...,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1289054,0.000,0,1991,0,1,False,0,,en,Lawn Butch,Being a codependent doesn't mean you can't tri...,0.600,,,United States of America,,
1289055,0.000,0,1995,0,3,False,0,,en,Lessons In Baby Dyke Theory,In 1995 when Thirza Cuthand was 16 she felt li...,0.600,Comedy,,Canada,,
1289056,0.000,0,2021,0,0,False,0,,en,La Falsa Noche,,0.600,,,,,
1289058,0.000,0,2018,0,4,False,0,,en,Blade Brigade,Blades has to defeat his ex rival Ajax for the...,0.600,"Comedy, Action, Western",Cult Classic,United States of America,English,rollerblades


In [11]:
TMDB_Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1019644 entries, 0 to 1289059
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   vote_average          1019644 non-null  float64
 1   vote_count            1019644 non-null  int64  
 2   release_date          1019644 non-null  int32  
 3   revenue               1019644 non-null  int64  
 4   runtime               1019644 non-null  int64  
 5   adult                 1019644 non-null  bool   
 6   budget                1019644 non-null  int64  
 7   imdb_id               572905 non-null   object 
 8   original_language     1019644 non-null  object 
 9   original_title        1019644 non-null  object 
 10  overview              839920 non-null   object 
 11  popularity            1019644 non-null  float64
 12  genres                678311 non-null   object 
 13  production_companies  527071 non-null   object 
 14  production_countries  643714 non-null  

In [12]:
Numerical_colums = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity', 'release_date']
Text_columns = ['original_title','original_language', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']
# Bool columns = adult

In [13]:
TMDB_Data["original_title"] = TMDB_Data["original_title"] .str.strip()

In [14]:
TMDB_Data['original_title'] = TMDB_Data['original_title'].str.replace("_", "", regex=False)
TMDB_Data['original_title'] = TMDB_Data['original_title'].str.replace(r'\s*:{2,}\s*', ':', regex=True)
TMDB_Data['original_title'] = TMDB_Data['original_title'].str.replace(r'\s*\+{2,}\s*', '+', regex=True)

In [15]:
# Text_columns = ['original_title', 'genres', 'production_companies', 'keywords']

# Preprocessing for text columns. 
for column in Text_columns:
    TMDB_Data[column] = TMDB_Data[column].fillna("")
    TMDB_Data[column] = TMDB_Data[column].str.replace(r'[^\w\s%&#$+<=>^:]', "", regex = True).str.strip()

In [16]:
def clean_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

def normalize_text(text):
    return unicodedata.normalize('NFKC', text)

In [17]:
TMDB_Data.loc[:,'original_title'] = TMDB_Data.loc[:,'original_title'].apply(normalize_text)
TMDB_Data.loc[:,'original_title'] = TMDB_Data.loc[:,'original_title'].apply(clean_html_tags)

In [18]:
TMDB_Data["original_title"] = TMDB_Data["original_title"] .str.strip()

In [19]:
pattern = r'^[^\s]{1}$' 
TMDB_Data = TMDB_Data[~TMDB_Data['original_title'].str.fullmatch(pattern)]
TMDB_Data = TMDB_Data[~TMDB_Data['original_title'].isin(['', 'Untitled', '0'])]

In [20]:
TMDB_Data['original_title'].sort_values(ascending = True).head(40)

1177439                                                 # 32
1249724                                          #0 sisocisP
1093649                                                 #001
1093650                                                 #002
1093651                                                 #003
1270203                                                 #004
1232903                                                 #005
1093331                                                 #006
438283                                                  #007
538545                                                  #008
760187                                                  #009
582186                                      #01 Hygienikerin
38118                                                     #1
179691                                                    #1
146755                                                    #1
216823                                                    #1
588501                  

In [37]:
TMDB_Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1018562 entries, 0 to 1289059
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   vote_average          1018562 non-null  float64
 1   vote_count            1018562 non-null  int64  
 2   release_date          1018562 non-null  int32  
 3   revenue               1018562 non-null  int64  
 4   runtime               1018562 non-null  int64  
 5   adult                 1018562 non-null  bool   
 6   budget                1018562 non-null  int64  
 7   imdb_id               1018562 non-null  object 
 8   original_language     1018562 non-null  object 
 9   original_title        1018562 non-null  object 
 10  overview              838989 non-null   object 
 11  popularity            1018562 non-null  float64
 12  genres                1018562 non-null  object 
 13  production_companies  1018562 non-null  object 
 14  production_countries  1018562 non-null 

In [38]:
# Manage NaN in imdb key column
TMDB_Data['imdb_id'].fillna('Unknown', inplace=True)
TMDB_Data['overview'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  TMDB_Data['overview'].fillna('Unknown', inplace=True)


In [None]:
# TMDB_Data = TMDB_Data.drop_duplicates()

In [None]:
# TMDB_Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1018348 entries, 0 to 1289059
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   vote_average          1018348 non-null  float64
 1   vote_count            1018348 non-null  int64  
 2   release_date          1018348 non-null  int32  
 3   revenue               1018348 non-null  int64  
 4   runtime               1018348 non-null  int64  
 5   adult                 1018348 non-null  bool   
 6   budget                1018348 non-null  int64  
 7   imdb_id               1018348 non-null  object 
 8   original_language     1018348 non-null  object 
 9   original_title        1018348 non-null  object 
 10  overview              1018348 non-null  object 
 11  popularity            1018348 non-null  float64
 12  genres                1018348 non-null  object 
 13  production_companies  1018348 non-null  object 
 14  production_countries  1018348 non-null 

In [None]:
# TMDB_Data[TMDB_Data['imdb_id'] == 'tt32094375']

Unnamed: 0,vote_average,vote_count,release_date,revenue,runtime,adult,budget,imdb_id,original_language,original_title,overview,popularity,genres,production_companies,production_countries,spoken_languages,keywords
197973,4.2,2,2024,0,92,False,0,tt32094375,en,Die Hart: Die Harter,"In this sequel, Kevin Hart, now the world's to...",0.0,,HartBeat Productions,United States of America,English,
199874,5.0,2,2024,0,0,False,0,tt32094375,en,Die Hart 2: Die Harter,Unknown,1.4,Action Comedy,,,English,
228110,2.0,1,2024,0,0,False,0,tt32094375,en,Die Hart 2: Die Harter,Hart plans an innovative action film with unsc...,1.272,Action Comedy,,,,
235757,8.0,1,2024,0,0,False,0,tt32094375,en,Die Hart 2: Die Harter,Unknown,0.0,Comedy Action,,United States of America,,
249552,2.0,1,2024,0,92,False,0,tt32094375,en,Die Hart 2: Die Harter,Hart plans an innovative action film with unsc...,1.4,Action Comedy,HartBeat Productions Distribuidora: Amazon Pri...,,English,
255129,6.0,1,2024,0,0,False,0,tt32094375,pt,Die Hart: Die Harter,Unknown,0.6,Action Comedy,,,English,
267253,2.0,1,2024,0,0,False,0,tt32094375,pt,Duro de Atuar 2,Unknown,1.4,Action Comedy,HartBeat Productions,,,
300397,3.0,1,2024,0,92,False,0,tt32094375,en,Die Hart 2: Die Harter,Hart plans an innovative action film with unsc...,0.15,Action Comedy,HartBeat Productions,United States of America,English,
324786,5.0,1,2024,0,0,False,0,tt32094375,en,Die Hart 2,Die Hart: Die Harter full movie,0.0,Action Comedy,,,German,
333482,7.0,1,2024,0,0,False,0,tt32094375,en,Die Hart 2: Die Harter,"In this sequel, Kevin Hart, now the world's bi...",0.6,Comedy Action,HartBeat Productions,,,


In [None]:
# Remove duplicates in BOTH imdb ID's and original title

# TMDB_Data['original_title'].value_counts()

original_title
Home                      90
Alone                     66
Carmen                    63
Limbo                     55
Solo                      50
                          ..
Trannymals Go To Court     1
El Carbón Chileno          1
The Theatre                1
埋もれる                       1
Heisser Sand               1
Name: count, Length: 908907, dtype: int64

In [23]:
# Cleaned database for storing in mySQL
# TMDB_Data.to_csv('TMDB_movie_dataset_cleaned.csv', index=False)

## Vectorization with TF-IDF for textural features and standarization of numerical features

In [24]:
import scipy
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

In [516]:
# Scaling for numerical columns
scaler = MinMaxScaler()
TMDB_Data[Numerical_colums] = scaler.fit_transform( TMDB_Data[Numerical_colums] )

In [517]:
# TF-IDF Vectorization for each column and posterior concatenation of all vectors.

tfidf_vectors = []

for col in Text_columns:

    scaler_text = TfidfVectorizer(max_features=50)
    X = scaler_text.fit_transform(TMDB_Data[col])

    tfidf_vectors.append(X)

X_combined = hstack(tfidf_vectors)

print('TF-IDF VECTOR SIZE: ', X_combined.shape)

TF-IDF VECTOR SIZE:  (1018562, 321)


In [518]:
dim1 = X_combined.shape[1]

In [519]:
X_num_sparse = scipy.sparse.csr_matrix(TMDB_Data[Numerical_colums].values)
X_num_sparse= X_num_sparse*np.sqrt(dim1/len(Numerical_colums)*0.3)
X_final = hstack([X_num_sparse, X_combined])

print('FINAL VECTOR SIZE: ', X_final.shape)

FINAL VECTOR SIZE:  (1018562, 328)


In [520]:
items = X_final.shape[0]
dim = X_final.shape[1]

## ANN with faiss for recommendation system (content-based)

In [None]:
import faiss
from sklearn.preprocessing import normalize
from scipy.sparse import save_npz, load_npz

In [521]:
X_sparse = normalize(X_final, norm='l2', axis=1)

In [None]:
# Save the sparse matrix to a .npz file for storage and later use
# save_npz("X_sparse.npz", X_sparse)

In [None]:
query_idx = 26
movie = TMDB_Data.iloc[query_idx]['original_title']
print(f'Movie selected: {query_idx})', movie)

Movie selected: 26) Iron Man 3


In [559]:
index = faiss.IndexFlatL2(dim)
index.add(X_sparse.astype(np.float32).toarray())

In [560]:
xq = X_sparse[query_idx].reshape(1, -1).toarray().astype('float32')
Distances, Indexes = index.search(xq, k=6)  # 5 closer ANN

print("Indexes:", Indexes)
print("Distances:", Distances)

Indexes: [[26 52 42  6 12 40]]
Distances: [[0.         0.05973065 0.06383241 0.06395502 0.07796714 0.07932597]]


In [561]:
print(f"Recommended movies since you've watched {movie} : ")

for i, idx in enumerate(Indexes[0]):
    if i > 0:
         print(f"{TMDB_Data.iloc[idx]['original_title']}")

Recommended movies since you've watched Iron Man 3 : 
AntMan
Thor: Ragnarok
Avengers: Infinity War
Iron Man
Iron Man 2
