In [1]:
import pandas as pd
import numpy as np

- A movie can be on multiple title or genre
- Because of that, two different id can represent a single movie.
- Hence, we need to check for duplicates data from combination of title and genre first

In [2]:
def load_movies_data(movies_path, sample_frac=0.05, seed=42):
    """
    Function to load movie data
        & removing duplicate data on movie data

    Parameters
    ----------
    movies_path : str
        The path of movies data (.csv)

    sample_frac : float
        % fraction of sample loaded
        Only for this course purposes

    seed : int, default=123
        For reproducibility

    Returns
    -------
    movies_data : pandas DataFrame
        movie data
    """
    # Load data
    movies_data = pd.read_csv(movies_path)
    print('Original data shape                 :', movies_data.shape)

    # Drop drop duplicate & keep the first item with highest popularity
    movies_data = (movies_data.sort_values('popularity', ascending=False)
                              .drop_duplicates(subset=['title', 'genre'],
                                               keep='first'))

    print('Data shape after dropping duplicate :', movies_data.shape)

    # Set movie_id as index
    movies_data = movies_data.set_index(keys=['id'])

    # Sample the data
    movies_data = movies_data.sample(frac=sample_frac,
                                     replace=False,
                                     random_state=seed)
    print('Data shape final                    :', movies_data.shape)

    return movies_data

In [3]:
movie_data = load_movies_data(movies_path = 'top10K-TMDB-movies.csv')

Original data shape                 : (10000, 9)
Data shape after dropping duplicate : (9970, 9)
Data shape final                    : (498, 8)


In [4]:
movie_data.head()

Unnamed: 0_level_0,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11205,Wheels on Meals,"Romance,Action,Comedy,Crime",cn,"Cousins Thomas and David, owners of a mobile r...",19.549,1984-08-17,7.0,282
51822,Love Hurts,"Drama,Romance",es,Family and friends try to sabotage the budding...,26.541,2002-11-08,7.6,204
7326,Juno,"Comedy,Drama,Romance",en,"Faced with an unplanned pregnancy, an offbeat ...",15.327,2007-01-22,7.0,6187
892153,Tom and Jerry Cowboy Up!,"Animation,Comedy,Family,Western",en,"This time, the rivals team up to help a cowgir...",249.664,2022-01-24,6.9,364
16198,My Neighbors the Yamadas,"Animation,Family",ja,The Yamadas are a typical middle class Japanes...,13.545,1999-07-17,7.0,378


We don't need data original_language, release_date and overview, drop now!

In [5]:
movie_data = movie_data.drop(columns=['original_language', 'release_date', 'overview'])

In [6]:
# missing nulls
movie_data.isnull().sum()

title           0
genre           1
popularity      0
vote_average    0
vote_count      0
dtype: int64

In [7]:
# drop missing nulls
movie_data = movie_data.dropna()

In [8]:
# sanichek missing nulls
movie_data.isnull().sum()

title           0
genre           0
popularity      0
vote_average    0
vote_count      0
dtype: int64

In [9]:
# Set the numerical feature columns
num_cols = ['popularity', 'vote_average', 'vote_count']

In [10]:
movie_data_num = movie_data[num_cols]
movie_data_num.head()

Unnamed: 0_level_0,popularity,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11205,19.549,7.0,282
51822,26.541,7.6,204
7326,15.327,7.0,6187
892153,249.664,6.9,364
16198,13.545,7.0,378


In [11]:
movie_data_num.shape

(497, 3)

In [12]:
movie_data_num.describe()

Unnamed: 0,popularity,vote_average,vote_count
count,497.0,497.0,497.0
mean,51.636668,6.672435,1738.211268
std,350.700867,0.737083,2933.115187
min,3.946,4.7,201.0
25%,9.13,6.1,319.0
50%,13.545,6.7,601.0
75%,26.014,7.2,1492.0
max,7567.017,8.5,18835.0


# Feature Engineering Numeric and Category
---

Finally, we normalize the data. We choose MinaMaxScaler for obtain positive values!

In [13]:
# Normalize
from sklearn.preprocessing import MinMaxScaler

In [14]:
def numerical_vectorizer(data, num_cols):
    """
    Create numerical vector from a given data

    Parameters
    ----------
    data : pandas DataFrame
        The sample data

    num_cols : list
        The choosen numerical columns

    Returns
    -------
    data_num_clean : pandas DataFrame
        The sample data with choosen numerical columns
    """
    data = data.copy()

    # Filter data
    data_num = data[num_cols]

    # Transform data
    scaler = MinMaxScaler()
    scaler.fit(data_num)

    data_num_clean = pd.DataFrame(scaler.transform(data_num))
    data_num_clean.index = data_num.index
    data_num_clean.columns = data_num.columns

    print('Shape of original data  :', data.shape)
    print('Shape of numerical data :', data_num_clean.shape)

    return data_num_clean


In [15]:
movie_data_num = numerical_vectorizer(data = movie_data,
                                       num_cols = num_cols)

movie_data_num.head()

Shape of original data  : (497, 5)
Shape of numerical data : (497, 3)


Unnamed: 0_level_0,popularity,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11205,0.002063,0.605263,0.004347
51822,0.002988,0.763158,0.000161
7326,0.001505,0.605263,0.321241
892153,0.032489,0.578947,0.008747
16198,0.001269,0.605263,0.009499


In [16]:
# lets go to category
movie_data_cat = movie_data[['title', 'genre']]

In [17]:
movie_data_cat.head()

Unnamed: 0_level_0,title,genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1
11205,Wheels on Meals,"Romance,Action,Comedy,Crime"
51822,Love Hurts,"Drama,Romance"
7326,Juno,"Comedy,Drama,Romance"
892153,Tom and Jerry Cowboy Up!,"Animation,Comedy,Family,Western"
16198,My Neighbors the Yamadas,"Animation,Family"


In [18]:
class OHEVectorizer:
    """
    Transform data to OHE
    """
    def __init__(self, sep=';'):
        self.sep = sep

    def _generate_unique_values(self, data):
        """Generate unique values from a given data for each columns"""
        # Set a placeholder
        self.values_unique = {}

        # Iterate over columns
        for col in data.columns:
            values_list = []

            # Iterate over data
            for values in data[col].values:
                # Split the values by `sep`
                values_split = values.split(self.sep)

                # Append
                values_list += values_split

            # Get the unique artists
            values_unique = set(values_list)
            self.values_unique[col] = sorted(values_unique)

    def _generate_values_map(self, data):
        """Get the value to index and index to value map"""
        # Set a placeholder
        self.values_mapping = {}

        # Iterate over columns
        for col in data.columns:
            # Get the unique values
            values_unique = self.values_unique[col]

            # Create the mapping
            val_to_idx = {val:idx for idx, val in enumerate(values_unique)}
            idx_to_val = {idx:val for val, idx in val_to_idx.items()}

            # Save
            self.values_mapping[col] = {
                'val_to_idx': val_to_idx,
                'idx_to_val': idx_to_val
            }

    def fit(self, data):
        """
        Fit the OHE of all data
        """
        # 1. Generate the unique values
        self._generate_unique_values(data)

        # 2. Generate the mapping id
        self._generate_values_map(data)

    def transform(self, data):
        """
        Transform a sample data
        """
        # Set a placeholder
        mapped_cols = []
        n_rows = data.shape[0]

        # Iterate over columns
        for col in data.columns:
            # Extract encoder
            values_unique = self.values_unique[col]
            val_to_idx = self.values_mapping[col]['val_to_idx']

            # Create a mapped value for col data
            n_cols = len(values_unique)
            mapped_col = np.zeros((n_rows, n_cols), dtype=int)

            # Iterate over data
            for i, values in enumerate(data[col].values):
                # Split the artists
                values_split = values.split(';')

                # Convert the artists to index for mapping
                values_split_idx = [val_to_idx[val] for val in values_split]

                # Assign value 1 on the index in the placeholder
                mapped_col[i, values_split_idx] = 1

            # Convert to pandas DataFrame
            mapped_ohe = pd.DataFrame(mapped_col)
            mapped_ohe.columns = values_unique
            mapped_ohe.index = data.index

            # Append
            mapped_cols.append(mapped_ohe)

        # Concat the new data
        mapped_data = pd.concat([mapped_col for mapped_col in mapped_cols],
                                axis = 1)

        return mapped_data


In [19]:
# Create an object
ohe_vectorizer = OHEVectorizer()

# Fit the object with our current data
ohe_vectorizer.fit(data = movie_data_cat)

In [20]:
# Transform current data with ohe vectorizer
movie_data_ohe = ohe_vectorizer.transform(movie_data_cat)

print('Original data shape :', movie_data_cat.shape)
print('OHE data shape      :', movie_data_cat.shape)

Original data shape : (497, 2)
OHE data shape      : (497, 2)


In [21]:
movie_data_ohe.head()

Unnamed: 0_level_0,10 Cloverfield Lane,100 Feet,12 Rounds 2: Reloaded,1BR,3:10 to Yuma,A Bay of Blood,A Bug's Life,A Dog's Way Home,A Fantastic Woman,A Gang Story,...,"Thriller,Horror","Thriller,Horror,Drama,Mystery","Thriller,Science Fiction,Drama,Horror","Thriller,Science Fiction,Mystery","War,Drama,Action,Thriller","War,Drama,Thriller",Western,"Western,Adventure","Western,Drama,Action","Western,Thriller"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
892153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's join numeric and category data

In [22]:
movie_data_num_ohe = pd.concat((movie_data_num, movie_data_ohe),
                                axis = 1)

print('Combined data shape :', movie_data_num_ohe.shape)
movie_data_num_ohe.head()

Combined data shape : (497, 764)


Unnamed: 0_level_0,popularity,vote_average,vote_count,10 Cloverfield Lane,100 Feet,12 Rounds 2: Reloaded,1BR,3:10 to Yuma,A Bay of Blood,A Bug's Life,...,"Thriller,Horror","Thriller,Horror,Drama,Mystery","Thriller,Science Fiction,Drama,Horror","Thriller,Science Fiction,Mystery","War,Drama,Action,Thriller","War,Drama,Thriller",Western,"Western,Adventure","Western,Drama,Action","Western,Thriller"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11205,0.002063,0.605263,0.004347,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51822,0.002988,0.763158,0.000161,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7326,0.001505,0.605263,0.321241,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
892153,0.032489,0.578947,0.008747,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16198,0.001269,0.605263,0.009499,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Modeling : Content Based Recommendations

**Cosine**

In [23]:
def cosine_similarity(vec_A, vec_B):
    """Calculate the cosine similarity between vec A and vec B"""
    # Find the norm
    norm_A = np.linalg.norm(vec_A)
    norm_B = np.linalg.norm(vec_B)

    # Find the dot
    dot_AB = np.dot(vec_A, vec_B)

    # Calculate the similarity
    sim = dot_AB / (norm_A * norm_B)

    return sim


In [24]:
# add a progress bar, it need some times to finish.
from tqdm import tqdm

In [25]:
def track_recommendation(track_id, n, track_data, similarity_func):
    """
    Recommend n item based on latest played track_id
    """
    # Generate the similarity score
    n_tracks = len(track_data.index)
    similarity_score = np.zeros(n_tracks)

    # Iterate the whole tracks
    track_target = track_data.loc[track_id]
    for i, track_id_i in enumerate(tqdm(track_data.index)):
        # Extract track_i
        track_i = track_data.loc[track_id_i]

        # Calculate the similarity
        sim_i = similarity_func(vec_A = track_target,
                                vec_B = track_i)

        # Append
        similarity_score[i] = sim_i

    # Sort in descending orders of similarity_score
    sorted_idx = np.argsort(similarity_score)[::-1]

    # Return the n top similar track_id
    top_tracks_id = track_data.index[sorted_idx[1:n+1]]

    return top_tracks_id

Find recomendation movie name

In [26]:
from fuzzywuzzy import process

def get_movie_id_fuzzy(movie_name, movie_data):
    """
    Get the movie ID based on the movie name using fuzzy matching
    """
    # Create a list of movie titles from the dataset
    movie_titles = movie_data['title'].tolist()
    
    # Use fuzzy matching to find the closest match
    match = process.extractOne(movie_name, movie_titles)
    
    # Check if a match is found and get the movie ID
    if match[1] >= 70:  # Adjust the threshold as needed
        movie_id = movie_data[movie_data['title'] == match[0]].index.values[0]
        return movie_id
    else:
        print("No close match found for the movie name.")
        return None



In [27]:
movie_name_partial = "Wheels on"
movie_id = get_movie_id_fuzzy(movie_name_partial, movie_data)
if movie_id is not None:
    print("Movie ID:", movie_id)
    print(movie_data.loc[[movie_id]].T)

Movie ID: 11205
id                                  11205
title                     Wheels on Meals
genre         Romance,Action,Comedy,Crime
popularity                         19.549
vote_average                          7.0
vote_count                            282


In [28]:
top_movie_id = track_recommendation(track_id = movie_id,
                                     n = 5,
                                     track_data = movie_data_ohe,
                                     similarity_func = cosine_similarity)

top_movie_id

100%|██████████| 497/497 [00:00<00:00, 1245.96it/s]


Index([531306, 16161, 5991, 23706, 70577], dtype='int64', name='id')

In [29]:
# result of movie
selected_movies = movie_data.loc[top_movie_id].T

selected_movies

id,531306,16161,5991,23706,70577
title,Rim of the World,Baby Boy,The Last Laugh,All About Steve,Faces in the Crowd
genre,"Science Fiction,Adventure,Action,Comedy","Crime,Drama,Romance,Thriller",Drama,Comedy,"Thriller,Crime"
popularity,19.647,10.768,5.843,9.393,8.844
vote_average,6.0,7.1,7.8,4.9,6.1
vote_count,808,236,224,843,402


In [31]:
# Save data vectorize
movie_data_ohe.to_csv('backend/movie_data_ohe.csv', index=False) 
movie_data.to_csv('backend/movie_data.csv', index=False)