<a href="https://colab.research.google.com/github/salmacmpeg/SMR_movies_recommendation/blob/main/Movies_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Quiet (-q) install of transformers, diffusers and pandas.
# NOTE(review): diffusers is not imported anywhere in the visible notebook - confirm it is still needed.
!pip install -q transformers  diffusers   pandas

In [None]:
# Upgrade (-U) datasets, huggingface_hub and fsspec.
# NOTE(review): no versions are pinned, so the installed releases can differ between runs.
!pip install -U datasets huggingface_hub fsspec

In [None]:
import torch
from google.colab import userdata
from huggingface_hub import login
from transformers import pipeline
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import os
import re
from google.colab import drive


In [None]:
# Credentials: read the Hugging Face token stored under the name 'HF_TOKEN'
# through google.colab.userdata, so the token is never written in the notebook.
hf_token = userdata.get('HF_TOKEN')
# Log in to the Hugging Face hub; add_to_git_credential=True also hands the token to git's credential store.
login(hf_token, add_to_git_credential=True)

In [None]:
# Mount Google Drive under /content/drive so the cached files below are reachable.
drive.mount('/content/drive')
# Cache locations on Drive: the movie embeddings (.npy) and the movie table (.csv).
saving_embeddings_path ='/content/drive/MyDrive/Recommendation/embeddings_movies.npy'
data_saving_path ='/content/drive/MyDrive/Recommendation/pandas_data.csv'
# Show up to 100 characters of each text cell when a frame is displayed.
pd.set_option('display.max_colwidth', 100)

In [None]:
try:
  # Preferred source: the movie table cached as CSV on the mounted drive.
  pandas_data = pd.read_csv(data_saving_path)

except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as load_error:
  # Only a missing/unreadable/corrupt cache triggers the fallback; anything else
  # (e.g. an interrupt) is allowed to surface instead of being silently swallowed.
  print(f'could not read {data_saving_path} ({load_error!r}), loading the dataset from the Hugging Face hub instead')
  dataset = load_dataset('bloc4488/TMDB-all-movies')
  # to_pandas() converts the whole split at once instead of iterating it row by row.
  pandas_data = dataset['train'].to_pandas()

# pandas_data.to_csv(data_saving_path, index=False)

In [None]:
#preprocess_data
def preprocess(df_movie_orig: pd.DataFrame, verbose:bool = False) -> pd.DataFrame:
    """Return a cleaned copy of the raw movie table; the input frame is left untouched.

    Steps, in order:
      1. drop the columns listed in ``droped_columns`` (those that are present),
      2. drop every row that still contains a missing value,
      3. drop rows whose ``vote_average`` is exactly 0,
      4. reduce ``release_date`` to the text before its first '-' and store it as int,
      5. round ``popularity`` to the nearest integer,
      6. keep only the first comma-separated entry of ``production_companies``
         and remove the spaces inside it,
      7. reset the index to 0..n-1.

    Args:
        df_movie_orig: raw movie table. Must contain ``vote_average``,
            ``release_date``, ``popularity`` and ``production_companies``.
        verbose: when True, print a progress line after each step.

    Returns:
        A new, cleaned DataFrame with a fresh 0..n-1 index.
    """

    droped_columns = ['Unnamed: 0', 'id', 'revenue', 'runtime', 'vote_count',
                  'budget', 'imdb_id', 'original_language', 'original_title',
                  'tagline', 'production_countries', 'spoken_languages',
                  'producers','music_composer', 'director_of_photography', 'status']
    if verbose: print(f'Dropping columns: {droped_columns}')

    # errors='ignore': the two data sources do not necessarily carry the same
    # columns (e.g. an 'Unnamed: 0' index column), so a missing one must not abort the cleaning.
    beneficial_data = df_movie_orig.drop(columns=droped_columns, errors='ignore')
    if verbose: print(f'Columns left: {beneficial_data.columns}, and shape is {beneficial_data.shape}')

    beneficial_data = beneficial_data.dropna()
    if verbose: print(f'Dropping nan rows .... , and shape is {beneficial_data.shape}')

    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below never write into a slice of another frame.
    beneficial_data = beneficial_data[beneficial_data['vote_average'] != 0.0].copy()
    if verbose: print(f'Dropping rows  that has zero voting values .... , and shape is {beneficial_data.shape}')

    # keep the year only, stored as int
    beneficial_data['release_date'] = (
        beneficial_data['release_date'].str.strip().str.split('-').str[0].astype(int)
    )
    if verbose: print('keeping only the year in the realse date, save it as int .....')

    beneficial_data['popularity'] = beneficial_data['popularity'].round().astype(int)
    if verbose: print('converting popularity float values to the nearest integers .....')

    # first company only, with its spaces removed so the name is a single token
    beneficial_data['production_companies'] = (
        beneficial_data['production_companies']
        .str.split(',').str[0]
        .str.replace(' ', '', regex=False)
    )
    if verbose: print('Taking only the first production_company and remove the spaces in the name .....')

    return beneficial_data.reset_index(drop=True)


# Create the clean data frame that both recommendation models work on.
clean_data = preprocess(pandas_data, verbose=False)
# One text field per movie (title + overview + genres); later cells pass this column to the embedding step.
clean_data['data_combined'] = 'title: '+clean_data['title']+ ' overview: ' + clean_data['overview'] + ', genres: ' + clean_data['genres']

In [None]:
# Build a count-vectorizer matrix for a text column (used for the overview and genres columns)
def create_count_matrix( series: pd.Series, verbose:bool = True, name:str ='') -> pd.DataFrame:
    """Turn a text column into a dense word-count table.

    Single words only (unigrams), English stop words removed, and ``min_df=0.05``
    as the document-frequency cut-off for a word to become a column.

    Args:
        series: the text column to vectorize.
        verbose: when True, print the shape of the resulting table.
        name: label used only in the verbose message.

    Returns:
        DataFrame with one row per input text and one column per kept word.
    """
    vectorizer = CountVectorizer(min_df=0.05, ngram_range=(1,1), stop_words='english')
    dense_counts = vectorizer.fit_transform(series).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    vector_word_df = pd.DataFrame(dense_counts, columns=vocabulary)
    if verbose:
        print(f' created count matrix for series {name} with shape {vector_word_df.shape}')
    return vector_word_df

# Word-count tables for the two text columns; both are joined into the KNN feature frame afterwards.
vector_word_df_overview = create_count_matrix(clean_data['overview'], name='overview')
vector_word_df_generes = create_count_matrix(clean_data['genres'], name='genres')

In [None]:
# Prepare the feature frame for the KNN model: genre counts, overview counts and the
# two numeric columns placed side by side (axis=1).
# The column-wise concat aligns rows on the index, so it relies on clean_data having a 0..n-1 index.
KNN_features_db= pd.concat([vector_word_df_generes,vector_word_df_overview,clean_data[['vote_average', 'popularity']]], axis=1)
# Displayed as the cell output: (number of movies, number of features).
KNN_features_db.shape

In [None]:
#training the knn recommender model
def fit_knn(features_db: pd.DataFrame):
  """Standardize the feature table and fit a nearest-neighbours index on it.

  Args:
      features_db: numeric feature table, one row per movie.

  Returns:
      (knn_model, X_scaled): the fitted NearestNeighbors model and the
      standardized feature array it was fitted on. Queries must use rows of
      X_scaled, since the model only knows the scaled space.
  """
  scaler = StandardScaler()
  X_scaled = scaler.fit_transform(features_db)
  knn_model = NearestNeighbors(n_neighbors=5, algorithm='auto')
  knn_model.fit(X_scaled)
  # No neighbour query is run here: searching the whole table at fit time was
  # expensive and its result was never used. Queries happen per movie later.
  return knn_model,X_scaled
# Fit once; keep the scaled features because neighbour queries must be made in the scaled space.
knn_model,KNN_features_db_scaled = fit_knn(KNN_features_db)

In [None]:
#query the knn recommender model with movie index
def get_knn_movies_recommendation(knn_model,scaled_features_db,movie_data,movie_index,nmovies=5):
    """Return the nearest neighbours of one movie together with a score per neighbour.

    Args:
        knn_model: fitted model exposing ``kneighbors``.
        scaled_features_db: scaled feature array the model was fitted on.
        movie_data: movie details frame; accepted for interface compatibility, not used here.
        movie_index: row position of the query movie in ``scaled_features_db``.
        nmovies: number of neighbours to fetch.

    Returns:
        (indices, scores): neighbour row positions (closest first) and their
        scores. Distances are min-max scaled into [0, 0.15] and subtracted
        from 1, so scores lie in [0.85, 1] and the closest neighbour scores 1.0.
        The first entry is presumably the query movie itself (callers skip it).
    """
    query_row = scaled_features_db[movie_index].reshape(1, -1)
    neighbours_dis, neighbours_ind = knn_model.kneighbors(query_row, n_neighbors=nmovies, return_distance=True)

    # Rescale within this result only: scores are relative to the returned
    # neighbours, not comparable across different queries.
    scaler2 = MinMaxScaler(feature_range=(0,0.15))
    distance_scaled = scaler2.fit_transform(neighbours_dis.reshape(-1, 1))
    sentiment = (1-distance_scaled).flatten()

    return neighbours_ind[0].tolist(),sentiment.tolist()

# Example query: KNN recommendations for the movie at row 710.
movie_index=710
neigh_ind, neigh_sentiment = get_knn_movies_recommendation(knn_model,KNN_features_db_scaled,clean_data[['title', 'overview', 'genres']],movie_index,5)
# Position 0 is skipped when printing - presumably the query movie itself; confirm.
print(f"neigh_ind for movie {movie_index}: ",neigh_ind[1:])
print(f"neigh_sentiment for movie {movie_index}: ",neigh_sentiment[1:])


In [None]:
# A helper function to find the movie index by its name
def search_movies_by_name(name:str, data:pd.DataFrame):
    """Return the titles in ``data`` that contain ``name`` (case-insensitive).

    Args:
        name: text to look for; it is matched literally, so characters such
            as '+' or '(' have no special meaning.
        data: frame with a ``title`` column to search in.

    Returns:
        Series of matching titles that keeps the index of ``data``, so the
        index of each match identifies the movie's row.
    """
    escaped_name = re.escape(name)
    titles = data['title']
    # na=False: a missing title counts as "no match" instead of putting NaN into the mask.
    matches = titles.str.contains(escaped_name, case=False, regex=True, na=False)
    return titles[matches]
# Example lookup: titles containing 'mermaid', ignoring case.
search_movies_by_name('mermaid', clean_data)

In [None]:
# Helper functions for the feature-extraction / cosine-similarity model
def _building_feature_extractor_model():
  """Create the transformers feature-extraction pipeline used to embed movie text.

  Uses the ``sentence-transformers/all-MiniLM-L6-v2`` checkpoint and lets the
  library pick the device placement (``device_map="auto"``).
  """
  return pipeline(
      "feature-extraction",
      model="sentence-transformers/all-MiniLM-L6-v2",
      device_map="auto",
  )
def _generating_saving_embeddings(feature_extractor, series:pd.Series, save_path:str, save:bool =True):
  """Embed every text in ``series`` and stack the vectors into one 2-D array.

  Args:
      feature_extractor: callable applied to each text; element ``[0][0]`` of
          its output is kept (presumably the first token's vector - TODO confirm).
      series: the texts to embed.
      save_path: accepted but currently not used (see the note below).
      save: when True, write the stacked array to disk with ``np.save``.

  Returns:
      The stacked embeddings, one row per text.
  """
  # NOTE(review): `save_path` is ignored; the array is always written to the
  # module-level `saving_embeddings_path`. Confirm which one is intended.
  tqdm.pandas()  # enables Series.progress_apply

  def _embed(text):
    return feature_extractor(text)[0][0]

  per_text_vectors = series.progress_apply(_embed)
  embeddings_movies = np.vstack(per_text_vectors)
  print(embeddings_movies.shape)
  if not save:
    return embeddings_movies
  np.save(saving_embeddings_path, embeddings_movies)
  return embeddings_movies

In [None]:
# Load the cached embeddings, or create them with the feature-extraction model
def load_or_creat_embeddings(data:pd.Series, path:str=''):
  """Return the embeddings for ``data``, using the file at ``path`` as a cache.

  Args:
      data: the texts to embed, one per movie.
      path: location of the cached ``.npy`` file. When it exists it is loaded;
          otherwise the embeddings are built from ``data`` and ``path`` is
          handed to the builder as the save location.

  Returns:
      2-D array of embeddings, one row per text.
  """
  if os.path.exists(path):
    embeddings_movies = np.load(path)
    print("embeddings_movies loaded sucessfully")
    # Rows are later addressed by the movie's row position, so a cache built
    # from a different table would silently give wrong recommendations.
    if len(embeddings_movies) != len(data):
      print(f"warning: cached embeddings have {len(embeddings_movies)} rows but the data has {len(data)}")
  else:
    print("couldnt find embeddings_movies file, loading feature extractor model ")
    feature_extractor = _building_feature_extractor_model()
    print("building embeddings , this might take a while ..... ")
    # Use the arguments given by the caller rather than notebook globals.
    embeddings_movies = _generating_saving_embeddings(feature_extractor, data, path, True)
  return embeddings_movies
# Embeddings of the combined text column, with the Drive file as the cache location.
embeddings_movies = load_or_creat_embeddings(clean_data['data_combined'], saving_embeddings_path)


In [None]:
#query the second recommendation model by getting the cosine similarity for a given movie index
def get_cosine_similarity_recom(embeddings, movie_index, movie_details, nmovies=3):
    """Rank all movies by cosine similarity to one movie's embedding.

    Args:
        embeddings: 2-D array with one embedding row per movie.
        movie_index: row position of the query movie in ``embeddings``.
        movie_details: movie details frame; accepted for interface
            compatibility, the ranking itself only needs the embeddings.
        nmovies: number of similar movies wanted.

    Returns:
        (positions, scores): the ``nmovies + 1`` most similar row positions,
        best first, and their similarity scores. One extra entry is returned
        because the query movie is normally its own best match.
    """
    # get movie embedding
    m_embedding = np.asarray(embeddings[movie_index]).reshape(1, -1)

    # calculate cosine similarity scores against every movie
    similarity_scores = cosine_similarity(m_embedding, embeddings).ravel()

    # Rank by row position directly: this does not depend on how the details
    # frame is indexed and avoids copying and sorting that frame on every query.
    top_positions = np.argsort(-similarity_scores, kind='stable')[:nmovies + 1]

    # return the most similar movies, best first
    return top_positions.tolist(), similarity_scores[top_positions].tolist()
# get_cosine_similarity_recom(embeddings_movies, 710, clean_data[['title', 'overview', 'genres']], nmovies=5)

In [None]:
#merge the recommendations in one list
def mix_recommendation(movie_index:int, name:str, search_by:str, verbose:bool =False):
  """Combine the KNN and the cosine-similarity recommendations for one movie.

  Args:
      movie_index: row position of the query movie; used as given unless
          ``search_by == 'name'``.
      name: title text to look up when ``search_by == 'name'``.
      search_by: 'name' to resolve the movie from ``name`` (first match wins);
          any other value uses ``movie_index`` directly.
      verbose: when True, print the intermediate lists.

  Returns:
      (indexes, scores): the KNN results followed by the cosine results, or
      (None, None) when searching by name finds no title.
  """

  if search_by == 'name':
    named_movies = search_movies_by_name(name, clean_data)
    # Explicit "no match" check instead of catching whatever indexing raises.
    if named_movies.empty:
      print("couldnot find a similar movie name .. ")
      return None, None
    movie_index = named_movies.index.tolist()[0]
    print("found movies names with similar name :\n", named_movies)
    print(f'searching for movies similar to movie with index {movie_index}, and title: {named_movies.iloc[0]}')

  movie_details = clean_data[['title', 'overview', 'genres']]

  neigh_ind, neigh_sentiment = get_knn_movies_recommendation(knn_model,KNN_features_db_scaled,movie_details,movie_index,5)
  if verbose: print(f"knn recommended ind for movie {movie_index}: ",neigh_ind[1:])
  if verbose: print(f"knn recommended sentiment for movie {movie_index}: ",neigh_sentiment[1:])

  f_index, f_sentiment = get_cosine_similarity_recom(embeddings_movies, movie_index, movie_details, nmovies=5)
  if verbose: print(f"Feature cos similarity ind for movie {movie_index}: ",f_index[0:])
  if verbose: print(f"Feature cos similarity sentiment for movie {movie_index}: ",f_sentiment[0:])

  # NOTE(review): the KNN list skips position 0 while the cosine list keeps it,
  # which presumably leaves the query movie itself in the merged output - confirm this is intended.
  all_indexes = neigh_ind[1:] + f_index[0:]
  all_sentiment = neigh_sentiment[1:] + f_sentiment[0:]
  if verbose: print(f"all recommended ind for movie {movie_index}: ",all_indexes)
  if verbose: print(f"all recommended sentiment for movie {movie_index}: ",all_sentiment)

  return all_indexes,all_sentiment


In [None]:
def get_similar_movies(data, movie_index:int, name:str, search_by:str):
  """Build the table of recommended movies for one query movie.

  Args:
      data: movie details frame; recommended rows are taken from it by position.
      movie_index: row position of the query movie (used when not searching by name).
      name: title text to look up when ``search_by == 'name'``.
      search_by: forwarded to ``mix_recommendation``.

  Returns:
      DataFrame with the recommended movies' details plus a 'Similarity Score'
      column, best score first, or None when nothing could be recommended.
  """

  try:
    # Forward the caller's movie_index instead of a fixed row number.
    all_indexes,all_sentiment= mix_recommendation(movie_index=movie_index, name=name, search_by=search_by)
    if all_indexes is None:
      # No title matched: handled explicitly rather than through a failing zip().
      print("try again with a different name .. ")
      return None

    scored = pd.DataFrame(list(zip(all_indexes, all_sentiment)), columns=['all_indexes', 'Similarity Score'])
    # A movie suggested by both models is kept once (its first occurrence).
    df_sorted = (
        scored
        .drop_duplicates(subset=['all_indexes'], keep='first')
        .sort_values(by='Similarity Score', ascending=False)
        .set_index('all_indexes')
        .rename_axis(None)
    )

    print("list of movies indices", df_sorted.index)

    movies= pd.concat([data.iloc[df_sorted.index],df_sorted], axis=1)

    return movies

  except Exception as error:
    # Still best-effort, but the real cause is reported and interrupts are not swallowed.
    print(f"recommendation failed: {error!r}")
    print("try again with a different name .. ")
    return None

# mname= input("enter a movie name ...")
# get_similar_movies(clean_data[['title', 'overview','genres', 'vote_average']], movie_index=710, name=mname, search_by='name')

In [None]:
# Example: every title containing 'cars', shown as a plain list.
res = search_movies_by_name('cars',clean_data)
res.tolist()

In [None]:
import gradio as gr
import pandas as pd

# Gradio interface
def search_titles(query):
    """Fill the title dropdown with the titles that contain ``query``.

    Args:
        query: text typed by the user.

    Returns:
        A Gradio update for the dropdown choices. A blank query yields no
        choices, because an empty pattern would match every title.
    """
    if not query or not query.strip():
        return gr.update(choices=[])
    titles = search_movies_by_name(query, clean_data)
    # Each distinct title is listed once, in the order found.
    unique_titles = list(dict.fromkeys(titles.tolist()))
    return gr.update(choices=unique_titles)

def generate_dataframe(selected_title):
    """Return the recommendations table for the title picked in the dropdown.

    Args:
        selected_title: the chosen title, or None/'' when nothing is selected.

    Returns:
        DataFrame of recommended movies with overviews shortened to 100
        characters, or an empty table when there is no selection or no
        recommendation could be produced.
    """
    empty_result = pd.DataFrame(columns=["title", "overview", "genres", "vote_average"])
    # The change event can fire with no selection; there is nothing to look up then.
    if not selected_title:
        return empty_result

    movies =get_similar_movies(clean_data[['title', 'overview','genres', 'vote_average']], movie_index=710, name=selected_title, search_by='name')
    # get_similar_movies returns None on failure; show an empty table instead of crashing.
    if movies is None:
        return empty_result

    movies["overview"] = movies["overview"].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)
    return movies

# Layout: a search row (keyword box, search button, dropdown of matching titles)
# above a table that shows the recommendations for the selected title.
with gr.Blocks() as demo:
    # Styling for the search button (targeted through elem_id="my-btn").
    gr.HTML(
        """
        <style>
            #my-btn {
                background-color: #000080 !important; /* Navy */
                color: white !important;
                margin: 5px auto;
                display: block;
            }
        </style>
        """
    )

    gr.Markdown("## 🎬 Movie Finder")

    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(label="Enter movie keyword", placeholder="e.g. cars, matrix")
        with gr.Column():
            search_button = gr.Button("Search" , elem_id="my-btn")
        with gr.Column():
            title_dropdown = gr.Dropdown(label="Select one of the following matching titles", choices=[])

    # Start with an empty table so the column headers are visible before the first search.
    placeholder_df = pd.DataFrame(columns=["title", "overview", "genres", "vote_average"])
    result_df = gr.Dataframe(label="Movies recommendations", value=placeholder_df)

    # Search fills the dropdown; picking a title fills the table.
    search_button.click(fn=search_titles, inputs=query_input, outputs=title_dropdown)
    title_dropdown.change(fn=generate_dataframe, inputs=title_dropdown, outputs=result_df)

demo.launch(debug= False)