<a href="https://colab.research.google.com/github/salmacmpeg/Assignment1_PR/blob/master/Movies_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers  diffusers   pandas

In [2]:
!pip install -U datasets huggingface_hub fsspec

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.5-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.33.5-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.7/515.7 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, huggingface_hub, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.7.0
    Uninstalling fsspec-2025.7.0:
      Suc

In [3]:
import torch
from google.colab import userdata
from huggingface_hub import login
from transformers import pipeline
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import os
import re
from google.colab import drive


In [4]:
# Credentials: read the Hugging Face token from the Colab secrets store
# (so it is never hard-coded in the notebook) and log in to the Hub.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [92]:
# Mount Google Drive so the cache files below can be read from / written to it.
drive.mount('/content/drive')
# Cache file for the movie embeddings matrix (read with np.load / written with np.save further down).
saving_embeddings_path ='/content/drive/MyDrive/Recommendation/embeddings_movies.npy'
# Cache file for the movies table (read with pd.read_csv in the data-loading cell).
data_saving_path ='/content/drive/MyDrive/Recommendation/pandas_data.csv'
# Show up to 100 characters per cell when pandas displays a frame.
pd.set_option('display.max_colwidth', 100)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [94]:
try:
  # Fast path: load the movies table from the CSV cached on Google Drive.
  pandas_data = pd.read_csv(data_saving_path)
except Exception as load_error:
  # Fallback: download the dataset from the Hugging Face Hub.
  # `Exception` (instead of a bare `except:`) keeps KeyboardInterrupt / SystemExit
  # working, and the message makes the reason for the fallback visible.
  print(f'could not read {data_saving_path} ({load_error!r}); loading the dataset from Hugging Face instead')
  dataset = load_dataset('bloc4488/TMDB-all-movies')
  pandas_data = pd.DataFrame(dataset['train'])

# Uncomment to refresh the CSV cache on Drive after a Hub download.
# pandas_data.to_csv(data_saving_path, index=False)

In [42]:
#preprocess_data
def preprocess(df_movie_orig: pd.DataFrame, verbose:bool = False) -> pd.DataFrame:
    """Clean the raw movies table and return a new, re-indexed frame.

    Steps, in order:
      1. drop the columns listed in ``droped_columns`` (those that are present);
      2. drop every row that still contains a missing value;
      3. drop rows whose ``vote_average`` is exactly 0.0;
      4. reduce ``release_date`` to its year (text before the first ``-``) as int;
      5. round ``popularity`` to the nearest integer;
      6. keep only the first comma-separated entry of ``production_companies``
         and remove the spaces inside it;
      7. reset the index to 0..n-1.

    Args:
        df_movie_orig: raw movies table. It is not modified.
        verbose: when True, print a progress line after each step.

    Returns:
        The cleaned DataFrame with a fresh RangeIndex.
    """
    droped_columns = ['Unnamed: 0', 'id', 'revenue', 'runtime', 'vote_count',
                  'budget', 'imdb_id', 'original_language', 'original_title',
                  'tagline', 'production_countries', 'spoken_languages',
                  'producers','music_composer', 'director_of_photography', 'status']
    if verbose: print(f'Dropping columns: {droped_columns}')

    # errors='ignore': the table can come from two sources (CSV cache or the Hub)
    # and not every listed column is guaranteed to exist in both (e.g. 'Unnamed: 0').
    beneficial_data = df_movie_orig.drop(columns=droped_columns, errors='ignore')
    if verbose: print(f'Columns left: {beneficial_data.columns}, and shape is {beneficial_data.shape}')

    beneficial_data = beneficial_data.dropna()
    if verbose: print(f'Dropping nan rows .... , and shape is {beneficial_data.shape}')

    # .copy() so the column assignments below act on an independent frame
    # (avoids SettingWithCopyWarning on the filtered slice).
    beneficial_data = beneficial_data[beneficial_data['vote_average'] != 0.0].copy()
    if verbose: print(f'Dropping rows  that has zero voting values .... , and shape is {beneficial_data.shape}')

    # Keep only the year part of the release date and store it as int.
    beneficial_data['release_date'] = (
        beneficial_data['release_date'].str.strip().str.split('-').str[0].astype(int)
    )
    if verbose: print('keeping only the year in the realse date, save it as int .....')

    beneficial_data['popularity'] = beneficial_data['popularity'].round().astype(int)
    if verbose: print('converting popularity float values to the nearest integers .....')

    # First company only, with inner spaces removed so the name becomes a single token.
    beneficial_data['production_companies'] = (
        beneficial_data['production_companies']
        .str.split(',').str[0]
        .str.replace(' ', '', regex=False)
    )
    if verbose: print('Taking only the first production_company and remove the spaces in the name .....')

    return beneficial_data.reset_index(drop=True)


# Create the clean data frame shared by the two recommendation models, and add a
# single text column (title + overview + genres) that is later fed to the embedder.
clean_data = preprocess(pandas_data)
clean_data['data_combined'] = (
    'title: ' + clean_data['title']
    + ' overview: ' + clean_data['overview']
    + ', genres: ' + clean_data['genres']
)

In [43]:
#build a count-vectorizer matrix to get a bag-of-words representation of a text column
def create_count_matrix( series: pd.Series, verbose:bool = True, name:str ='', min_df: float = 0.05) -> pd.DataFrame:
    """Turn a text column into a dense word-count feature frame.

    Args:
        series: text values, one document per row.
        verbose: when True, print the shape of the resulting matrix.
        name: label used only in the verbose message.
        min_df: passed to ``CountVectorizer(min_df=...)``; the default keeps the
            previously hard-coded value of 0.05.

    Returns:
        DataFrame with one column per retained vocabulary term and one row per
        input document. Rows carry the index of ``series`` (when it has one) so
        that a later column-wise ``pd.concat`` aligns with the source frame.
    """
    cv = CountVectorizer(stop_words='english', ngram_range=(1,1), min_df=min_df)
    tdm = cv.fit_transform(series)
    # Reuse the caller's index instead of a fresh 0..n-1 one; plain sequences
    # without an index fall back to the default.
    vector_word_df = pd.DataFrame(
        tdm.toarray(),
        columns=cv.get_feature_names_out(),
        index=getattr(series, 'index', None),
    )
    if verbose: print(f' created count matrix for series {name} with shape {vector_word_df.shape}')
    return vector_word_df

# Word-count features, built separately for the overview text and for the genres column.
vector_word_df_overview = create_count_matrix(clean_data['overview'], name='overview')
vector_word_df_generes = create_count_matrix(clean_data['genres'], name='genres')

 created count matrix for series overview with shape (8322, 19)
 created count matrix for series genres with shape (8322, 14)


In [44]:
#prepare the feature data frame for the knn model: genres counts, overview counts and the
#numeric vote_average / popularity columns, joined column-wise (axis=1 aligns rows by index label)
KNN_features_db= pd.concat([vector_word_df_generes,vector_word_df_overview,clean_data[['vote_average', 'popularity']]], axis=1)
KNN_features_db.shape

(8322, 35)

In [45]:
#training the knn recommender model
def fit_knn(features_db: pd.DataFrame):
  """Standardize the feature table and fit a nearest-neighbours index on it.

  Args:
      features_db: numeric feature table, one row per movie.

  Returns:
      (knn_model, X_scaled): the fitted ``NearestNeighbors`` model and the
      standardized feature matrix it was fitted on. Queries must use rows of
      ``X_scaled`` (same scaling as the fitted data).
  """
  scaler = StandardScaler()
  X_scaled = scaler.fit_transform(features_db)
  knn_model = NearestNeighbors(n_neighbors=5, algorithm='auto')
  knn_model.fit(X_scaled)
  # The previous all-rows kneighbors() query was removed: its result was never
  # used and it cost a full neighbour search over the whole dataset.
  return knn_model,X_scaled
# Fit once on the full feature table; the scaled matrix is kept because queries index into it by row.
knn_model,KNN_features_db_scaled = fit_knn(KNN_features_db)

In [46]:
#query the knn recommender model with movie index
def get_knn_movies_recommendation(knn_model,scaled_features_db,movie_data,movie_index,nmovies=5):
    """Return the nearest neighbours of one movie and a relative closeness score.

    Args:
        knn_model: fitted model exposing ``kneighbors``.
        scaled_features_db: scaled feature matrix; row ``movie_index`` is the query.
        movie_data: movie details frame. Kept for interface compatibility; it is
            not needed to compute the result.
        movie_index: row position of the query movie in ``scaled_features_db``.
        nmovies: number of neighbours to request.

    Returns:
        (indices, sentiment): neighbour row indices (nearest first) and, for
        each, ``1 - d`` where ``d`` is the neighbour distance min-max scaled to
        [0, 0.15] *within this result set*. Scores therefore always lie in
        [0.85, 1] and are relative to the returned neighbours, not absolute.
    """
    query_row = scaled_features_db[movie_index].reshape(1, -1)
    neighbours_dis, neighbours_ind = knn_model.kneighbors(query_row, return_distance=True, n_neighbors=nmovies)

    # Rescale distances of this neighbour set into [0, 0.15], then flip into a score.
    scaler2 = MinMaxScaler(feature_range=(0,0.15))
    distance_scaled = scaler2.fit_transform(neighbours_dis.reshape(-1, 1))
    sentiment = (1-distance_scaled).flatten()

    return neighbours_ind[0].tolist(),sentiment.tolist()

# Smoke test: query the neighbours of one movie. The first returned entry is skipped
# when printing -- presumably it is the query movie itself (distance 0); TODO confirm.
movie_index=710
neigh_ind, neigh_sentiment = get_knn_movies_recommendation(knn_model,KNN_features_db_scaled,clean_data[['title', 'overview', 'genres']],movie_index,5)
print(f"neigh_ind for movie {movie_index}: ",neigh_ind[1:])
print(f"neigh_sentiment for movie {movie_index}: ",neigh_sentiment[1:])


neigh_ind for movie 710:  [3118, 4279, 7687, 4278]
neigh_sentiment for movie 710:  [0.896815595661713, 0.8858058859390034, 0.861610898781777, 0.85]


In [50]:
# A helper function to find the movie index by its name
def search_movies_by_name(name:str, data:pd.DataFrame):
    """Find movies whose title contains ``name`` (case-insensitive, literal match).

    Args:
        name: text to look for; regex metacharacters are escaped, so it is
            matched literally.
        data: frame with a ``title`` column to search in.

    Returns:
        The matching titles as a Series, keeping the index labels of ``data``.
        Empty when nothing matches. Rows with a missing title never match.
    """
    escaped_name = re.escape(name)
    titles = data['title']
    # Search the frame that was passed in (not a module-level global);
    # na=False keeps the mask strictly boolean when a title is missing.
    return titles[titles.str.contains(escaped_name, case=False, regex=True, na=False)]
# Example lookup: list every title containing "mermaid" together with its row index.
search_movies_by_name('mermaid', clean_data)

Unnamed: 0,title
1015,Mississippi Mermaid
2281,Mermaids
4390,The Little Mermaid
5014,The Little Mermaid II: Return to the Sea
6474,Barbie: Fairytopia - Mermaidia
6728,The Little Mermaid: Ariel's Beginning


In [14]:
#helper functions for the feature-extraction / cosine-similarity model
def _building_feature_extractor_model():
  """Create the Hugging Face feature-extraction pipeline used to embed movie text.

  Returns:
      A ``transformers`` pipeline for the "feature-extraction" task backed by
      ``sentence-transformers/all-MiniLM-L6-v2``, with automatic device placement.
  """
  return pipeline(
      task="feature-extraction",
      model="sentence-transformers/all-MiniLM-L6-v2",
      device_map="auto",
  )
def _generating_saving_embeddings(feature_extractor, series:pd.Series, save_path:str, save:bool =True):
  """Embed every text in ``series`` and optionally persist the stacked matrix.

  Args:
      feature_extractor: callable applied to each text; element ``[0][0]`` of its
          output is used as that text's vector (presumably the first token
          vector of the first sequence -- TODO confirm against the pipeline output).
      series: texts to embed, one per row.
      save_path: intended output file. See the review note below: it is not
          read by this function at the moment.
      save: when True, write the matrix to disk with ``np.save``.

  Returns:
      2-D array with one row per text; its shape is printed.
  """
  tqdm.pandas()  # registers Series.progress_apply so the loop shows a progress bar
  per_text_vectors = series.progress_apply(lambda text: feature_extractor(text)[0][0])
  embeddings_movies = np.vstack(per_text_vectors)
  print(embeddings_movies.shape)
  if not save:
    return embeddings_movies
  # NOTE(review): `save_path` is ignored -- the array is always written to the
  # module-level `saving_embeddings_path`. Honoring `save_path` would redirect
  # the output for any caller that passes a different path, so callers must be
  # checked before this is changed.
  np.save(saving_embeddings_path, embeddings_movies)
  return embeddings_movies

In [16]:
# load the cached embeddings, or create them with the feature extraction model
def load_or_creat_embeddings(data:pd.Series, path:str=''):
  """Return the embeddings matrix, loading it from ``path`` when that file exists.

  Args:
      data: texts to embed when no cached file is found.
      path: location of the cached ``.npy`` file. It is checked for existence,
          loaded when present, and otherwise forwarded as the save location.

  Returns:
      The embeddings array, either loaded from disk or freshly generated.
  """
  if os.path.exists(path):
    embeddings_movies = np.load(path)
    # Report success only after the file was actually read.
    print("embeddings_movies loaded sucessfully")
  else:
    print("couldnt find embeddings_movies file, loading feature extractor model ")
    feature_extractor = _building_feature_extractor_model()
    print("building embeddings , this might take a while ..... ")
    # Embed the series that was passed in (not a module-level global) and
    # forward the same path that was checked above.
    embeddings_movies = _generating_saving_embeddings(feature_extractor, data, path, True)
  return embeddings_movies
# Load the embeddings cached on Drive if present; otherwise build them from the combined text column.
embeddings_movies = load_or_creat_embeddings(clean_data['data_combined'], saving_embeddings_path)


couldnt find embeddings_movies file, loading feature extractor model 


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


building embeddings , this might take a while ..... 


  0%|          | 0/8322 [00:00<?, ?it/s]

(8322, 384)


In [51]:
#query the second recommendation model: rank all movies by cosine similarity to a given movie index
def get_cosine_similarity_recom(embeddings, movie_index, movie_details, nmovies=3):
    """Rank movies by cosine similarity of their embeddings to one movie.

    Args:
        embeddings: embedding matrix, one row per movie.
        movie_index: row of ``embeddings`` used as the query.
        movie_details: frame joined column-wise with the scores; its index
            labels are what gets returned.
        nmovies: number of recommendations wanted.

    Returns:
        (indices, scores) for the ``nmovies + 1`` highest-scoring rows, best
        first. One extra row is returned, presumably because the query movie
        matches itself and ranks first -- TODO confirm how callers use entry 0.
    """
    query_vector = np.array(embeddings[movie_index]).reshape(1, -1)

    # Similarity of the query against every movie, as a named column.
    scores = pd.Series(
        cosine_similarity(query_vector, embeddings).flatten(),
        name='similarity_score',
    )

    # Attach the scores to the movie details, then keep the best rows.
    ranked = (
        pd.concat([movie_details, scores], axis=1)
        .sort_values('similarity_score', ascending=False)
        .head(nmovies + 1)
    )
    return ranked.index.tolist(), ranked['similarity_score'].tolist()
# get_cosine_similarity_recom(embeddings_movies, 710, clean_data[['title', 'overview', 'genres']], nmovies=5)

In [53]:
#merge the recommendations of both models in one list
def mix_recommendation(movie_index:int, name:str, search_by:str, verbose:bool =False):
  """Combine KNN and cosine-similarity recommendations for one movie.

  Args:
      movie_index: row index of the query movie; used as given unless
          ``search_by == 'name'``.
      name: title text to search for when ``search_by == 'name'``.
      search_by: ``'name'`` resolves the query movie as the first title that
          contains ``name``; any other value uses ``movie_index`` directly.
      verbose: when True, print the intermediate index / score lists.

  Returns:
      (all_indexes, all_sentiment): the KNN neighbours (first one skipped)
      followed by the cosine-similarity results, with their scores in the same
      order. ``(None, None)`` when a name search finds nothing.

      NOTE(review): the two score lists are on different scales (rescaled KNN
      distance vs. cosine similarity), and the cosine list is used from entry 0,
      which presumably is the query movie itself -- confirm this is intended.
  """
  if search_by == 'name':
    named_movies = search_movies_by_name(name, clean_data)
    # Explicit "no match" check instead of a bare `except:` around an index
    # lookup, so unrelated errors are no longer silently swallowed.
    if named_movies.empty:
      print("couldnot find a similar movie name .. ")
      return None, None
    movie_index = named_movies.index.tolist()[0]
    print("found movies names with similar name :\n", named_movies)
    print(f'searching for movies similar to movie with index {movie_index}, and title: {named_movies.iloc[0]}')

  neigh_ind, neigh_sentiment = get_knn_movies_recommendation(knn_model,KNN_features_db_scaled,clean_data[['title', 'overview', 'genres']],movie_index,5)
  if verbose: print(f"knn recommended ind for movie {movie_index}: ",neigh_ind[1:])
  if verbose: print(f"knn recommended sentiment for movie {movie_index}: ",neigh_sentiment[1:])

  f_index, f_sentiment = get_cosine_similarity_recom(embeddings_movies, movie_index, clean_data[['title', 'overview', 'genres']], nmovies=5)
  if verbose: print(f"Feature cos similarity ind for movie {movie_index}: ",f_index[0:])
  if verbose: print(f"Feature cos similarity sentiment for movie {movie_index}: ",f_sentiment[0:])

  # KNN results without their first entry, then every cosine-similarity result.
  all_indexes = neigh_ind[1:] + f_index[0:]
  all_sentiment = neigh_sentiment[1:] + f_sentiment[0:]
  if verbose: print(f"all recommended ind for movie {movie_index}: ",all_indexes)
  if verbose: print(f"all recommended sentiment for movie {movie_index}: ",all_sentiment)

  return all_indexes,all_sentiment


In [None]:
def get_similar_movies(data, movie_index:int, name:str, search_by:str):
  """Build a table of recommended movies with their similarity scores.

  Args:
      data: movie details frame; recommended rows are taken from it by position.
      movie_index: row index of the query movie (used when not searching by name).
      name: title text of the query movie (used when ``search_by == 'name'``).
      search_by: forwarded to ``mix_recommendation``.

  Returns:
      DataFrame of the recommended rows of ``data`` plus a ``Similarity Score``
      column, de-duplicated and sorted by score (highest first); ``None`` when
      no recommendation could be produced.
  """
  try:
    # Forward the caller's movie_index (it used to be overridden by a constant).
    all_indexes,all_sentiment= mix_recommendation(movie_index=movie_index, name=name, search_by=search_by)
    if all_indexes is None:
      # mix_recommendation found no movie matching the name.
      print("try again with a different name .. ")
      return None

    df = pd.DataFrame(list(zip(all_indexes, all_sentiment)), columns=['all_indexes', 'Similarity Score'])
    # A movie can be proposed by both models: keep its first occurrence only.
    df_sorted = (
        df.drop_duplicates(subset=['all_indexes'], keep='first')
          .sort_values(by='Similarity Score', ascending=False)
          .set_index('all_indexes')
          .rename_axis(None)
    )

    print("list of movies indices", df_sorted.index)

    movies= pd.concat([data.iloc[df_sorted.index],df_sorted], axis=1)

    return movies

  except Exception as error:
    # Best-effort for interactive use: report the failure instead of raising,
    # while still letting KeyboardInterrupt / SystemExit through.
    print(f"recommendation failed: {error!r}")
    print("try again with a different name .. ")
    return None

# mname= input("enter a movie name ...")
# get_similar_movies(clean_data[['title', 'overview','genres', 'vote_average']], movie_index=710, name=mname, search_by='name')

In [59]:
# Example lookup: titles containing "cars", shown as a plain list.
res = search_movies_by_name('cars',clean_data)
res.tolist()

['Cars',
 'Old Men in New Cars: In China They Eat Dogs II',
 'Riding in Cars with Boys',
 'Used Cars']

In [80]:
import gradio as gr
import pandas as pd

# Gradio interface
def search_titles(query):
    """Return a Gradio update that fills the title dropdown with titles matching ``query``."""
    matching_titles = search_movies_by_name(query, clean_data).tolist()
    return gr.update(choices=matching_titles)

def generate_dataframe(selected_title):
    """Build the recommendations table shown in the UI for the selected title.

    Args:
        selected_title: title chosen in the dropdown; may be None/empty when
            the dropdown has no selection.

    Returns:
        DataFrame of recommended movies with overviews truncated to 100
        characters, or an empty table with the same columns when there is no
        selection or no recommendation could be produced.

    NOTE(review): the title is resolved through a substring search, so the
    query movie is the first title *containing* the selected text -- confirm
    that this always is the selected movie.
    """
    detail_columns = ['title', 'overview','genres', 'vote_average']
    empty_result = pd.DataFrame(columns=detail_columns + ['Similarity Score'])

    # The change event can fire without a selection; there is nothing to look up then.
    if not selected_title:
        return empty_result

    # movie_index is only a placeholder here: with search_by='name' the query
    # movie is resolved from the title.
    movies =get_similar_movies(clean_data[detail_columns], movie_index=710, name=selected_title, search_by='name')
    if movies is None:
        # get_similar_movies signals failure with None; show an empty table
        # instead of failing on the column access below.
        return empty_result

    movies["overview"] = movies["overview"].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)
    return movies

# Gradio UI: a keyword search fills a dropdown with matching titles; picking a
# title fills the table with recommendations.
with gr.Blocks() as demo:
    # Inline CSS for the search button (targeted through elem_id="my-btn").
    # NOTE(review): the CSS comment says "Green" but #000080 is navy, and
    # `margin: 5 auto` has no unit on the 5 -- confirm the intended styling.
    gr.HTML(
        """
        <style>
            #my-btn {
                background-color: #000080 !important; /* Green */
                color: white !important;
                margin: 5 auto;
                display: block;
            }
        </style>
        """
    )

    gr.Markdown("## 🎬 Movie Finder")

    # One row with three columns: keyword box, search button, title dropdown.
    with gr.Row():
        with gr.Column():
          query_input = gr.Textbox(label="Enter movie keyword", placeholder="e.g. cars, matrix")
        with gr.Column():
          search_button = gr.Button("Search" , elem_id="my-btn")
        with gr.Column():
          title_dropdown = gr.Dropdown(label="Select one of the following matching titles", choices=[])
    #result_df = gr.Dataframe(label="Similar Movies")

    # Start with an empty table that already has the expected column headers.
    placeholder_df = pd.DataFrame(columns=["title", "overview", "genres", "vote_average"])
    result_df = gr.Dataframe(label="Movies recommendations", value=placeholder_df)

    # Wiring: button click -> refresh dropdown choices; dropdown change -> refresh table.
    search_button.click(fn=search_titles, inputs=query_input, outputs=title_dropdown)
    title_dropdown.change(fn=generate_dataframe, inputs=title_dropdown, outputs=result_df)

# Start the app (debug output disabled).
demo.launch(debug= False)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://01c03e0bf4723229a2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
'''Data Exploration code

pandas_data.columns

#non beneficial -> to be removed
pandas_data['tagline'].isnull().sum() #contains null , i dont think will be beneficial in our motive
pandas_data['genres'] #needs splitting on comma and changing into list
print(pandas_data['production_companies'].isna().sum()) #contains some na values
print(pandas_data['production_countries'].isna().sum()) #i dont need that i think
pandas_data['spoken_languages'].unique()[:10] #i dont need that i think
pandas_data['producers'][:2]# i dont think this is beneficial
pandas_data['music_composer'].isna().sum()# i dont think this is beneficial

#beneficial -> needs processing
pandas_data['cast'][:2] #beneficial and needs processing
pandas_data['director'].isna().sum() #beneficial and needs processing
pandas_data['director_of_photography'].isna().sum() #beneficial and needs processing
pandas_data['writers'].isna().sum()#beneficial and needs processing
pandas_data['status'].unique()#beneficial and needs processing
pandas_data['release_date'].head(2)#beneficial and needs processing


columnsdrop = ['Unnamed: 0', 'revenue', 'runtime', 'vote_count',
                  'budget', 'imdb_id', 'original_language',
                  'tagline', 'production_countries', 'spoken_languages',
                  'producers','music_composer', 'director_of_photography', 'status']
beneficial_data=pandas_data.drop(columnsdrop, axis=1)

beneficial_data.columns


# beneficial_data['cast'] = beneficial_data['cast'].apply(lambda x: x.split(',')[0:3]) #take the first three comma separated names in the cast only
# beneficial_data['cast'] = beneficial_data['cast'].apply(lambda x: [y.replace(' ', '') for y in x] )#string join each entity in the cast list
# beneficial_data['cast'] = beneficial_data['cast'].apply(lambda x: ','.join(x)) #convert the list in each entity to a comma separated string
# if verbose: print('Taking only the first three cast members and remove the spaces in the name .....')

# beneficial_data['director'] = beneficial_data['director'].apply(lambda x: x.split(',')[0])
# beneficial_data['director'] = beneficial_data['director'].apply(lambda x: x.replace(' ', ''))
# if verbose: print('Taking only the first director and remove the spaces in the name .....')

# beneficial_data['writers'] = beneficial_data['writers'].apply(lambda x: x.split(',')[0])
# beneficial_data['writers'] = beneficial_data['writers'].apply(lambda x: x.replace(' ', ''))
# if verbose: print('Taking only the first writer and remove the spaces in the name .....')

#helping plots
from matplotlib import pyplot as plt
pandas_data.plot(kind='scatter', x='vote_average', y='vote_count', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

from matplotlib import pyplot as plt
beneficial_data['popularity'].plot(kind='hist', bins=20, title='popularity', xlim=(0, 150))
plt.gca().spines[['top', 'right',]].set_visible(False)

'''