<a href="https://colab.research.google.com/github/ranesh88/YouTube-Videos-Recommendation-System-Using-BERT-Embeddings-/blob/main/Youtube_Videos_Recommendation_System_Using_Bert_Embeddings___Bert_Recommendation_System_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from transformers import BertTokenizer, TFBertModel
import torch
import re
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the exported video data, then narrow the frame to the selected columns
columns_to_keep = ['title','channelTitle','likes','dislikes','thumbnail_link','description']
df_yt = pd.read_csv('/content/US_videos_data.csv')[columns_to_keep]
df_yt.head()

In [None]:
# Show the (rows, columns) size of the frame after column selection
df_yt.shape

In [None]:
# When a title occurs more than once, keep its first row and discard the rest
df_yt = df_yt.drop_duplicates(subset='title', keep='first')

In [None]:
# Count the missing values in each column
df_yt.isna().sum()

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# The result is rebound to ``df_yt`` instead of using ``inplace=True``: the
# frame at this point was produced by the duplicate filter above, and
# mutating such a derived frame in place is discouraged by pandas.
df_yt = df_yt.dropna()

In [None]:
def _strip_non_alphanumeric(raw_title):
    """Remove every character that is not a letter, digit or whitespace.

    Values that are not strings are mapped to the empty string.
    """
    if not isinstance(raw_title, str):
        return ''
    return re.sub(r'[^a-zA-Z0-9\s]', '', raw_title)


# Cleaned copy of each title; this is the text that gets embedded later on
df_yt['clean_title'] = df_yt['title'].apply(_strip_non_alphanumeric)

In [None]:
# Tokenizer and TensorFlow BERT model are both loaded from the same
# pretrained checkpoint name
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = TFBertModel.from_pretrained(pretrained_name)

In [None]:
def get_bert_embeddings(text, tokenizer, model):
    """
    Encode ``text`` with ``tokenizer`` and return the model's pooled output.

    Args:
    - text: Input handed directly to ``tokenizer`` (this notebook passes one
      cleaned title string per call).
    - tokenizer: Callable tokenizer; it is asked for TensorFlow tensors with
      padding and truncation enabled and a 512-token limit.
    - model: Callable model whose result exposes a ``pooler_output`` tensor.

    Returns:
    - The ``pooler_output`` tensor converted via ``.numpy()``.
      Presumably shape (1, hidden_size) for a single string - TODO confirm.
    """
    encoded = tokenizer(
        text,
        return_tensors='tf',
        padding=True,
        truncation=True,
        max_length=512,
    )

    # NOTE(review): ``pooler_output`` is understood to be the first-token
    # ([CLS]) state after an additional dense + tanh layer rather than the raw
    # [CLS] hidden state - confirm against the Transformers model-output docs
    # that this is the intended sentence representation.
    pooled = model(encoded).pooler_output
    return pooled.numpy()

def _embed_clean_title(clean_title):
    """Embed one cleaned title with the notebook-level tokenizer and model."""
    return get_bert_embeddings(clean_title, tokenizer, model)


# One model call per row; each cell of the new column holds that row's embedding
df_yt['embeddings'] = df_yt['clean_title'].apply(_embed_clean_title)

In [None]:
# Preview the first five rows, now including the cleaned titles and embeddings
df_yt.head(n=5)

In [None]:
# Persist the processed frame (without the index) next to the notebook.
# NOTE(review): the 'embeddings' column holds the arrays returned by
# get_bert_embeddings; CSV stores them as text, so confirm this file is not
# expected to be reloaded for similarity search.
df_yt.to_csv(path_or_buf='final_df.csv', index=False)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(embedding, embeddings):
    """
    Score one query embedding against every embedding in a collection.

    Args:
    - embedding (np.ndarray): Query embedding; it is reshaped to a single row.
    - embeddings (list of np.ndarray): Embeddings to compare against; they are
      stacked vertically into one matrix.

    Returns:
    - similarities (np.ndarray): Flattened array of cosine similarities, one
      per stacked row, in the same order as ``embeddings``.
    """
    query_row = embedding.reshape(1, -1)
    corpus_matrix = np.vstack(embeddings)
    score_matrix = cosine_similarity(query_row, corpus_matrix)
    return score_matrix.flatten()

def recommend_videos(title, df, tokenizer, model, top_n=5):
    """
    Recommend top N similar videos based on the embeddings of the cleaned titles.

    The query title is cleaned with the same pattern that builds the
    ``clean_title`` column, so a title taken from the dataset is embedded from
    the same text as its stored embedding. The input DataFrame is not modified.

    Args:
    - title (str): The title of the video for which we want to find similar videos.
    - df (pd.DataFrame): DataFrame containing video data and an ``embeddings`` column.
    - tokenizer (BertTokenizer): Hugging Face tokenizer.
    - model (TFBertModel): Hugging Face BERT model.
    - top_n (int): Number of similar videos to recommend.

    Returns:
    - recommendations (pd.DataFrame): Up to ``top_n`` rows, most similar first,
      with the columns title, channelTitle, likes, dislikes, thumbnail_link and
      similarity. Rows whose ``title`` equals the input title are excluded.
    """
    output_columns = ['title', 'channelTitle', 'likes', 'dislikes', 'thumbnail_link', 'similarity']

    # Nothing to rank: return an empty, correctly shaped result instead of
    # running the model and then failing while stacking zero embeddings.
    if len(df) == 0:
        return df.assign(similarity=np.empty(0, dtype=float))[output_columns]

    # Same cleaning rule as the ``clean_title`` column (strip everything that
    # is not a letter, digit or whitespace) so query and corpus are comparable.
    cleaned_title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
    embedding = get_bert_embeddings(cleaned_title, tokenizer, model)

    # Compute similarities between the input embedding and all other embeddings
    similarities = compute_cosine_similarity(embedding, df['embeddings'].tolist())

    # ``assign`` returns a new frame, leaving the caller's DataFrame without a
    # stray 'similarity' column from this call.
    ranked = df.assign(similarity=similarities).sort_values(by='similarity', ascending=False)

    # Return the top N recommendations (excluding the input title itself if it's present)
    recommendations = ranked[ranked['title'] != title].head(top_n)

    return recommendations[output_columns]

In [None]:
from IPython.display import display, HTML
import pandas as pd

def display_recommendations(recommendations):
    """
    Render recommended videos as HTML cards, two cards per row.

    Every text value taken from the data (title, channel, thumbnail URL, like
    and dislike counts) is HTML-escaped before it is placed in the markup, so
    quotes, ampersands or angle brackets in a title cannot break the ``alt``
    attribute or the surrounding tags.

    Args:
    - recommendations (pd.DataFrame): Rows with the columns title,
      thumbnail_link, likes, dislikes, similarity and channelTitle.
      ``similarity`` must be numeric because it is formatted with two decimals.

    Returns:
    - None. The generated markup is shown via ``display(HTML(...))``.
    """
    # Function-scope import bound to a distinct name: ``html`` is used below as
    # the local accumulator for the generated markup.
    from html import escape as html_escape

    total = len(recommendations)

    # Generate HTML for displaying thumbnails and metadata
    html = '<div style="display: flex; flex-wrap: wrap; justify-content: space-around;">'

    # Walk the recommendations two at a time (2 thumbnails per row)
    for i in range(0, total, 2):
        row_html = '<div style="display: flex; justify-content: space-around; width: 100%;">'

        # The last row holds a single card when the total is odd
        for position in range(i, min(i + 2, total)):
            recommendation = recommendations.iloc[position]
            title = html_escape(str(recommendation['title']))
            thumbnail = html_escape(str(recommendation['thumbnail_link']))
            likes = html_escape(str(recommendation['likes']))
            dislikes = html_escape(str(recommendation['dislikes']))
            channelTitle = html_escape(str(recommendation['channelTitle']))
            # Left numeric so the two-decimal format below still applies
            similarity = recommendation['similarity']

            # Generate HTML for each video thumbnail and details
            row_html += f'''
                <div style="width: 20%; margin: 2px; text-align: center; border: 1px solid #ddd; padding: 2px; border-radius: 10px;">
                    <img src="{thumbnail}" alt="{title}" style="width: 100%; border-radius: 5px;">
                    <h4>{title}</h4>
                    <p>Channel: {channelTitle}</p>
                    <p>Likes: {likes} | Dislikes: {dislikes}</p>
                    <p>Similarity: {similarity:.2f}</p>
                </div>
                '''
        row_html += '</div>'
        html += row_html

    html += '</div>'
    display(HTML(html))

In [None]:
# Interactive example: read a title, rank the dataset against it, show ten cards
title_to_recommend = input("Your Input Video Title Here : ")
top_recommendations = recommend_videos(
    title=title_to_recommend,
    df=df_yt,
    tokenizer=tokenizer,
    model=model,
    top_n=10,
)

# Render the ranked videos as thumbnail cards
display_recommendations(recommendations=top_recommendations)

In [None]:
# Second interactive run: prompt for another title and repeat the same flow
title_to_recommend = input("Your Input Video Title Here")
top_recommendations = recommend_videos(
    title=title_to_recommend,
    df=df_yt,
    tokenizer=tokenizer,
    model=model,
    top_n=10,
)

# Render the ranked videos as thumbnail cards
display_recommendations(recommendations=top_recommendations)