# Load Dataset

In [18]:
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.preprocessing import normalize

data_path = "embedded_podcast_data.pkl"

with open(data_path, "rb") as f:
    embedded_df  = pickle.load(f)

# Transformer Filtering Function

In [None]:

def cosine_similarity_matrix_raw(vec, matrix):
    """
    Compute cosine similarity between a single vector `vec` and each row vector in `matrix`.
    vec: 1D numpy array, shape (d,)
    matrix: 2D numpy array, shape (n, d)
    Returns:
        similarities: 1D numpy array, shape (n,)
    """
    # Normalize the input vector and matrix row-wise
    vec_norm = vec / np.linalg.norm(vec)
    matrix_norm = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    
    # Dot product between vec and each row in matrix (broadcasting)
    similarities = np.dot(matrix_norm, vec_norm)
    return similarities

def cosine_similarity_matrix_normalized(vec, matrix):
    """
    Compute cosine similarity between normalized vector `vec` and each normalized row in `matrix`.
    Inputs are assumed normalized.
    """
    return np.dot(matrix, vec)

def filtering(user_input, original_df, model, max_min=None, top_n=100, column='metadata_embedding'):
    """
    Filter DataFrame based on length of episodes and return top_n most similar.
    """
    df = original_df.copy()
    
    if max_min is not None:
        df = df[df['duration_min'] <= max_min]

    # Encode user input
    user_emb = model.encode(user_input)
    
    user_emb_norm = normalize( user_emb.reshape(1, -1))[0]

    # Stack embeddings into a numpy matrix
    normalized_embeddings_matrix = np.vstack(df[column].values)

    # Compute all cosine similarities at once
    similarities = cosine_similarity_matrix_normalized(user_emb_norm, normalized_embeddings_matrix)

    # Add similarity scores
    df = df.assign(similarity=similarities)

    # Get top_n most similar rows
    top = df.sort_values(by="similarity", ascending=False).head(top_n)

    # Display top 5 results
    for idx, row in top.iloc[:5].iterrows():
        print(f"\n🎯 Title: {row['title']}")
        print(f"🎙️ Host: {row['host']}")
        print(f"🧠 Similarity: {row['similarity']:.4f}")

    return top

In [8]:
# User input expressing a goal about sleep and anxiety
user_input = "optimizing my overall female health."

# load the model
model = SentenceTransformer("models/embedding_model")

prefiltered_df = filtering(user_input, embedded_df, model, max_min=None, top_n=100)


🎯 Title: Dr. Kyle Gillett How to Optimize Your Hormones for Health & Vitality
🎙️ Host: huberman
🧠 Similarity: 0.5942

🎯 Title: Dr. Sara Gottfried How to Optimize Female Hormone Health for Vitality & Longevity | Huberman Lab
🎙️ Host: huberman
🧠 Similarity: 0.5732

🎯 Title: How to Optimize Fertility in Males & Females
🎙️ Host: huberman
🧠 Similarity: 0.5581

🎯 Title: Dr. Stacy Sims Female-Specific Exercise & Nutrition for Health, Performance & Longevity
🎙️ Host: huberman
🧠 Similarity: 0.5191

🎯 Title: Most Efficient Way for Women to Train for Overall Fitness | Dr. Stacy Sims & Dr. Andrew Huberman
🎙️ Host: huberman
🧠 Similarity: 0.5129


In [9]:
# User input expressing a goal about sleep and anxiety
user_input = "What can I do to balance hormones naturally?"

# load the model
model = SentenceTransformer("models/embedding_model")

prefiltered_df = filtering(user_input, embedded_df, model, max_min=None, top_n=100, column='transcript_embedding_mean')


🎯 Title: How to Optimize Testosterone & Estrogen | Huberman Lab Essentials
🎙️ Host: huberman
🧠 Similarity: 0.4783

🎯 Title: Dr. Kyle Gillett How to Optimize Your Hormones for Health & Vitality
🎙️ Host: huberman
🧠 Similarity: 0.4717

🎯 Title: The Science of How to Optimize Testosterone & Estrogen
🎙️ Host: huberman
🧠 Similarity: 0.4707

🎯 Title: How to Control Your Metabolism by Thyroid & Growth Hormone
🎙️ Host: huberman
🧠 Similarity: 0.4511

🎯 Title: Testosterone & Testosterone Replacement Therapy (TRT) | Dr. Peter Attia & Dr. Andrew Huberman
🎙️ Host: huberman
🧠 Similarity: 0.4455


In [10]:
# User input expressing a goal about sleep and anxiety
user_input = "What supplements can help reduce PMS symptoms like fatigue and mood swings?"

# load the model
model = SentenceTransformer("models/embedding_model")

prefiltered_df = filtering(user_input, embedded_df, model, max_min=None, top_n=100, column='transcript_embedding_weighted_mean')


🎯 Title: Developing a Rational Approach to Supplementation for Health & Performance | Huberman Lab Podcast
🎙️ Host: huberman
🧠 Similarity: 0.4531

🎯 Title: Maximize Productivity, Physical & Mental Health With Daily Tools | Huberman Lab Essentials
🎙️ Host: huberman
🧠 Similarity: 0.4441

🎯 Title: Using Caffeine to Optimize Mental & Physical Performance | Huberman Lab Podcast 101
🎙️ Host: huberman
🧠 Similarity: 0.4427

🎯 Title: Using Cortisol & Adrenaline to Boost Our Energy & Immune System Function
🎙️ Host: huberman
🧠 Similarity: 0.4359

🎯 Title: Understanding & Conquering Depression | Huberman Lab Essentials
🎙️ Host: huberman
🧠 Similarity: 0.4327


In [11]:
# User input expressing a goal about sleep and anxiety
user_input = "i want to learn about dopamine and its effects on the brain"

# load the model
model = SentenceTransformer("models/embedding_model")

prefiltered_df = filtering(user_input, embedded_df, model, max_min=None, top_n=100, column='transcript_embedding_mean')


🎯 Title: Controlling Your Dopamine For Motivation, Focus & Satisfaction
🎙️ Host: huberman
🧠 Similarity: 0.6217

🎯 Title: How to Increase Motivation & Drive | Huberman Lab Essentials
🎙️ Host: huberman
🧠 Similarity: 0.5874

🎯 Title: Dopamine Baseline, Impulsivity  & Addiction | Dr. Anna Lempke & Dr.Andrew Huberman
🎙️ Host: huberman
🧠 Similarity: 0.5691

🎯 Title: How to Increase Motivation & Drive
🎙️ Host: huberman
🧠 Similarity: 0.5659

🎯 Title: Leverage Dopamine to Overcome Procrastination & Optimize Effort | Huberman Lab Podcast
🎙️ Host: huberman
🧠 Similarity: 0.5551


In [12]:
# User input expressing a goal about sleep and anxiety
user_input = "how to balance my progesterone levels"

# load the model
model = SentenceTransformer("models/embedding_model")

prefiltered_df = filtering(user_input, embedded_df, model, max_min=None, top_n=100)


🎯 Title: The Science of How to Optimize Testosterone & Estrogen
🎙️ Host: huberman
🧠 Similarity: 0.4579

🎯 Title: How to Optimize Testosterone & Estrogen | Huberman Lab Essentials
🎙️ Host: huberman
🧠 Similarity: 0.4361

🎯 Title: Dr. Kyle Gillett How to Optimize Your Hormones for Health & Vitality
🎙️ Host: huberman
🧠 Similarity: 0.3961

🎯 Title: How to Optimize Fertility in Males & Females
🎙️ Host: huberman
🧠 Similarity: 0.3949

🎯 Title: Dr. Sara Gottfried How to Optimize Female Hormone Health for Vitality & Longevity | Huberman Lab
🎙️ Host: huberman
🧠 Similarity: 0.3881


# Using TfidfVectorizer

In [25]:
import numpy as np

def vectorizers_filtering(user_input, original_df, vectorizer, max_min=None, top_n=100, column='transcript_embedding_weighted_mean_TfidfVectorizer'):
    df = original_df.copy()

    if max_min is not None:
        df = df[df['duration_min'] <= max_min]

    # Vectorize user input and normalize
    user_emb_sparse = vectorizer.transform([user_input])
    user_emb = normalize(user_emb_sparse).toarray()[0]

    # Filter out rows with None embeddings
    df = df[df[column].apply(lambda x: x is not None)]

    embeddings_matrix = np.vstack(df[column].values)
    embeddings_matrix_norm = embeddings_matrix / np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)

    similarities = np.dot(embeddings_matrix_norm, user_emb)

    df = df.assign(similarity=similarities)
    top = df.sort_values(by="similarity", ascending=False).head(top_n)

    for idx, row in top.iloc[:5].iterrows():
        print(f"\n🎯 Title: {row['title']}")
        print(f"🎙️ Host: {row['host']}")
        print(f"🧠 Similarity: {row['similarity']:.4f}")

    return top

In [28]:
import pickle

data_path = "TfidfVectorizer_embedded_podcast_data.pkl"

with open(data_path, "rb") as f:
    TfidfVectorizer_embedded_df  = pickle.load(f)
    
    # Load vectorizer
with open('vectorizers/metadata_vectorizer.pkl', 'rb') as f:
    metadata_vectorizer = pickle.load(f)

with open('vectorizers/chunk_vectorizer.pkl', 'rb') as f:
    chunk_vectorizer = pickle.load(f)


In [31]:

# User input expressing a goal about sleep and anxiety
user_input = "I want to optimise my hormone health"

prefiltered_df = vectorizers_filtering(user_input, TfidfVectorizer_embedded_df, chunk_vectorizer, max_min=None, top_n=100)


🎯 Title: How to Control Your Metabolism by Thyroid & Growth Hormone | Huberman Lab Essentials
🎙️ Host: huberman
🧠 Similarity: 0.4053

🎯 Title: How to Control Your Metabolism by Thyroid & Growth Hormone
🎙️ Host: huberman
🧠 Similarity: 0.3484

🎯 Title: Developing a Rational Approach to Supplementation for Health & Performance | Huberman Lab Podcast
🎙️ Host: huberman
🧠 Similarity: 0.2703

🎯 Title: The Science of How to Optimize Testosterone & Estrogen
🎙️ Host: huberman
🧠 Similarity: 0.2638

🎯 Title: Dr. Kyle Gillett How to Optimize Your Hormones for Health & Vitality
🎙️ Host: huberman
🧠 Similarity: 0.2632


In [13]:
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

df = pd.read_csv('youtube_extract_Andrew_Huberman.csv', encoding='ISO-8859-1')

# Load the list of video IDs
with open('videos.txt', 'r', encoding='utf-8') as f:
    video_ids = [line.strip() for line in f if line.strip()]

# Create a mapping from video ID to transcript file path
id_to_transcript_path = {
    video_id: f"{index}__transcript.txt"
    for index, video_id in enumerate(video_ids)
}

# Example: get path for video ID 'cp9GXl9Qk_s'
print(id_to_transcript_path['cp9GXl9Qk_s'])  # Output: '0__transcript.txt'

# Example: get transcript path for index 0
print(f"{0}__transcript.txt")  # Same as above


df["Transcript"] = df["Video ID"].map(id_to_transcript_path)
df["Duration"] = df["Duration"] / 60  # Convert seconds to minutes




def preprocess(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Optionally remove digits (if numbers are not useful)
    text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def recommend_transcripts(user_input, top_n=5):

    # Step 1: Load all .txt transcripts in the directory
    transcript_dir = 'transcripts/transcripts-all'
    transcript_paths = [
        os.path.join(transcript_dir, fname)
        for fname in os.listdir(transcript_dir)
        if fname.endswith('.txt')
    ]

    transcripts = []
    doc_names = []
    
    for path in transcript_paths:
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()

            # Split into sentences and filter short ones
            sentences = re.split(r'(?<=[\.\?\!])\s+', text)
            sentences = [s for s in sentences if len(s.strip()) > 20]

            # Join and preprocess
            reduced_text = preprocess(" ".join(sentences))

            transcripts.append(reduced_text)
            doc_names.append(os.path.basename(path))


    # Step 3: TF-IDF vectorization for similarity
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=10000,
        ngram_range=(1, 2),         # Use unigrams + bigrams
        min_df=5,                   # Remove rare terms
        max_df=0.8,                 # Remove very frequent terms
        sublinear_tf=True
    )
    X = vectorizer.fit_transform(transcripts + [user_input])

    # Step 4: Similarity matching 
    # Compares the user input (last row) against all transcripts (first rows) using cosine similarity
    similarities = cosine_similarity(X[-1], X[:-1]).flatten()

    # Get indices of top 5 most similar transcripts
    top_indices = similarities.argsort()[::-1][:top_n]

    print(f"\n🔍 Top {top_n} most relevant transcripts:")

    for rank, idx in enumerate(top_indices, start=1):
        similarity_score = similarities[idx]
        matched_filename = doc_names[idx]
        matched_transcript = transcripts[idx]

        print(f"\n{rank}. {matched_filename} (similarity: {similarity_score:.3f})")

        # Fetch episode details from df (assuming df is loaded and has 'Transcript' column matching filenames)
        episode_df = df[df['Transcript'] == matched_filename]

        if not episode_df.empty:
            print(f"   Title: {episode_df['title'].values[0]}")
            print(f"   Link: {episode_df['webpage_url'].values[0]}")
            print(f"   Duration: {episode_df['Duration'].values[0]} minutes")
            print(f"   Uploaded on: {episode_df['Upload Date'].values[0]}")
        else:
            print("   Episode details not found in dataframe.")
  
    # print the matched terms with the user input
    print("\n🔍 Matched terms with your input:")
    matched_terms = set(user_input.lower().split()) & set(matched_transcript.lower().split())
    if matched_terms:
        print(", ".join(matched_terms))
    else:
        print("   No specific terms matched with your input.")


0__transcript.txt
0__transcript.txt


In [14]:
user_input = "What supplements can help reduce PMS symptoms like fatigue and mood swings?"
recommend_transcripts(user_input, top_n=3)


🔍 Top 3 most relevant transcripts:

1. 11__transcript.txt (similarity: 0.058)
   Title: How to Stop Headaches Using Science-Based Approaches -- Huberman Lab Podcast
   Link: https://www.youtube.com/watch?v=CGjdgy0cwGk
   Duration: 146.73333333333332 minutes
   Uploaded on: 06/02/23

2. 26__transcript.txt (similarity: 0.055)
   Title: Using Caffeine to Optimize Mental & Physical Performance -- Huberman Lab Podcast 101
   Link: https://www.youtube.com/watch?v=iw97uvIge7c
   Duration: 142.58333333333334 minutes
   Uploaded on: 05/12/22

3. 6__transcript.txt (similarity: 0.046)
   Title: Dr. Andy Galpin: Optimal Nutrition & Supplementation for Fitness -- Huberman Lab Guest Series
   Link: https://www.youtube.com/watch?v=q37ARYnRDGc
   Duration: 185.55 minutes
   Uploaded on: 22/02/23

🔍 Matched terms with your input:
can, reduce, what, fatigue, and, mood, supplements, like, help, symptoms


In [15]:
user_input ="i am feeling tired"
recommend_transcripts(user_input, top_n=3)


🔍 Top 3 most relevant transcripts:

1. 126__transcript.txt (similarity: 0.037)
   Title: Using Failures, Movement & Balance to Learn Faster -- Huberman Lab Podcast #7
   Link: https://www.youtube.com/watch?v=hx3U64IXFOY
   Duration: 88.08333333333333 minutes
   Uploaded on: 15/02/21

2. 123__transcript.txt (similarity: 0.033)
   Title: Tools for Managing Stress & Anxiety -- Huberman Lab Podcast #10
   Link: https://www.youtube.com/watch?v=ntfcfJ28eiU
   Duration: 98.4 minutes
   Uploaded on: 08/03/21

3. 125__transcript.txt (similarity: 0.032)
   Title: Optimize Your Learning & Creativity with Science-based Tools -- Huberman Lab Podcast #8
   Link: https://www.youtube.com/watch?v=uuP-1ioh4LY
   Duration: 90.58333333333333 minutes
   Uploaded on: 22/02/21

🔍 Matched terms with your input:
am, feeling, tired, i


# TESTS

# Collaborative Filtering

### User Item rating matrix

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class CollaborativeFilteringRecommender:
    def __init__(self, ratings_matrix):
        self.ratings = ratings_matrix
        self.user_user_similarity = cosine_similarity(self.ratings)
        self.item_item_similarity = cosine_similarity(self.ratings.T)

    def user_based_scores(self, user_id):
        # Get similarity scores between the target user and all other users
        sim_scores = self.user_user_similarity[user_id]

        # Compute the weighted sum of all users' ratings, weighted by similarity
        weighted_ratings = sim_scores @ self.ratings

        # Compute the sum of similarities (excluding the target user's self-similarity)
        sim_sum = np.sum(sim_scores) - 1
        sim_sum = sim_sum if sim_sum != 0 else 1e-8  # Avoid division by zero

        # Calculate predicted scores for each item by normalizing weighted ratings
        # Subtract user's own ratings to reduce bias from already seen items
        scores = (weighted_ratings - self.ratings[user_id]) / sim_sum

        # Mark already-rated items with -1 so they are not recommended again
        scores[self.ratings[user_id] > 0] = -1

        # Return the predicted scores for unrated items
        return scores


    def item_based_scores(self, user_id):
        scores = np.zeros(self.ratings.shape[1])
        user_ratings = self.ratings[user_id]

        for item in range(self.ratings.shape[1]):
            if user_ratings[item] != 0:
                scores[item] = -1 # Already rated
                continue
            
            # Similarities of current item to all other items
            sim_scores = self.item_item_similarity[item]
            
            # Ratings of the user for all other items
            rated_mask = user_ratings > 0

            # Weighted sum of similar items' ratings
            weighted_sum = np.dot(sim_scores, user_ratings)
            
            # Remove self-similarity contribution
            sim_sum = np.sum(sim_scores[rated_mask])
            sim_sum = sim_sum if sim_sum != 0 else 1e-8

            scores[item] = weighted_sum / sim_sum

        return scores

    def hybrid_scores(self, user_id, alpha=0.7):
        """alpha=0.5 means equal weight; alpha=1 uses only user-based"""
        user_scores = self.user_based_scores(user_id)
        item_scores = self.item_based_scores(user_id)

        combined_scores = alpha * user_scores + (1 - alpha) * item_scores
        # Preserve -1 for already rated items
        combined_scores[self.ratings[user_id] > 0] = -1
        return combined_scores


In [18]:
# User-Item rating matrix (rows = users, columns = items)
# 0 means no rating

ratings = np.array([
    [5, 3, 0, 1, 0],  # User 0  
    [4, 0, 0, 1, 0],  # User 1 
    [1, 1, 0, 5, 4],  # User 2  
    [0, 0, 5, 4, 0],  # User 3
    [0, 1, 5, 4, 0],  # User 4
])

item_ids = ["Health", "AI", "Startups", "Productivity", "Nutrition"]
user_ids = [0,1,2,3,4]

recommender = CollaborativeFilteringRecommender(ratings)

user_id = 0
print(item_ids)
print(user_ids)
print("User-based CF scores:")
user_based = recommender.user_based_scores(user_id)
print(user_based)

print("\nItem-based CF scores:")
item_based = recommender.item_based_scores(user_id)
print(item_based)

alpha=0.7
print(f"\nHybrid CF scores (alpha={alpha}):")
hybrid_based = recommender.hybrid_scores(user_id, alpha=alpha)
print(hybrid_based)

def print_recommendations(title, scores, item_ids, top_k=3):
    print(title)
    # Collect only unrated (recommended) items
    recommendations = [(item_ids[i], round(scores[i], 3)) for i in range(len(scores)) if scores[i] > 0]
    recommendations.sort(key=lambda x: x[1], reverse=True)

    if not recommendations:
        print("  No recommendations.")
    else:
        for item, score in recommendations[:top_k]:
            print(f"  {item}: {score}")
    print()

print("----------------------")
print_recommendations("User-based CF Recommendations:", user_based, item_ids)
print_recommendations("Item-based CF Recommendations:", item_based, item_ids)
print_recommendations(f"Hybrid CF Recommendations (alpha={alpha}):", hybrid_based, item_ids)




['Health', 'AI', 'Startups', 'Productivity', 'Nutrition']
[0, 1, 2, 3, 4]
User-based CF scores:
[-1.         -1.          0.97079276 -1.          0.90312423]

Item-based CF scores:
[-1.         -1.          1.44900412 -1.          2.10253074]

Hybrid CF scores (alpha=0.7):
[-1.         -1.          1.11425617 -1.          1.26294618]
----------------------
User-based CF Recommendations:
  Startups: 0.971
  Nutrition: 0.903

Item-based CF Recommendations:
  Nutrition: 2.103
  Startups: 1.449

Hybrid CF Recommendations (alpha=0.7):
  Nutrition: 1.263
  Startups: 1.114



# Collaborative + Content

In [21]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class HybridRecommender:
    def __init__(self, ratings_matrix, item_features, w_user=0.4, w_item=0.4, w_content=0.2):
        self.ratings = ratings_matrix
        self.user_user_similarity = cosine_similarity(self.ratings)
        self.item_item_similarity = cosine_similarity(self.ratings.T)
        
        self.item_features = item_features
        self.w_user = w_user
        self.w_item = w_item
        self.w_content = w_content

    def user_based_scores(self, user_id):
        # Get similarity scores between the target user and all other users
        sim_scores = self.user_user_similarity[user_id]

        # Compute the weighted sum of all users' ratings, weighted by similarity
        weighted_ratings = sim_scores @ self.ratings

        # Compute the sum of similarities (excluding the target user's self-similarity)
        sim_sum = np.sum(sim_scores) - 1
        sim_sum = sim_sum if sim_sum != 0 else 1e-8  # Avoid division by zero

        # Calculate predicted scores for each item by normalizing weighted ratings
        # Subtract user's own ratings to reduce bias from already seen items
        scores = (weighted_ratings - self.ratings[user_id]) / sim_sum

        # Mark already-rated items with -1 so they are not recommended again
        scores[self.ratings[user_id] > 0] = -1

        # Return the predicted scores for unrated items
        return scores


    def item_based_scores(self, user_id):
        scores = np.zeros(self.ratings.shape[1])
        user_ratings = self.ratings[user_id]

        for item in range(self.ratings.shape[1]):
            if user_ratings[item] != 0:
                scores[item] = -1 # Already rated
                continue
            
            # Similarities of current item to all other items
            sim_scores = self.item_item_similarity[item]
            
            # Ratings of the user for all other items
            rated_mask = user_ratings > 0

            # Weighted sum of similar items' ratings
            weighted_sum = np.dot(sim_scores, user_ratings)
            
            # Remove self-similarity contribution
            sim_sum = np.sum(sim_scores[rated_mask])
            sim_sum = sim_sum if sim_sum != 0 else 1e-8

            scores[item] = weighted_sum / sim_sum

        return scores
    

    def collaborative_hybrid_scores(self, user_id, alpha=0.7):
        """alpha=0.5 means equal weight; alpha=1 uses only user-based"""
        user_scores = self.user_based_scores(user_id)
        item_scores = self.item_based_scores(user_id)

        combined_scores = alpha * user_scores + (1 - alpha) * item_scores
        # Preserve -1 for already rated items
        combined_scores[self.ratings[user_id] > 0] = -1
        
        recommended_idx = np.argsort(combined_scores)[::-1]

        return [(idx, combined_scores[idx]) for idx in recommended_idx]
    
    
    def content_based_scores(self, user_id):
        user_ratings = self.ratings[user_id]
        user_profile = user_ratings @ self.item_features
        item_norm = np.linalg.norm(self.item_features, axis=1)
        profile_norm = np.linalg.norm(user_profile)
        scores = (self.item_features @ user_profile) / (item_norm * profile_norm + 1e-8)
        scores[user_ratings > 0] = -1
       
        return scores
    
    
    def content_hybrid_scores(self, user_id):
        user_scores = self.user_based_scores(user_id)
        item_scores = self.item_based_scores(user_id)
        content_scores = self.content_based_scores(user_id)

        combined_scores = (self.w_user * user_scores +
                           self.w_item * item_scores +
                           self.w_content * content_scores)

        recommended_idx = np.argsort(combined_scores)[::-1]

        return [(idx, combined_scores[idx]) for idx in recommended_idx]


In [23]:
ratings = np.array([
    [5, 3, 0, 1, 0],  # User 0  
    [4, 0, 0, 1, 0],  # User 1 
    [1, 1, 0, 5, 4],  # User 2  
    [0, 0, 5, 4, 0],  # User 3
    [0, 1, 5, 4, 0],  # User 4
])

item_ids = ["Health", "AI", "Startups", "Productivity", "Nutrition"]
user_ids = [0,1,2,3,4]


item_features = np.array([
    [1, 0, 1],  # Action + Drama
    [1, 1, 0],  # Action + Comedy
    [0, 1, 0],  # Comedy
    [0, 0, 1],  # Drama
    [1, 0, 0],  # Action
])

recommender = HybridRecommender(ratings, item_features)
user_id = 0
hybrid_recommendations = recommender.content_hybrid_scores(user_id)
collaborative_recommendations = recommender.collaborative_hybrid_scores(user_id)

print(f"hybrid ecommendations for user {user_id}:")
for idx, score in hybrid_recommendations:
    print(f"Item {idx} with combined score {score:.3f}")
    
print("--------------")
print(f"collaborative ecommendations for user {user_id}:")
for idx, score in collaborative_recommendations:
    print(f"Item {idx} with combined score {score:.3f}")

hybrid ecommendations for user 0:
Item 4 with combined score 1.356
Item 2 with combined score 1.025
Item 3 with combined score -1.000
Item 1 with combined score -1.000
Item 0 with combined score -1.000
--------------
collaborative ecommendations for user 0:
Item 4 with combined score 1.263
Item 2 with combined score 1.114
Item 3 with combined score -1.000
Item 1 with combined score -1.000
Item 0 with combined score -1.000


# Using Dataframe

Step 1: Build the user-item rating matrix

Each row is a user, each column is a podcast, and values are ratings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

user_rest_matrix = ratings.pivot(index='user_id', columns='podcast_id', values='rating')

### User based CF

Use cosine similarity or Pearson correlation between user rating vectors.

In [None]:
# Fill NaN with 0 for similarity (or use mean-centered if Pearson)
user_similarity = cosine_similarity(user_rest_matrix.fillna(0))
user_similarity_df = pd.DataFrame(user_similarity, 
                                   index=user_rest_matrix.index,
                                   columns=user_rest_matrix.index)


# Step 3: Make predictions
# To predict how much user A would like podcast X:
# Find k most similar users to A
# Average their ratings for X, weighted by similarity

def predict_rating_user_based(target_user, target_item, k=5):
    similar_users = user_similarity_df[target_user].drop(target_user).sort_values(ascending=False).head(k)
    numerator = 0
    denominator = 0
    for other_user, similarity in similar_users.items():
        rating = user_rest_matrix.loc[other_user, target_item]
        if not pd.isna(rating):
            numerator += similarity * rating
            denominator += abs(similarity)
    return numerator / denominator if denominator != 0 else np.nan

### Item based CF

Step 1: Transpose the user-item matrix

Now rows = podcasts, columns = users.

In [None]:
item_user_matrix = user_rest_matrix.T  # rows = podcast_id

item_similarity = cosine_similarity(item_user_matrix.fillna(0))
item_similarity_df = pd.DataFrame(item_similarity, 
                                   index=item_user_matrix.index,
                                   columns=item_user_matrix.index)


# Step 3: Predict rating

# To predict how much user U would like podcast R:

# Look at items R is similar to
# Use U’s ratings on those similar items, weighted by similarity

def predict_rating_item_based(user_id, target_item, k=5):
    similar_items = item_similarity_df[target_item].drop(target_item).sort_values(ascending=False)
    user_ratings = user_rest_matrix.loc[user_id]
    
    numerator = 0
    denominator = 0
    for item_id, similarity in similar_items.head(k).items():
        if not pd.isna(user_ratings.get(item_id)):
            numerator += similarity * user_ratings[item_id]
            denominator += abs(similarity)
    return numerator / denominator if denominator != 0 else np.nan

# Matrix Factorization

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Aggregate duplicate ratings by taking the mean
df_agg = df.groupby(['userId', 'title'])['rating'].mean().reset_index()

user_item_matrix = df_agg.pivot(index='userId', columns='title', values='rating')

user_item_matrix.head(10)

In [None]:

# Select a user
user_id = 208

# Get podcasts watched by this user
user_podcasts = user_item_matrix.loc[user_id].dropna().index.tolist()
print(f"User {user_id} watched {len(user_podcasts)} movies")

# Keep only these movies for all users
subset_matrix = user_item_matrix[user_podcasts]

# Keep only users who watched at least 70% of these movies
min_podcasts = int(len(user_podcasts) * 0.7)
users_to_keep = subset_matrix.count(axis=1) >= min_podcasts
filtered_matrix = subset_matrix[users_to_keep]

print(f"Kept {len(filtered_matrix)} users that watched at least {min_podcasts} podcasts")

# Calculate user similarities
user_correlations = filtered_matrix.T.corr()

# Get similarities for our target user (excluding self)
similar_users = user_correlations[user_id].drop(user_id).sort_values(ascending=False)

print(f"\nTop 10 most similar users to user {user_id}:")
similar_users.head(10)

In [None]:
# Get movie recommendations
def user_based_recommendation(user_id, n_recommendations=5):
    
    # Get top 5 similar users
    top_similar = similar_users.head(10)
    
    # podcast target user hasn't watched
    unwatched_podcasts = user_item_matrix.columns[user_item_matrix.loc[user_id].isna()]
    
    # Score each unwatched movie
    podcast_scores = {}
    similarities_sum = top_similar.sum()
    if similarities_sum == 0:
        print("No similar users found or all similarities are zero.")
        return []
    
     # Add target average rating to the score
    target_user_avg = user_item_matrix.loc[user_id].mean()
    
    for podcast in unwatched_podcasts:
        
        # calculate summation of scores
        score = 0
        for similar_user, similarity in top_similar.items():
            if not pd.isna(user_item_matrix.loc[similar_user, podcast]):
                rating = user_item_matrix.loc[similar_user, podcast]
                centered_rating = rating - user_item_matrix.loc[similar_user].mean()
                score += centered_rating * similarity
                
        podcast_scores[podcast] = (score + target_user_avg) / similarities_sum
        
    # Return top recommendations
    recommendations = sorted(podcast_scores.items(), key=lambda x: x[1], reverse=True)
    return recommendations[:n_recommendations]

# Get recommendations
recommendations = user_based_recommendation(user_id)
print(f"\nTop 5 recommendations for user {user_id}:")
for i, (podcast, score) in enumerate(recommendations, 1):
    print(f"{i}. Podcast {podcast}: Score = {score:.2f}")

# SVD

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=500, random_state=42)
R = filtered_matrix.fillna(0)
svd.fit(R)  # R is user-item matrix with NaNs filled as 0

cumulative_variance = np.cumsum(svd.explained_variance_ratio_)

plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
plt.axhline(y=0.9, color='r', linestyle='--')  # for 90% variance explained
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Choosing Number of Components for SVD')
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error

for k in [10, 20, 50, 100, 200, 500]:
    svd = TruncatedSVD(n_components=k, random_state=42)
    U = svd.fit_transform(R)
    Vt = svd.components_
    R_hat = np.dot(U, Vt)
    mse = mean_squared_error(R.flatten(), R_hat.flatten())
    print(f"k={k}, RMSE={np.sqrt(mse):.4f}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

# Assuming df_agg and user_item_matrix are ready
R = user_item_matrix.fillna(0).values
k = 500

# Perform SVD using scikit-learn
svd = TruncatedSVD(n_components=k, random_state=42)
U = svd.fit_transform(R)  # shape: (n_users, k)
sigma = svd.singular_values_  # shape: (k,)
Vt = svd.components_          # shape: (k, n_items)

# Reconstruct the approximate user-item matrix
R_hat = np.dot(U, Vt)  # since TruncatedSVD includes the scaling in U and Vt

# Create prediction DataFrame
preds_df = pd.DataFrame(R_hat, index=user_item_matrix.index, columns=user_item_matrix.columns)

def recommend_podcasts(preds_df, user_id, user_item_matrix, top_n=5):
    # Get user's predicted ratings
    user_row = preds_df.loc[user_id]

    # Get movies already rated by user
    rated_podcasts = user_item_matrix.loc[user_id].dropna().index.tolist()

    # Filter out movies already rated
    recommendations = user_row.drop(labels=rated_podcasts)

    # Get top N recommendations
    top_recommendations = recommendations.sort_values(ascending=False).head(top_n)

    return top_recommendations

# Select a random user_id (make sure the ID exists in the index)
user_id = 220

recommendations = recommend_podcasts(preds_df, user_id, user_item_matrix, top_n=5)

print(f"Top recommendations for user {user_id}:")
print(recommendations)
