In [1]:
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Dataset locations -- edit these paths as needed.
BOOKS_DATASET = 'data.csv'
CLUSTER_GENRE_COUNTS = 'movies/cluster_genre_counts.csv'
USER_CLUSTERS_WITH_INFO = 'movies/user_clusters_with_info.csv'

# Sentence-transformers checkpoint used for every text embedding in this notebook.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Both names referred to the same checkpoint, so load it once and share the
# instance instead of holding two identical copies in memory. The encoder is
# only used for inference (`.encode`), so sharing does not change any embedding.
model_genre = SentenceTransformer(EMBEDDING_MODEL_NAME)
model_user = model_genre

## Training the NearestNeighbors model on combined book genre + description embeddings

In [None]:
df = pd.read_csv(BOOKS_DATASET)

# Build the text that represents each book's genres.
# NOTE(review): values read by pd.read_csv are strings, so the list branch only
# fires if `genres` was parsed into Python lists beforehand. If the CSV stores a
# list literal such as "['A', 'B']", that literal is embedded verbatim
# (brackets and quotes included) -- TODO confirm the column format.
df['genres_string'] = df['genres'].fillna('').apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
genre_embeddings = model_genre.encode(df['genres_string'].tolist(), show_progress_bar=True)

df['description'] = df['description'].fillna('')
# Descriptions use the same checkpoint as the genres, so reuse the already
# loaded encoder instead of loading a third copy. The name is kept so other
# cells that reference `model_description` keep working.
model_description = model_genre
description_embeddings = model_description.encode(df['description'].tolist(), show_progress_bar=True)

# One vector per book: [genre embedding | description embedding].
combined_embeddings = np.hstack([genre_embeddings, description_embeddings])

# Fit the nearest-neighbour index on the combined vectors.
nn_model_combined = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='auto')
nn_model_combined.fit(combined_embeddings)
# The rest of the notebook passes the index around as `nn_model_genre`;
# both names refer to the same fitted combined index.
nn_model_genre = nn_model_combined
print("Combined genre+description NearestNeighbors model trained successfully.")

Batches:   0%|          | 0/1640 [00:00<?, ?it/s]

Batches:   0%|          | 0/1640 [00:00<?, ?it/s]

Reading user segmentation dataset

In [40]:
# Per-user rows and per-cluster genre counts, both read from the configured CSV paths.
cluster_genre_counts = pd.read_csv(CLUSTER_GENRE_COUNTS)
user_deets = pd.read_csv(USER_CLUSTERS_WITH_INFO)

# Collapse the genre table to one row per cluster, holding that cluster's
# genres as a list in the order the rows appear in the file.
ordered_genres = (
    cluster_genre_counts
    .groupby('cluster')['genre']
    .apply(list)
    .reset_index(name='genres')
)

# Attach each user's cluster genre list; users whose cluster has no genre rows
# keep a missing value in `genres` because this is a left join.
user_profiles = user_deets.merge(ordered_genres, on='cluster', how='left')

## Recommendation functions

In [27]:
# --- 1. HELPER FUNCTION ---
def map_age_to_bucket(age: int) -> str:
    """Map a raw age to one of the fixed age-bucket labels.

    Buckets are defined by exclusive upper bounds, so every value lands in
    exactly one bucket. Integer ages map exactly as the closed ranges in the
    labels suggest (e.g. 18..24 -> '18-24'); fractional ages such as 24.5 fall
    into the bucket below the next boundary instead of slipping through a gap.

    Args:
        age: Age in years.

    Returns:
        One of 'Under 18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+'.
    """
    # (exclusive upper bound, label), in ascending order.
    bucket_upper_bounds = (
        (18, 'Under 18'),
        (25, '18-24'),
        (35, '25-34'),
        (45, '35-44'),
        (50, '45-49'),
        (56, '50-55'),
    )
    for upper_bound, label in bucket_upper_bounds:
        if age < upper_bound:
            return label
    return '56+'

# --- 2. MODEL TRAINING SETUP (USER SIMILARITY) ---
print("--- Training User Similarity Model ---")

# Build one text feature per existing user: "<gender> <occupation> <age bucket> <genres...>".
# Missing values are replaced with empty text so a user whose cluster has no
# genre list (left-join miss) or who lacks a demographic field still gets a
# string instead of raising in ' '.join(...) or producing a NaN feature.
user_profiles['feature_string'] = (
    user_profiles['gender'].fillna('').astype(str) + ' ' +
    user_profiles['occupation_name'].fillna('').astype(str) + ' ' +
    user_profiles['age_bucket'].fillna('').astype(str) + ' ' +
    user_profiles['genres'].apply(
        lambda genres: ' '.join(genres) if isinstance(genres, list) else ''
    )
)

# Embed the feature strings and fit the user nearest-neighbour index on them.
print("Creating embeddings and training user similarity model...")
user_embeddings = model_user.encode(user_profiles['feature_string'].tolist(), show_progress_bar=False)
nn_model_user = NearestNeighbors(n_neighbors=50, metric='cosine', algorithm='auto')
nn_model_user.fit(user_embeddings)
print("User-based NearestNeighbors model trained successfully.")


--- Training User Similarity Model ---
Creating embeddings and training user similarity model...
User-based NearestNeighbors model trained successfully.


In [28]:
def get_ranked_recommendations_for_user(user_id, user_profile_data, df_data, model_genre, nn_model_genre, WEIGHT_GENRE = 0.95, WEIGHT_RATING = 0.05, top_n=10, initial_candidates=50):
    """
    Recommend books for an existing user based on the genres in their profile.

    The user's genres are embedded with ``model_genre``, the nearest
    ``initial_candidates`` books are fetched from ``nn_model_genre``, and the
    candidates are re-ranked by
    ``WEIGHT_GENRE * (1 - distance) + WEIGHT_RATING * (rating / 5)``.

    Parameters
    ----------
    user_id : value looked up in ``user_profile_data['user_id']``.
    user_profile_data : DataFrame with columns ``user_id``, ``gender``, ``age``,
        ``occupation_name`` and ``genres`` (list of genre names per user).
    df_data : DataFrame of books with ``title`` and ``rating`` columns. Rows are
        selected positionally, so they must be in the same order as the vectors
        the index was fitted on.
    model_genre : encoder exposing ``encode(list_of_str)`` -> 2-D array.
    nn_model_genre : fitted nearest-neighbour index. It may have been fitted on
        vectors wider than the genre embedding (e.g. genre + description); the
        query is zero-padded to the index width in that case.
    WEIGHT_GENRE, WEIGHT_RATING : weights of similarity and normalised rating.
    top_n : number of rows to return.
    initial_candidates : number of neighbours fetched before re-ranking
        (capped at the number of indexed books).

    Returns
    -------
    DataFrame with columns ``user_id, gender, occupation_name, age, title,
    rating, combined_relevance_score`` sorted by descending score, or an empty
    DataFrame when the user has no usable genres.

    Raises
    ------
    IndexError : if no profile row matches ``user_id``.
    ValueError : if the index was fitted on fewer features than the genre
        embedding provides.
    """
    # 1. Extract user profile details.
    user_profile = user_profile_data[user_profile_data['user_id'] == user_id].iloc[0]
    gender = user_profile['gender']
    age = user_profile['age']
    occupation = user_profile['occupation_name']
    genre_list = user_profile['genres']

    # A left-joined profile can carry NaN instead of a list; NaN is truthy, so a
    # plain `not genre_list` check would let it through and crash in join().
    if isinstance(genre_list, str):
        genre_string = genre_list.strip()
    elif isinstance(genre_list, (list, tuple)):
        genre_string = ' '.join(map(str, genre_list))
    else:
        genre_string = ''
    if not genre_string:
        return pd.DataFrame()

    # 2. Build the query vector from the genre embedding alone.
    genre_emb = np.asarray(model_genre.encode([genre_string]))

    # The index may cover extra feature blocks (e.g. description embeddings).
    # Pad with zeros up to the width the index was fitted on, derived from the
    # index itself rather than from notebook globals.
    pad_width = nn_model_genre.n_features_in_ - genre_emb.shape[1]
    if pad_width < 0:
        raise ValueError(
            "nn_model_genre was fitted on fewer features than the genre embedding provides"
        )
    query_embedding = np.hstack([genre_emb, np.zeros((1, pad_width))])

    # kneighbors raises if asked for more neighbours than indexed samples.
    n_candidates = min(
        initial_candidates,
        getattr(nn_model_genre, 'n_samples_fit_', initial_candidates),
    )
    distances, indices = nn_model_genre.kneighbors(query_embedding, n_neighbors=n_candidates)

    # 3. Score the candidates.
    candidate_df = df_data.iloc[indices.flatten()].copy()
    candidate_df['genre_distance'] = distances.flatten()

    # Unparseable or missing ratings count as 0; dividing by 5 assumes a 0-5 scale.
    candidate_df['rating'] = pd.to_numeric(candidate_df['rating'], errors='coerce').fillna(0)
    candidate_df['normalized_rating'] = candidate_df['rating'] / 5.0

    # Turn the index distance into a similarity (higher is better).
    candidate_df['normalized_genre_similarity'] = 1 - candidate_df['genre_distance']

    candidate_df['combined_relevance_score'] = (
        WEIGHT_GENRE * candidate_df['normalized_genre_similarity']
        + WEIGHT_RATING * candidate_df['normalized_rating']
    )

    # 4. Re-rank and keep the top N (copied so the context columns below are
    #    written to an independent frame, not a slice of the ranked one).
    top_recommendations = (
        candidate_df
        .sort_values(by='combined_relevance_score', ascending=False)
        [['title', 'rating', 'combined_relevance_score']]
        .head(top_n)
        .copy()
    )

    # 5. Attach user context and order the columns for presentation.
    top_recommendations['user_id'] = user_id
    top_recommendations['gender'] = gender
    top_recommendations['occupation_name'] = occupation
    top_recommendations['age'] = age

    return top_recommendations[
        ['user_id', 'gender', 'occupation_name', 'age', 'title', 'rating', 'combined_relevance_score']
    ]

In [29]:
# Smoke test: top-10 recommendations for a single existing user.
user_id = 2
get_ranked_recommendations_for_user(
    user_id,
    user_profiles,
    df,
    model_genre,
    nn_model_genre,
    top_n=10,
)

Unnamed: 0,user_id,gender,occupation_name,age,title,rating,combined_relevance_score
11825,2,M,self-employed,56,Tricolor,3.93,0.626492
39715,2,M,self-employed,56,Gefrorenes Herz,4.24,0.62371
13861,2,M,self-employed,56,Stuk,3.38,0.61511
41285,2,M,self-employed,56,Neon Dies At Dawn,4.19,0.608628
47071,2,M,self-employed,56,Street Soldier,4.03,0.607747
18431,2,M,self-employed,56,Frankie,4.12,0.604191
28051,2,M,self-employed,56,Tiara,4.21,0.600651
6801,2,M,self-employed,56,Project Terror,4.2,0.600551
20114,2,M,self-employed,56,Stolen,4.19,0.600451
22656,2,M,self-employed,56,Final Friends Volume One,4.13,0.600315


In [46]:
def get_combined_recommendations(
    new_user_age: int,
    new_user_gender: str,
    new_user_occupation: str,
    new_user_genres: list,
    user_profiles_df: pd.DataFrame,
    item_data: pd.DataFrame,
    model_user: SentenceTransformer,
    nn_model_user: NearestNeighbors,
    model_genre: SentenceTransformer,
    nn_model_genre: NearestNeighbors,
    top_n_items: int = 10,
    k_similar_users: int = 3,
    WEIGHT_GENRE: float = 0.95,
    WEIGHT_RATING: float = 0.05
):
    """
    Recommend items for a new user via their most similar existing users.

    Steps:
    1. Embed the new user's "<gender> <occupation> <age bucket> <genres>" text
       and find the ``k_similar_users`` nearest existing users.
    2. Run ``get_ranked_recommendations_for_user`` for each of those users.
    3. Pool the per-user lists, average the score per title and return the
       best ``top_n_items`` titles.

    Parameters
    ----------
    new_user_age, new_user_gender, new_user_occupation, new_user_genres :
        Attributes of the new user; ``new_user_genres`` is a list of genre names.
    user_profiles_df : existing user profiles, in the same row order as the
        vectors ``nn_model_user`` was fitted on (rows are selected positionally);
        needs ``user_id``, ``cluster`` and ``genres`` plus the columns read by
        ``get_ranked_recommendations_for_user``.
    item_data : item table with ``title``, ``rating`` and ``genres_string``.
    model_user, nn_model_user : encoder and fitted index for user similarity.
    model_genre, nn_model_genre : encoder and fitted index passed through to
        ``get_ranked_recommendations_for_user``.
    top_n_items : number of titles to return.
    k_similar_users : number of neighbours to consult (capped at the number of
        indexed users).
    WEIGHT_GENRE, WEIGHT_RATING : forwarded to the per-user re-ranker.

    Returns
    -------
    (top_recommendations, nearest_users_report)
        ``top_recommendations`` has columns ``title, rating,
        avg_combined_relevance_score, book_genres`` (an empty DataFrame when no
        neighbour produced recommendations); ``nearest_users_report`` is a list
        of ``{'user_id', 'cluster', 'genres'}`` dicts for the neighbours.

    Notes
    -----
    Items are pooled by ``title`` because the per-user re-ranker does not return
    an item id, so distinct items that share a title are merged into one row.
    """
    # --- Part 1: find the K most similar existing users ---
    new_user_age_bucket = map_age_to_bucket(new_user_age)
    new_user_genre_string = ' '.join(new_user_genres)
    new_user_feature_string = (
        f"{new_user_gender} {new_user_occupation} {new_user_age_bucket} {new_user_genre_string}"
    )
    new_user_embedding = model_user.encode(new_user_feature_string).reshape(1, -1)

    # kneighbors raises if asked for more neighbours than indexed users.
    n_neighbors = min(
        k_similar_users,
        getattr(nn_model_user, 'n_samples_fit_', k_similar_users),
    )
    _, indices = nn_model_user.kneighbors(new_user_embedding, n_neighbors=n_neighbors)

    similar_users_details = (
        user_profiles_df
        .iloc[indices.flatten()][['user_id', 'cluster', 'genres']]
        .reset_index(drop=True)
    )
    # Neighbour context returned alongside the recommendations.
    nearest_users_report = similar_users_details.to_dict('records')

    # --- Part 2: collect per-neighbour recommendations ---
    per_user_recommendations = []
    for user_id in similar_users_details['user_id'].tolist():
        rec_df = get_ranked_recommendations_for_user(
            user_id=user_id,
            user_profile_data=user_profiles_df,
            df_data=item_data,
            model_genre=model_genre,
            nn_model_genre=nn_model_genre,
            WEIGHT_GENRE=WEIGHT_GENRE,
            WEIGHT_RATING=WEIGHT_RATING,
            top_n=20,
            initial_candidates=50
        )
        # Neighbours without usable genres yield a column-less empty frame;
        # keeping those would leave no 'title' column to merge on below.
        if not rec_df.empty:
            per_user_recommendations.append(rec_df)

    if not per_user_recommendations:
        return pd.DataFrame(), nearest_users_report

    final_combined_df = pd.concat(per_user_recommendations, ignore_index=True)

    # Attach each title's genre text. De-duplicate on title alone so a title
    # that appears with several genre strings cannot multiply recommendation
    # rows and inflate the per-title counts.
    title_genres = item_data[['title', 'genres_string']].drop_duplicates(subset='title')
    final_combined_df = final_combined_df.merge(title_genres, on='title', how='left')

    # --- Part 3: aggregate across neighbours and re-rank ---
    final_ranking = (
        final_combined_df
        .groupby('title')
        .agg(
            avg_combined_relevance_score=('combined_relevance_score', 'mean'),
            rating=('rating', 'max'),
            recommendation_count=('user_id', 'count'),
            book_genres=('genres_string', 'first'),
        )
        .reset_index()
        .sort_values(
            by=['avg_combined_relevance_score', 'recommendation_count'],
            ascending=[False, False]
        )
    )

    top_recommendations = final_ranking[[
        'title',
        'rating',
        'avg_combined_relevance_score',
        'book_genres'
    ]].head(top_n_items)

    return top_recommendations, nearest_users_report

In [45]:
# --- Testing for a new user ---
new_user_inputs = {
    'age': 20,
    'gender': 'M',
    'occupation': 'Educator',
    'genres': ['Comedy', 'Romance', 'Fantasy']
}
K_USERS = 3

print("\n--- Generating Combined Recommendations with Context ---")
recommendations_df, user_context = get_combined_recommendations(
    new_user_age=new_user_inputs['age'],
    new_user_gender=new_user_inputs['gender'],
    new_user_occupation=new_user_inputs['occupation'],
    new_user_genres=new_user_inputs['genres'],
    user_profiles_df=user_profiles,
    item_data=df,
    model_user=model_user,
    nn_model_user=nn_model_user,
    # Pass the genre encoder here: the book index was built with model_genre,
    # so the query must be embedded with the same encoder.
    model_genre=model_genre,
    nn_model_genre=nn_model_genre,
    top_n_items=5,
    k_similar_users=K_USERS
)

# --- Print Nearest Users ---
print("-" * 50)
print(f"Nearest {K_USERS} User Details:")
for user in user_context:
    print(f"  - User ID {user['user_id']} (Cluster: {user['cluster']}): Genres {user['genres']}")

# --- Print Top Recommendations with Genres ---
print("\nTop Recommendations with Genres:")
if recommendations_df.empty:
    print("No recommendations available for this user.")
else:
    for rec in recommendations_df.itertuples(index=False):
        print(f"  - Title: {rec.title}")
        print(f"    Rating: {rec.rating}")
        print(f"    Avg Score: {rec.avg_combined_relevance_score:.4f}")
        print(f"    Genres: {rec.book_genres}")
        print("-" * 30)


--- Generating Combined Recommendations with Context ---
--------------------------------------------------
Nearest 3 User Details:
  - User ID 789 (Cluster: 3): Genres ['Comedy', 'Drama', 'Action', 'Thriller', 'Sci-Fi', 'Romance', 'Adventure', 'Crime', 'Horror', "Children's", 'War', 'Animation', 'Mystery', 'Fantasy', 'Musical', 'Western', 'Film-Noir', 'Documentary']
  - User ID 1142 (Cluster: 3): Genres ['Comedy', 'Drama', 'Action', 'Thriller', 'Sci-Fi', 'Romance', 'Adventure', 'Crime', 'Horror', "Children's", 'War', 'Animation', 'Mystery', 'Fantasy', 'Musical', 'Western', 'Film-Noir', 'Documentary']
  - User ID 2270 (Cluster: 3): Genres ['Comedy', 'Drama', 'Action', 'Thriller', 'Sci-Fi', 'Romance', 'Adventure', 'Crime', 'Horror', "Children's", 'War', 'Animation', 'Mystery', 'Fantasy', 'Musical', 'Western', 'Film-Noir', 'Documentary']

Top Recommendations with Genres:
  - Title: Gefrorenes Herz
    Rating: 4.24
    Avg Score: 0.6554
    Genres: ['Young Adult', 'Thriller']
-----------

Observation: different preferences -> different nearest 3 clusters -> yet the book recommendations stay the same.

Hypothesis: many books share overlapping genres -> this likely leads to near-identical genre embeddings (checked in the next cell).

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sanity check on the genre embeddings computed earlier in the notebook:
# how similar are the first 20 books to one another?

# First 20 embedding rows and the matching book rows, kept for inspection.
subset_embeddings = genre_embeddings[:20]
subset_df = df.head(20)

# Pairwise cosine similarity between the selected embeddings (20 x 20).
similarity_matrix = cosine_similarity(subset_embeddings)

# Show the raw matrix.
print(similarity_matrix)


[[0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976]
 [0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976]
 [0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976]
 [0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.99999976]
 [0.99999976 0.99999976 0.99999976 0.99999976 0.99999976 0.99999976
  0.99999976 0.9