In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import re

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# edit these dataset paths as needed

BOOKS_DATASET = 'datasets/bookdata.csv'
CLUSTER_GENRE_COUNTS = 'movies_generated_clusters/cluster_genre_summary.csv'
USER_CLUSTERS_WITH_INFO = 'movies_generated_clusters/user_clusters_with_info.csv'
model_book = SentenceTransformer('all-MiniLM-L6-v2')
model_genre = SentenceTransformer('all-MiniLM-L6-v2')
model_user = SentenceTransformer('all-MiniLM-L6-v2') 

## Training NearestNeighbour Model on books genre embedding

(combined) book_embeddings = (combined genre and description), then get embeddings

In [8]:
df = pd.read_csv(BOOKS_DATASET)
df['description'] = df['description'].fillna('')
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower().strip()
    return text
df['cleaned_description'] = df['description'].apply(clean_text)
df['genres_string'] = df['genres'].fillna('').apply(
        lambda x: ' '.join(x) if isinstance(x, list) else x)

df['unified_feature_string'] = f"This book has genres: {df['genres_string']}. The plot summary is: {df['cleaned_description']}"
book_embeddings = model_book.encode(df['unified_feature_string'].tolist(), show_progress_bar=True)

nn_model_book = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='auto')
nn_model_book.fit(book_embeddings)

  text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
Batches: 100%|██████████| 1640/1640 [08:10<00:00,  3.34it/s]


0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


Reading user segmentation dataset

In [14]:
user_deets = pd.read_csv(USER_CLUSTERS_WITH_INFO)
cluster_genre_counts = pd.read_csv(CLUSTER_GENRE_COUNTS)
ordered_genres = cluster_genre_counts.groupby('cluster')['genre'].apply(list).reset_index(name='genres')
user_profiles = pd.merge(user_deets, ordered_genres, on='cluster', how='left')

## Recommendation functions

In [17]:
# --- 1. HELPER FUNCTION ---
def map_age_to_bucket(age: int) -> str:
    """Maps a raw age value to one of the defined age buckets."""
    if age < 18: return 'Under 18'
    elif 18 <= age <= 24: return '18-24'
    elif 25 <= age <= 34: return '25-34'
    elif 35 <= age <= 44: return '35-44'
    elif 45 <= age <= 49: return '45-49'
    elif 50 <= age <= 55: return '50-55'
    else: return '56+'

# --- 2. MODEL TRAINING SETUP (USER SIMILARITY) ---
print("--- Training User Similarity Model ---")
# Create feature string for existing users
user_profiles['feature_string'] = (
    user_profiles['gender'] + ' ' +
    user_profiles['occupation'] + ' ' +
    user_profiles['age'] + ' ' +
    user_profiles['genres'].apply(lambda x: ' '.join(x))
)

# Train the NearestNeighbors model on existing user embeddings
print("Creating embeddings and training user similarity model...")
user_embeddings = model_user.encode(user_profiles['feature_string'].tolist(), show_progress_bar=False)
nn_model_user = NearestNeighbors(n_neighbors=50, metric='cosine', algorithm='auto')
nn_model_user.fit(user_embeddings)
print("User-based NearestNeighbors model trained successfully.")


--- Training User Similarity Model ---
Creating embeddings and training user similarity model...
User-based NearestNeighbors model trained successfully.


In [18]:
def get_ranked_recommendations_for_user(user_id, user_profile_data, df_data, model_book, nn_model_book, WEIGHT_BOOK = 0.95, WEIGHT_RATING = 0.05, top_n=10, initial_candidates=50):
    """ Content-Based Re-ranking for a single user. **WEIGHTS ADJUSTED**"""
    user_profile = user_profile_data[user_profile_data['user_id'] == user_id].iloc[0]
    genre_list = user_profile['genres']
    
    if not isinstance(genre_list, list) or not genre_list: return pd.DataFrame()

    genre_string = ' '.join(genre_list)
    query_embedding = model_book.encode(genre_string).reshape(1, -1)

    distances, indices = nn_model_book.kneighbors(query_embedding, n_neighbors=initial_candidates)
    candidate_df = df_data.iloc[indices.flatten()].copy()
    candidate_df['genre_distance'] = distances.flatten()
    
    candidate_df['rating'] = pd.to_numeric(candidate_df['rating'], errors='coerce').fillna(0)
    candidate_df['normalized_rating'] = candidate_df['rating'] / 5.0
    candidate_df['normalized_genre_similarity'] = 1 - candidate_df['genre_distance']
    
    candidate_df['combined_relevance_score'] = (WEIGHT_BOOK * candidate_df['normalized_genre_similarity']) + (WEIGHT_RATING * candidate_df['normalized_rating'])
    ranked_recommendations = candidate_df.sort_values(by='combined_relevance_score', ascending=False)

    top_recommendations = ranked_recommendations[['title', 'rating', 'combined_relevance_score', 'description']].head(top_n)
    top_recommendations['user_id'] = user_id
    top_recommendations['cluster'] = user_profile['cluster']
    
    return top_recommendations

In [19]:
# Testing out on one existing user
user_id = 2
get_ranked_recommendations_for_user(user_id, user_profiles, df, model_genre, nn_model_book, top_n=10)

Unnamed: 0,title,rating,combined_relevance_score,description,user_id,cluster
35003,A Lennon Pastiche,5.0,0.463811,A fully immersive multi-media e-book for iPad ...,2,14
34977,Paint Me a Tragedy,5.0,0.463811,This is book 3 in the Witch's Brew series.Cele...,2,14
34980,Lovely Blues (Bluesday Book II),4.66,0.460411,"Beyond the spotlight...\n\n\nBlues singer, Bob...",2,14
34991,তিন গোয়েন্দা ভলিউম ১/১,4.61,0.459911,"তিন গোয়েন্দা, কঙ্কাল দ্বীপ, রূপালী মাকড়সাহাল...",2,14
34959,Gun Control,4.55,0.459311,“It’s pretty amazing how uncommon common sense...,2,14
34973,Martin and Malcolm and America: A Dream or a N...,4.39,0.457711,Reexamines the ideology of the two most promin...,2,14
34978,The Negotiator,4.34,0.457211,FBI agent Dave Richman from Danger in the Shad...,2,14
34968,Syabab Musafir Kasih,4.32,0.457011,Novel ini mengisahkan seorang pemuda yang mala...,2,14
34983,The Hambledown Dream,4.29,0.456711,Australian Denny Banister had it all; a succes...,2,14
34955,Starting Your Best Life Now: A Guide for New A...,4.28,0.456611,Grow your faith and develop a positive attitud...,2,14


In [20]:
def get_combined_recommendations(
    new_user_age: int, 
    new_user_gender: str, 
    new_user_occupation: str, 
    new_user_genres: list, 
    user_profiles_df: pd.DataFrame,
    item_data: pd.DataFrame,
    model_user: SentenceTransformer,
    nn_model_user: NearestNeighbors,
    model_book: SentenceTransformer,
    nn_model_book: NearestNeighbors,
    top_n_items: int = 10, 
    k_similar_users: int = 3,         
    WEIGHT_BOOK: float = 0.95, 
    WEIGHT_RATING: float = 0.05
):
    """
    Master function to get recommendations and return interpretive data:
    1. Finds K nearest neighbor users based on input features (CF step).
    2. Runs the Content-Based re-ranking for each of those K users.
    3. Aggregates and returns the best unique recommendations with supporting context.
    """
    
    # --- Part 1: Find K Similar Users & Extract Details (CF Component) ---
    
    # Prepare new user feature string and embedding (using external helper)
    new_user_age_bucket = map_age_to_bucket(new_user_age)
    new_user_genre_string = ' '.join(new_user_genres)
    new_user_feature_string = (
        f"{new_user_gender} {new_user_occupation} {new_user_age_bucket} {new_user_genre_string}"
    )
    new_user_embedding = model_user.encode(new_user_feature_string).reshape(1, -1)
    
    # Find K most similar users
    distances, indices = nn_model_user.kneighbors(
        new_user_embedding, 
        n_neighbors=k_similar_users
    )
    similar_user_indices = indices.flatten()
    
    # Extract details of similar users
    similar_users_details = user_profiles_df.iloc[similar_user_indices][[
        'user_id', 'cluster', 'genres'
    ]].reset_index(drop=True)
    
    similar_user_ids = similar_users_details['user_id'].tolist()
    
    # Format the cluster and genre info for the final output report
    nearest_users_report = similar_users_details.to_dict('records')

    # --- Part 2: Generate and Aggregate Recommendations ---
    
    all_recommendations = []
    
    for user_id in similar_user_ids:
        # Call the content-based re-ranker
        rec_df = get_ranked_recommendations_for_user(
            user_id=user_id,
            user_profile_data=user_profiles_df, 
            df_data=item_data,                 
            model_book=model_book,
            nn_model_book=nn_model_book,
            WEIGHT_BOOK=WEIGHT_BOOK,
            WEIGHT_RATING=WEIGHT_RATING,
            top_n=20, 
            initial_candidates=50
        )
        all_recommendations.append(rec_df)

    if not all_recommendations:
        return pd.DataFrame(), nearest_users_report
    
    final_combined_df = pd.concat(all_recommendations)
    
    # Merge item genres from the item_data to the combined list
    # Use 'title' as the key assuming it's unique enough or use a unique ID if available (e.g., bookId)
    # Since the original function doesn't return bookId, we rely on title for merging
    final_combined_df = pd.merge(
        final_combined_df,
        item_data[['title', 'genres_string']].drop_duplicates(),
        on='title',
        how='left'
    )
    
    # --- Part 3: Aggregate and Re-rank the Final List ---
    
    # Group by title to aggregate scores across similar users
    final_ranking = final_combined_df.groupby('title').agg(
        avg_combined_relevance_score=('combined_relevance_score', 'mean'),
        max_rating=('rating', 'max'),
        recommendation_count=('user_id', 'count'),
        # Get the genre string of the book (first non-null value)
        book_genres=('genres_string', 'first') 
    ).reset_index()
    
    # Final sort
    final_ranking = final_ranking.sort_values(
        by=['avg_combined_relevance_score', 'recommendation_count'],
        ascending=[False, False]
    )
    
    # Prepare final output columns
    final_ranking.rename(columns={'max_rating': 'rating'}, inplace=True)
    
    # Select final top N items and required columns
    top_recommendations = final_ranking[[
        'title', 
        'rating', 
        'avg_combined_relevance_score', 
        'book_genres'
    ]].head(top_n_items)
    
    return top_recommendations, nearest_users_report

In [22]:
# --- Testing for a new user ---
new_user_inputs = {
    'age': 20,
    'gender': 'M',
    'occupation': 'Educator',
    'genres': ['Comedy', 'Romance', 'Fantasy']
}
K_USERS = 3

print("\n--- Generating Combined Recommendations with Context ---")
recommendations_df, user_context = get_combined_recommendations(
    new_user_age=new_user_inputs['age'], 
    new_user_gender=new_user_inputs['gender'], 
    new_user_occupation=new_user_inputs['occupation'], 
    new_user_genres=new_user_inputs['genres'],
    
    user_profiles_df=user_profiles,   
    item_data=df,                      
    model_user=model_user,
    nn_model_user=nn_model_user,
    model_book=model_book,           
    nn_model_book=nn_model_book,   
    top_n_items=5,           
    k_similar_users=K_USERS
)

# --- Print Nearest Users ---
print("-" * 50)
print(f"Nearest {K_USERS} User Details:")
for user in user_context:
    print(f"  - User ID {user['user_id']} (Cluster: {user['cluster']}): Genres {user['genres']}")

# --- Print Top Recommendations with Genres ---
print("\nTop Recommendations with Genres:")
if not recommendations_df.empty:
    for idx, row in recommendations_df.iterrows():
        print(f"  - Title: {row['title']}")
        print(f"    Rating: {row['rating']}")
        print(f"    Avg Score: {row['avg_combined_relevance_score']:.4f}")
        print(f"    Genres: {row['book_genres']}")
        print("-" * 30)
else:
    print("No recommendations available for this user.")


--- Generating Combined Recommendations with Context ---
--------------------------------------------------
Nearest 3 User Details:
  - User ID 1347 (Cluster: 4): Genres ['Comedy', 'Action', 'Drama', 'Sci-Fi', 'Thriller', 'Adventure', 'Romance', "Children's", 'Animation', 'Horror', 'Crime', 'Musical', 'War', 'Fantasy', 'Mystery', 'Western', 'Film-Noir', 'Documentary']
  - User ID 4517 (Cluster: 4): Genres ['Comedy', 'Action', 'Drama', 'Sci-Fi', 'Thriller', 'Adventure', 'Romance', "Children's", 'Animation', 'Horror', 'Crime', 'Musical', 'War', 'Fantasy', 'Mystery', 'Western', 'Film-Noir', 'Documentary']
  - User ID 585 (Cluster: 4): Genres ['Comedy', 'Action', 'Drama', 'Sci-Fi', 'Thriller', 'Adventure', 'Romance', "Children's", 'Animation', 'Horror', 'Crime', 'Musical', 'War', 'Fantasy', 'Mystery', 'Western', 'Film-Noir', 'Documentary']

Top Recommendations with Genres:
  - Title: A Lennon Pastiche
    Rating: 5.0
    Avg Score: 0.4737
    Genres: []
------------------------------
  - 