In [61]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import pickle

In [62]:
merged_df=pd.read_csv(r'/Users/priyankamalavade/Desktop/Book_recommendation_system_mini_project/data/books_with_clusters.csv')
merged_df.head()

Unnamed: 0,Book Name,Author,Rating,Number of Reviews,Price,Rank,Genre,Listening Time (minutes),cleaned_description,cluster
0,Think Like a Monk: The Secret of How to Harnes...,Jay Shetty,4.9,342.0,10080.0,1,Society & Culture (Books),654.0,over past three year jay shetty become one wor...,3
1,Ikigai: The Japanese Secret to a Long and Happ...,Héctor García,4.6,3670.0,615.0,2,Personal Success,203.0,brought penguin,4
2,The Subtle Art of Not Giving a F*ck: A Counter...,Mark Manson,4.4,20240.0,10378.0,3,Personal Development & Self-Help,317.0,in guide superstar blogger cut crap show u sto...,3
3,Atomic Habits: An Easy and Proven Way to Build...,James Clear,4.6,4646.0,888.0,5,Personal Success,335.0,brought penguin,4
4,Life's Amazing Secrets: How to Find Balance an...,Gaur Gopal Das,4.6,4305.0,1005.0,6,Spiritualism,385.0,stop going life start growing life,3


In [63]:
# Content-Based Filtering: Cosine similarity using genres, authors, and cleaned descriptions
def calculate_similarity(df):

    # Fill NaN values with empty strings in the relevant columns
    df['Genre'] = df['Genre'].fillna('')
    df['Author'] = df['Author'].fillna('')
    df['cleaned_description'] = df['cleaned_description'].fillna('')
    
    
    df['combined_features'] = df['Genre'].astype(str) + ' ' + df['Author'] + ' ' + df['cleaned_description']
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['combined_features'])
    cosine_sim = cosine_similarity(tfidf_matrix)
    
    # Save the cosine similarity matrix using pickle
    with open('/Users/priyankamalavade/Desktop/Book_recommendation_system_mini_project/models/cosine_similarity_matrix.pkl', 'wb') as file:
        pickle.dump(cosine_sim, file)
    
    return cosine_sim


In [64]:
# Function to recommend books based on a given book (by index)
def recommend_books(book_index, cosine_sim, df, top_n=5):
    similar_books = list(enumerate(cosine_sim[book_index]))
    sorted_books = sorted(similar_books, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommendations = [df.iloc[i[0]]['Book Name'] for i in sorted_books]
    return recommendations


In [65]:
# Clustering-based recommendations using KMeans
def perform_clustering(df, n_clusters=5):
    # Using numerical features for clustering
    X = df[['Rating', 'Number of Reviews', 'Price', 'Rank', 'Listening Time (minutes)']]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(X)
    
    # Save the clustering model using pickle
    with open('/Users/priyankamalavade/Desktop/Book_recommendation_system_mini_project/models/kmeans_clustering_model.pkl', 'wb') as file:
        pickle.dump(kmeans, file)
    
    return kmeans

In [66]:
# Recommend books from the same cluster
def recommend_from_cluster(book_index, df, top_n=5):
    cluster_label = df.iloc[book_index]['cluster']
    cluster_books = df[df['cluster'] == cluster_label].head(top_n)['Book Name'].tolist()
    return cluster_books


In [67]:
# Hybrid: Combine content-based and clustering recommendations
def hybrid_recommendation(book_index, cosine_sim, df, top_n=5):
    content_recommendations = recommend_books(book_index, cosine_sim, df, top_n=top_n)
    cluster_recommendations = recommend_from_cluster(book_index, df, top_n=top_n)
    return list(set(content_recommendations + cluster_recommendations))



In [68]:
# Precision and Recall calculation
def calculate_precision_recall(recommended, actual):
    recommended_set = set(recommended)
    actual_set = set(actual)

    true_positives = len(recommended_set.intersection(actual_set))
    precision = true_positives / len(recommended_set) if recommended_set else 0
    recall = true_positives / len(actual_set) if actual_set else 0

    return precision, recall

In [69]:
# RMSE Calculation using Linear Regression for rating prediction
def calculate_rmse(df):
    # Train a simple Linear Regression model for rating prediction
    X = df[['Rank', 'Number of Reviews', 'Price', 'Listening Time (minutes)']]
    y = df['Rating']

    model = LinearRegression()
    model.fit(X, y)
    
    # Make predictions
    predicted_ratings = model.predict(X)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y, predicted_ratings)
    
    # Take square root of MSE to get RMSE
    rmse = np.sqrt(mse)
    
    # Save the linear regression model
    with open('/Users/priyankamalavade/Desktop/Book_recommendation_system_mini_project/models/linear_regression_model.pkl', 'wb') as file:
        pickle.dump(model, file)
    
    return rmse

In [70]:
# Step 1: Calculate similarity for content-based filtering
cosine_sim = calculate_similarity(merged_df)


In [73]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.02993284, ..., 0.15960031, 0.0440914 ,
        0.        ],
       [0.        , 0.02993284, 1.        , ..., 0.        , 0.11113278,
        0.        ],
       ...,
       [0.        , 0.15960031, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.0440914 , 0.11113278, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]], shape=(1642, 1642))

In [71]:
# Step 2: Perform clustering
perform_clustering(merged_df)


In [72]:
# Step 3: Get hybrid recommendations for a given book index (example index 0)
book_index = 0
hybrid_recommendations = hybrid_recommendation(book_index, cosine_sim, merged_df, top_n=5)
print(f'Hybrid Recommendations: {hybrid_recommendations}')


Hybrid Recommendations: ['Everything Is F*cked: A Book About Hope', 'The 5AM Club: Own Your Morning. Elevate Your Life.', 'The Book of Why: The New Science of Cause and Effect', 'The Order of Time: Narrated by Benedict Cumberbatch', 'Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now', 'Influence: The Psychology of Persuasion', 'The Facebook Effect: The Inside Story of the Company That Is Connecting the World', 'The Intelligent Investor Rev Ed.', 'The Rise and Fall of the Dinosaurs: The Untold Story of a Lost World', 'Das Think Like a Monk-Prinzip: Finde innere Ruhe und Kraft für ein erfülltes und sinnvolles Leben']


In [75]:
merged_df['Book Name'].unique()

array(['Think Like a Monk: The Secret of How to Harness the Power of Positivity and Be Happy Now',
       'Ikigai: The Japanese Secret to a Long and Happy Life',
       'The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life',
       ..., 'Terra Incognita: 100 Maps to Survive the Next 100 Years',
       'Universal Mind Power: New Silva Method Techniques for Developing Your Ideal Self',
       "Dr. Bernstein's Diabetes Solution: The Complete Guide to Achieving Normal Blood Sugars"],
      shape=(1616,), dtype=object)

In [79]:
actual_books = ['Everything Is F*cked: A Book About Hope','Ikigai: The Japanese Secret to a Long and Happy Life','Universal Mind Power: New Silva Method Techniques for Developing Your Ideal Self','The 5AM Club: Own Your Morning. Elevate Your Life.','The Order of Time: Narrated by Benedict Cumberbatch']  # Replace with actual books liked by the user
precision, recall = calculate_precision_recall(hybrid_recommendations, actual_books)
print(f'Precision: {precision}, Recall: {recall}')


Precision: 0.3, Recall: 0.6


In [80]:
# Calculate RMSE
rmse = calculate_rmse(merged_df)
print(f'RMSE: {rmse}')


RMSE: 1.625641464505686
