<a href="https://colab.research.google.com/github/srewashimondal/NLP-Book-Recommendation/blob/main/Srewashi_AI4ALL_NLP_Book_Rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
files.upload()

ModuleNotFoundError: No module named 'google'

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d dk123891/books-dataset-goodreadsmay-2024

In [None]:
#Unzip the dataset
!unzip books-dataset-goodreadsmay-2024.zip

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("Book_Details.csv")

# Preview the data
df.head(10)

In [None]:
#Data Cleaning and Preparation

In [None]:
#Columns we need right now
df = df[['book_title', 'book_details','publication_info', 'author', 'num_pages', 'genres', 'num_ratings','num_reviews','average_rating']]

In [None]:
#Drop missing values
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
#See cleaned dataset
df.info()
df.head(3)

In [None]:
#Check for duplicates
df['book_title'].duplicated().sum()

#If we find duplicated, keep only first entry
df.drop_duplicates(subset='book_title', keep='first', inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
#Count for each column
df.info()

In [None]:
#Lowercase and strip spaces in text fields like book_title, book_details, genres, and author
text_cols = ['book_title', 'book_details', 'genres', 'author']

for col in text_cols:
    df[col] = df[col].str.lower().str.strip()

In [None]:
#Ensure num_pages, num_ratings, num_reviews, and average_rating are numeric
# numeric_cols = ['num_pages', 'num_ratings', 'num_reviews', 'average_rating']

# for col in numeric_cols:
#     df[col] = pd.to_numeric(df[col], errors='coerce')

#Then drop or fill any NaNs that may have appeared:
# df.dropna(subset=numeric_cols, inplace=True)
# df.reset_index(drop=True, inplace=True)

#Gave me error so I commented out this block of code

In [None]:
#Extract Year from publication_info (for if we want to later filter by publication year)
# df['year_published'] = df['publication_info'].str.extract(r'(\d{4})').astype(float)

#Gave me error so I commented out this block of code

In [None]:
print("Unique values per column:")
print(df.nunique())

print("\nMissing values per column:")
print(df.isnull().sum())

print("\nTop genres:")
print(df['genres'].value_counts().head(10))


In [None]:
df.shape

In [None]:
# shows all 100 rows without truncation
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# df.head(100)
# for the sake of data, I will do df.head(10)
df.head(10)

In [None]:
#Next steps
# clean strings - text data
# fix data types

In [None]:
# Combine important text fields into one column
df['combined_text'] = df['book_title'] + ' ' + df['book_details'] + ' ' + df['genres'] + ' ' + df['author']
df['combined_text'] = df['combined_text'].str.lower()
df['combined_text'] = df['combined_text'].str.replace(r'[^\w\s]', '', regex=True)

# To see it
df[['book_title', 'combined_text']].head(5)

In [None]:
# It’s a method that turns text into numerical vectors and measures how important a word is to a document in a collection (in your case, a book in your dataset)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize vectorizer
# means it will ignore common English words like “the,” “is,” “and,” etc
vectorizer = TfidfVectorizer(stop_words='english')

# Fit learns the vocabulary from data
# Creates a matrix where: each row = one book, each column = one unique word from the full vocabulary, & each value = TF-IDF score for that word in that book. This matrix is called a sparse matrix
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Print shape
print(tfidf_matrix.shape)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Initialize the model. Uses cosine similarity to compare books
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model on the TF-IDF matrix
knn_model.fit(tfidf_matrix)

In [None]:
def recommend_books(prompt, n=5):
    # Preprocess the prompt
    prompt = prompt.lower().strip()

    # Vectorize the prompt using the same TF-IDF vectorizer
    prompt_vector = vectorizer.transform([prompt])

    # Find nearest neighbors (books)
    distances, indices = knn_model.kneighbors(prompt_vector, n_neighbors=n)

    # Return book titles and distances
    return df.iloc[indices[0]][['book_title', 'author', 'genres']]

In [None]:
recommend_books("magic school fantasy", n=5)

In [None]:
# Step 1: Ask the user for their prompt
user_input = input("Describe the kind of book you're looking for: ")

# Step 2: Call the recommendation function with user input
recommendations = recommend_books(user_input, n=5)

# Step 3: Display the results
print("\nTop recommended books:\n")
print(recommendations)

In [None]:
import joblib

# Saves everything I need
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(knn_model, 'knn_model.pkl')
joblib.dump(tfidf_matrix, 'tfidf.pkl')
joblib.dump(df, 'books_df.pkl')

In [None]:
from google.colab import files

files.download('vectorizer.pkl')
files.download('knn_model.pkl')
files.download('tfidf.pkl')
files.download('books_df.pkl')