<a href="https://colab.research.google.com/github/srewashimondal/-My-eCornell-ML-Portfolio/blob/main/Srewashi_AI4ALL_NLP_Book_Rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# upload() returns a {filename: file-bytes} mapping, and echoing it would write
# the contents of kaggle.json (the API key) into the notebook's saved output.
uploaded = files.upload()

# Report file names only, never the contents.
print("Uploaded:", ", ".join(uploaded) or "(nothing)")
if "kaggle.json" not in uploaded:
    print("Warning: kaggle.json was not among the uploaded files; "
          "the Kaggle credential setup below copies a file with that name.")

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle (presumably where the kaggle CLI
# used below looks for credentials — confirm against the CLI docs).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dk123891/books-dataset-goodreadsmay-2024 dataset with the kaggle CLI.
!kaggle datasets download -d dk123891/books-dataset-goodreadsmay-2024

In [None]:
#Unzip the dataset
!unzip books-dataset-goodreadsmay-2024.zip

In [None]:
import pandas as pd

# Read the extracted CSV into the working DataFrame.
csv_path = "Book_Details.csv"
df = pd.read_csv(csv_path)

# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
#Data Cleaning and Preparation

In [None]:
# Restrict the frame to the columns the rest of the notebook works with.
keep_cols = [
    'book_title',
    'book_details',
    'publication_info',
    'author',
    'num_pages',
    'genres',
    'num_ratings',
    'num_reviews',
    'average_rating',
]
df = df[keep_cols]

In [None]:
# Discard every row that has a missing value in any column, then renumber the
# remaining rows from 0.
df = df.dropna().reset_index(drop=True)

In [None]:
# Inspect the cleaned dataset: info() prints the column summary, and the
# first 3 rows are shown as the cell output.
df.info()
df.head(3)

In [None]:
# Check for duplicates: count rows whose title repeats an earlier row.
# The count is printed explicitly because a bare expression is only displayed
# when it is the last line of a cell; here it would be computed and discarded.
n_duplicate_titles = df['book_title'].duplicated().sum()
print(f"Duplicate titles found: {n_duplicate_titles}")

# If duplicates exist, keep only the first entry for each title.
df.drop_duplicates(subset='book_title', keep='first', inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Print the per-column summary (non-null counts and dtypes) of the current frame.
df.info()

In [None]:
# Normalise the free-text columns (title, details, genres, author):
# lower-case every value and trim leading/trailing whitespace.
text_cols = ['book_title', 'book_details', 'genres', 'author']

df[text_cols] = df[text_cols].apply(lambda column: column.str.lower().str.strip())

In [None]:
#Ensure num_pages, num_ratings, num_reviews, and average_rating are numeric
# numeric_cols = ['num_pages', 'num_ratings', 'num_reviews', 'average_rating']

# for col in numeric_cols:
#     df[col] = pd.to_numeric(df[col], errors='coerce')

#Then drop or fill any NaNs that may have appeared:
# df.dropna(subset=numeric_cols, inplace=True)
# df.reset_index(drop=True, inplace=True)

#Gave me error so I commented out this block of code

In [None]:
#Extract Year from publication_info (for if we want to later filter by publication year)
# df['year_published'] = df['publication_info'].str.extract(r'(\d{4})').astype(float)

#Gave me error so I commented out this block of code

In [None]:
# Each entry pairs a heading with the summary printed beneath it:
# distinct values per column, nulls per column, and the ten most frequent
# values of the genres column.
summaries = (
    ("Unique values per column:", df.nunique()),
    ("\nMissing values per column:", df.isnull().sum()),
    ("\nTop genres:", df['genres'].value_counts().head(10)),
)

for heading, summary in summaries:
    print(heading)
    print(summary)


In [None]:
# (number of rows, number of columns) of the current frame.
df.shape

In [None]:
# Widen pandas' display limits so the 100-row preview below is not truncated:
# up to 100 rows, every column, no line-width limit, full cell contents.
# These options are set globally and stay in effect for later cells.
display_options = {
    'display.max_rows': 100,
    'display.max_columns': None,
    'display.width': None,
    'display.max_colwidth': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

df.head(n=100)

In [None]:
#Next steps
# clean strings - text data
# fix data types

In [None]:
# Combine relevant columns into one string per book.
# The column names must match the ones kept during cleaning: the frame has
# 'author' and 'book_details'; 'book_author' and 'description' do not exist
# and raised a KeyError.
df['combined_text'] = (
    df['book_title'].fillna('') + ' ' +
    df['author'].fillna('') + ' ' +
    df['genres'].fillna('') + ' ' +
    df['book_details'].fillna('')
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Number of recommendations scored for every query book.
N_NEIGHBORS = 5

# Build the TF-IDF matrix that is split below. No earlier cell creates
# `book_embeddings`, so on a fresh kernel this cell used to fail with a NameError.
# Prefer the combined title/author/genre/details text when that column exists;
# otherwise fall back to the book description alone.
text_for_embedding = (
    df['combined_text'] if 'combined_text' in df.columns else df['book_details']
)
vectorizer = TfidfVectorizer(stop_words='english')
book_embeddings = vectorizer.fit_transform(text_for_embedding.fillna(''))

# Split TF-IDF matrix and dataframe with the same row partition, so that
# row i of X_train / X_test describes the book in row i of y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    book_embeddings,
    df,
    test_size=0.2,
    random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Initialize 5-Fold Cross Validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

avg_distances = []  # average cosine distance to the nearest neighbors, one entry per fold

for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    X_fold_train = X_train[train_index]
    X_fold_val = X_train[val_index]

    # Fit NearestNeighbors on the training part of the current fold only
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(X_fold_train)

    # Find the nearest training books for each validation book. Validation rows
    # are not part of the fitted data, so no returned neighbor is the query
    # itself: every column is a genuine recommendation and all are averaged.
    # (Dropping the first column would discard the best match.)
    distances, indices = knn.kneighbors(X_fold_val, n_neighbors=N_NEIGHBORS)
    avg_fold_distance = np.mean(distances)

    avg_distances.append(avg_fold_distance)
    print(f"Fold {fold+1}: Average cosine distance to nearest neighbors = {avg_fold_distance:.4f}")


In [None]:
import matplotlib.pyplot as plt

# One x position per recorded fold. Deriving the range and the title from
# avg_distances keeps the plot valid if the number of CV splits changes;
# a hard-coded range(1, 6) would raise a length-mismatch error.
n_folds = len(avg_distances)
fold_numbers = range(1, n_folds + 1)

plt.plot(fold_numbers, avg_distances, marker='o')
plt.title(f"Average Cosine Distance to Nearest Neighbors ({n_folds}-Fold CV)")
plt.xlabel("Fold Number")
plt.ylabel("Avg Cosine Distance")
plt.grid(True)
plt.show()

In [None]:
# Final KNN model trained on full training set
final_knn = NearestNeighbors(metric='cosine', algorithm='brute')
final_knn.fit(X_train)

# Evaluate: find the 5 nearest training books for every held-out test book.
# The test rows were never passed to fit(), so none of the returned neighbors
# is the query book itself; all 5 distances are averaged. Skipping the first
# column would throw away the closest match and overstate the distance.
test_distances, test_indices = final_knn.kneighbors(X_test, n_neighbors=5)
avg_test_distance = np.mean(test_distances)
print(f"\nFinal Evaluation: Avg cosine distance on reserved test set = {avg_test_distance:.4f}")

# Notes for us to consider
- Use all 4-5 models: train each one on the data, then build a dashboard with reports for all 4 models that shows how the dataset performs on each model.
- The purpose is to compare the models.

In [None]:
# Remove and reinstall the IPython package (no version is pinned).
# NOTE(review): this runs from inside a live IPython kernel and the notebook does
# not record why it is needed — TODO confirm it is still required before keeping it.
!pip uninstall ipython -y
!pip install ipython

In [None]:
from IPython import get_ipython

# Gather the raw text of every input recorded by the current session's history
# manager and write it to all_code.py, with a blank line between entries.
shell = get_ipython()
raw_inputs = shell.history_manager.input_hist_raw
all_code = "\n\n".join(raw_inputs)

with open("all_code.py", "w") as out_file:
    out_file.write(all_code)

Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load dataset
df = pd.read_csv("Book_Details.csv")

# Keep necessary columns, drop rows with missing values, keep the first row
# for each title, and renumber the rows. Chaining produces a fresh frame, so the
# column assignments below never write into a slice of the raw data.
df = (
    df[['book_title', 'book_details','publication_info', 'author', 'num_pages', 'genres', 'num_ratings','num_reviews','average_rating']]
    .dropna()
    .drop_duplicates(subset='book_title', keep='first')
    .reset_index(drop=True)
)

# Clean text columns: lower-case and strip surrounding whitespace
text_cols = ['book_title', 'book_details', 'genres', 'author']
for col in text_cols:
    df[col] = df[col].str.lower().str.strip()

# Build the TF-IDF features from the frame cleaned in this cell. Previously
# `book_embeddings` was only assumed to exist: on a fresh kernel that is a
# NameError, and a matrix left over from another cell is not guaranteed to have
# one row per row of this `df`, which would misalign features and targets.
# NOTE(review): the vocabulary is unbounded; consider TfidfVectorizer(max_features=...)
# if training time on the full matrix is a problem.
vectorizer = TfidfVectorizer(stop_words='english')
book_embeddings = vectorizer.fit_transform(df['book_details'])

# Features: TF-IDF matrix; target variable: average_rating
X = book_embeddings
y = df['average_rating']

# Split into training and test sets (80% / 20%, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest Regressor
rf = RandomForestRegressor(n_estimators=20, random_state=42)
rf.fit(X_train, y_train)

# Predict on test data
y_pred = rf.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("=== Random Forest Regression Results ===")
print("Mean Squared Error:", mse)
print("R² Score:", r2)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load dataset
df = pd.read_csv("Book_Details.csv")

