<a href="https://colab.research.google.com/github/sallyp0318/predictive_modeling_ctr/blob/main/Code_Colab_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GoodReads Young Adult Book Reviews

In [None]:
import pandas as pd
import numpy as np
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# spaCy's English model
nlp = spacy.load("en_core_web_sm")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Updated file paths to your Drive's shortcut location
preprocessed_file = "/content/drive/MyDrive/APAN 5430 NLP Group Project/Goodreads/goodreads_young_adult_preprocessed.parquet"

In [None]:
import pandas as pd

df = pd.read_parquet(preprocessed_file)

### Data Prep

#### Packages

In [None]:
!python -m spacy download en_core_web_sm
!pip install -U sentence-transformers

In [None]:
import pandas as pd
import numpy as np
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# spaCy's English model
nlp = spacy.load("en_core_web_sm")

#### Load data

In [None]:
pip install pandas pyarrow

In [None]:
import pandas as pd


# Load the raw Goodreads young-adult dumps (JSON Lines: one record per line).
# NOTE(review): later cells in this section repeat both of these reads with
# identical arguments, so each file is parsed twice on a full run.
books = pd.read_json("goodreads_books_young_adult.json", orient="records", lines=True)
reviews = pd.read_json("goodreads_reviews_young_adult.json", orient="records", lines=True)

In [None]:
# book data
books = pd.read_json("goodreads_books_young_adult.json", orient="records", lines=True)

In [None]:
print(books.columns)

In [None]:
# review data
reviews = pd.read_json("goodreads_reviews_young_adult.json", orient="records", lines=True)

In [None]:
print(reviews.columns)

#### Pulling Relevant Columns

In [None]:
books_columns = [
    'book_id',
    'title',
    'title_without_series',
    'authors',
    'publisher',
    'average_rating',
    'ratings_count',
    'text_reviews_count',
    'description',
    'popular_shelves',
    'similar_books',
    'language_code',
    'format'
]

In [None]:
reviews_columns = [
    'book_id',
    'review_text',
    'rating',
    'date_added',
    'read_at',
    'started_at',
    'user_id',
    'n_votes',
    'n_comments'
]

In [None]:
books = books[books_columns]
reviews = reviews[reviews_columns]

#### Checking dataframe

In [None]:
len(books)

In [None]:
len(reviews)

In [None]:
len(books['book_id'].unique())

In [None]:
len(reviews['book_id'].unique())

In [None]:
books['book_id'].isin(reviews['book_id']).all()

In [None]:
reviews['book_id'].isin(books['book_id']).all()

#### Cutting data (file too large, too many reviews)

Keep only the 30K most-reviewed books, each with at least 5 reviews.

In [None]:
# Drop null and unrated reviews
reviews = reviews.dropna(subset=['review_text'])
reviews = reviews[reviews['rating'] > 0]

In [None]:
len(reviews)

In [None]:
# Count reviews per book
review_counts = reviews.groupby('book_id').size().reset_index(name='review_count')

In [None]:
# Select top 30k books with at least 5 reviews
top_books = (
    review_counts[review_counts['review_count'] >= 5]
    .sort_values('review_count', ascending=False)
    .head(30000)
)
top_book_ids = top_books['book_id'].tolist()

In [None]:
# Filter reviews and books
filtered_reviews = reviews[reviews['book_id'].isin(top_book_ids)]
filtered_books = books[books['book_id'].isin(top_book_ids)]

In [None]:
# Cap reviews per book at 150 with a reproducible random selection.
# Shuffling the whole frame once with a fixed seed and then keeping the first
# 150 rows of every book selects min(len(group), 150) random rows per book
# without replacement. This replaces `groupby(...).apply(lambda g: g.sample(...))`,
# which calls Python once per group and depends on how pandas treats the
# grouping column inside `apply` (deprecated behaviour in recent releases), so
# `book_id` is guaranteed to survive for the merge that follows.
MAX_REVIEWS_PER_BOOK = 150
filtered_reviews = (
    filtered_reviews
    .sample(frac=1, random_state=42)
    .groupby('book_id', sort=False)
    .head(MAX_REVIEWS_PER_BOOK)
    .sort_values('book_id', kind='mergesort')  # stable sort: keep shuffled order within a book
    .reset_index(drop=True)
)

In [None]:
print(f"✅ Filtered books: {len(filtered_books):,}")
print(f"✅ Filtered reviews: {len(filtered_reviews):,}")

#### Merge Books and Reviews

In [None]:
# Merge into books
df = filtered_reviews.merge(filtered_books, on='book_id', how='inner')

#### Save to parquet for loading later

In [None]:
df.to_parquet("goodreads_young_adult_filtered.parquet", index=False)

In [None]:
# to load later
df = pd.read_parquet("goodreads_young_adult_filtered.parquet")

### Preprocess Reviews

In [None]:
def preprocess_text(text):
    """Clean a raw review and return its lemmatized content words.

    The text is stripped of HTML tags and URLs, every character that is not an
    ASCII letter or whitespace is replaced by a space, whitespace is collapsed
    and the result is lowercased. The cleaned string is then passed through
    the module-level spaCy pipeline ``nlp``; the lemma of each alphabetic,
    non-stop-word token is kept when the lemma is longer than one character.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            ``None`` or ``NaN``) is treated as an empty review.

    Returns:
        str: The kept lemmas joined by single spaces, or ``""`` when nothing
        survives cleaning.
    """
    # Missing values reach here as None/NaN; re.sub would raise on them.
    if not isinstance(text, str):
        return ""

    # Basic cleaning
    text = re.sub(r"<.*?>", " ", text)                # Remove HTML
    text = re.sub(r"http\S+|www\S+", " ", text)       # Remove URLs
    # Apostrophes are removed here too, so contractions are split
    # ("couldn't" -> "couldn t") before tokenization.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)          # Remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip().lower()  # Normalize whitespace and lowercase

    # Nothing left to analyse: skip the (comparatively expensive) spaCy call.
    if not text:
        return ""

    # Process with spaCy
    doc = nlp(text)

    tokens = [
        token.lemma_ for token in doc
        if token.is_alpha
        and not token.is_stop
        and len(token.lemma_) > 1
    ]

    return " ".join(tokens)

# Apply to each review
df['clean_review'] = df['review_text'].dropna().apply(preprocess_text)


In [None]:
df.to_parquet("goodreads_young_adult_preprocessed.parquet", index=False)

NameError: name 'df' is not defined

In [None]:
len(df)

1047803

In [None]:
print(df.head())

   book_id                                        review_text  rating  \
0       50  I remember reading this for school way back wh...       5   
1       50  A story of a boy that survives a small plane c...       5   
2       50  I first read this book many years ago (either ...       4   
3       50  Not as good as I remembered... quite possibly ...       3   
4       50  3.5 Stars \n Thanks, Gary Paulsen. \n Now I sh...       3   

                       date_added                         read_at  \
0  Tue Sep 16 08:36:59 -0700 2014                                   
1  Sat Aug 30 14:23:29 -0700 2008  Thu Jan 01 00:00:00 -0800 1987   
2  Mon Dec 13 11:59:19 -0800 2010  Tue Apr 26 00:00:00 -0700 2011   
3  Mon Jan 30 20:43:48 -0800 2012  Mon Jan 30 20:44:26 -0800 2012   
4  Sun Mar 03 16:17:42 -0800 2013                                   

                       started_at                           user_id  n_votes  \
0                                  4ac9790a722813db73a51a479e904a8

In [None]:
print(df.columns.to_list())

['book_id', 'review_text', 'rating', 'date_added', 'read_at', 'started_at', 'user_id', 'n_votes', 'n_comments', 'title', 'title_without_series', 'authors', 'publisher', 'average_rating', 'ratings_count', 'text_reviews_count', 'description', 'popular_shelves', 'similar_books', 'language_code', 'format', 'clean_review']


### Text Vectorization and Similarity

In [None]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [None]:
# Reload the preprocessed reviews saved earlier.
# Reuse the `preprocessed_file` path defined in the Drive-mount cell rather
# than repeating the literal: the copy that lived here spelled the folder
# "My Drive" while the constant spells it "MyDrive", so the two could drift.
df = pd.read_parquet(preprocessed_file)

In [None]:
texts = df['clean_review'].astype(str).tolist()  # Ensure all values are strings

NameError: name 'df' is not defined

#### Corpus average TF_IDF scores

##### Unigram

In [None]:
texts = df['review_text'].astype(str).tolist()

In [None]:
print(df.columns)

Index(['book_id', 'review_text', 'rating', 'date_added', 'read_at',
       'started_at', 'user_id', 'n_votes', 'n_comments', 'title',
       'title_without_series', 'authors', 'publisher', 'average_rating',
       'ratings_count', 'text_reviews_count', 'description', 'popular_shelves',
       'similar_books', 'language_code', 'format', 'clean_review'],
      dtype='object')


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Corpus-level TF-IDF over single words only.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),  # (min_words, max_words)
)
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

# Mean TF-IDF weight of each term across all documents
# (swap .mean for .sum to rank by total weight instead).
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# One row per unigram, highest mean score first.
df_tfidf_uni = pd.DataFrame({'term': feature_names, 'tfidf_score': tfidf_scores})
df_tfidf_uni = df_tfidf_uni.sort_values('tfidf_score', ascending=False)
df_tfidf_uni = df_tfidf_uni.reset_index(drop=True)

print("Top TF-IDF Unigrams:")
df_tfidf_uni.head(10)

Top TF-IDF Unigrams:


Unnamed: 0,term,tfidf_score
0,book,0.054135
1,read,0.031769
2,really,0.026765
3,story,0.025848
4,like,0.02431
5,just,0.023517
6,love,0.022288
7,series,0.019944
8,good,0.019823
9,loved,0.019344


##### Bigram

In [None]:
# Corpus-level TF-IDF over two-word phrases only.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(2, 2),  # (min_words, max_words)
)
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

# Mean TF-IDF weight of each bigram across all documents
# (swap .mean for .sum to rank by total weight instead).
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# One row per bigram, highest mean score first.
df_tfidf = pd.DataFrame({'term': feature_names, 'tfidf_score': tfidf_scores})
df_tfidf = df_tfidf.sort_values('tfidf_score', ascending=False)
df_tfidf = df_tfidf.reset_index(drop=True)

print("Top TF-IDF Bigrams:")
df_tfidf.head(10)

Top TF-IDF Bigrams:


Unnamed: 0,term,tfidf_score
0,review come,0.004266
1,read book,0.003293
2,loved book,0.00306
3,really enjoyed,0.002946
4,main character,0.002449
5,ve read,0.00219
6,really liked,0.002168
7,felt like,0.002164
8,good book,0.00211
9,book series,0.002075


##### Recommended

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Recommended configuration: unigrams and bigrams together, with document
# frequency bounds to trim the vocabulary.
vectorizer = TfidfVectorizer(
    stop_words='english',       # remove English stopwords
    ngram_range=(1, 2),         # unigrams + bigrams (bigrams capture phrases unigrams miss)
    max_df=0.9,                 # drop terms that appear in more than 90% of reviews
    min_df=100                  # drop terms that appear in fewer than 100 reviews
)
tfidf_matrix = vectorizer.fit_transform(texts)

feature_names = vectorizer.get_feature_names_out()

# Mean TF-IDF weight of each term across all documents
# (swap .mean for .sum to rank by total weight instead).
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# One row per term (unigram or bigram), highest mean score first.
df_tfidf = pd.DataFrame({
    'term': feature_names,
    'tfidf_score': tfidf_scores
}).sort_values('tfidf_score', ascending=False).reset_index(drop=True)

# The ranking mixes unigrams and bigrams, so label it accordingly
# (the label previously said "Bigrams").
print("Top TF-IDF Unigrams and Bigrams:")
df_tfidf.head(10)


### Simhash Deduplication

In [None]:
!pip install simhash


Collecting simhash
  Using cached simhash-2.1.2-py3-none-any.whl.metadata (382 bytes)
Using cached simhash-2.1.2-py3-none-any.whl (4.7 kB)
Installing collected packages: simhash
Successfully installed simhash-2.1.2


In [None]:
from simhash import Simhash, SimhashIndex

# Ensure text column is string
df['clean_review'] = df['clean_review'].astype(str)


# Compute Simhash on plain tokens split from text (no weights)
# Use Simhash default setting

# Remove near-duplicate reviews
def compute_simhash(text):
    """Return the integer Simhash fingerprint of ``text``.

    The text is split on whitespace and the resulting token list is handed to
    ``Simhash`` with its default settings; the fingerprint's ``value`` is
    returned.
    """
    return Simhash(text.split()).value

# Fingerprint every cleaned review, then keep the first row for each distinct
# fingerprint and renumber the index.
# NOTE(review): drop_duplicates compares fingerprints for exact equality, so
# only reviews with an identical fingerprint are collapsed; reviews whose
# fingerprints merely differ by a few bits are all kept.
df['simhash'] = df['clean_review'].apply(compute_simhash)
df = df.drop_duplicates(subset=['simhash']).reset_index(drop=True)

##### Overflow error when scaling TF-IDF scores to integer weights

In [None]:
# from simhash import Simhash, SimhashIndex
# import numpy as np
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer

# texts = df['clean_review'].astype(str).tolist()

# # Compute TF-IDF unigram and bigram matrices separately

# # Unigrams
# vectorizer_uni = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
# tfidf_uni_matrix = vectorizer_uni.fit_transform(texts)
# uni_feature_names = vectorizer_uni.get_feature_names_out()
# uni_vocab = set(uni_feature_names)

# # Bigrams
# vectorizer_bi = TfidfVectorizer(stop_words='english', ngram_range=(2, 2))
# tfidf_bi_matrix = vectorizer_bi.fit_transform(texts)
# bi_feature_names = vectorizer_bi.get_feature_names_out()
# bi_vocab = set(bi_feature_names)

# # For each document, get TF-IDF tokens with score above threshold

# f = 64
# objs = []

# for i, text in enumerate(texts):
#     # Vectorize this doc separately for uni and bi
#     uni_vec = vectorizer_uni.transform([text])
#     bi_vec = vectorizer_bi.transform([text])

#     # Get tokens and scores for unigrams above threshold (e.g., 0.1)
#     uni_indices = uni_vec.indices
#     uni_data = uni_vec.data
#     uni_tokens = [(uni_feature_names[idx], uni_data[j]) for j, idx in enumerate(uni_indices) if uni_data[j] > 0.1]

#     # Same for bigrams
#     bi_indices = bi_vec.indices
#     bi_data = bi_vec.data
#     bi_tokens = [(bi_feature_names[idx], bi_data[j]) for j, idx in enumerate(bi_indices) if bi_data[j] > 0.1]

#     # Combine tokens
#     tokens = uni_tokens + bi_tokens

#     # Convert tokens with TF-IDF scores to integer weights for Simhash
#     features = {}
#     for token, score in tokens:
#         weight = max(1, int(score * 10))  # scale and convert to int weight
#         features[token] = weight

#     sh = Simhash(features, f=f)
#     objs.append((str(i), sh))

# # Build Simhash index and map
# hamming_distance = 15
# index = SimhashIndex(objs, k=hamming_distance, f=f)
# simhash_map = dict(objs)

# # Deduplicate texts using Simhash
# seen_ids = set()
# deduplicated_reviews = []

# for i, text in enumerate(texts):
#     current_id = str(i)
#     if current_id in seen_ids:
#         continue
#     duplicates = index.get_near_dups(simhash_map[current_id])
#     seen_ids.update(duplicates)
#     deduplicated_reviews.append({'id': current_id, 'text': text})

# print(f"Original count: {len(texts)}")
# print(f"Deduplicated count: {len(deduplicated_reviews)}")

In [None]:
# # Prepare data with unique IDs
# feeds = []

# for i, n in enumerate(texts):
#     # Making copies of the original newsfeeds to avoid changing the imported data
#     feed = n.copy()
#     feed['id'] = str(i)
#     feeds.append(feed)

# # Construct SimHash Objects
# # Number of bits in Simhash
# f = 64

# # Create list of (id, Simhash) pairs for the titles
# objs = []

# for feed in feeds:
#     sh = Simhash(feed.get('title', ''), f=f)
#     objs.append((feed['id'], sh))

# # Bucket articles based on desited Hamming distance
# hamming_distance = 15                                       # smaller = stricter deduplication
# index = SimhashIndex(objs, k=hamming_distance, f=f)

# # Create a dictionary from list of (id, Simhash) pairs stored in objs
# simhash_map = dict(objs)

In [None]:
# # Deduplicate newsfeeds
# seen_ids = set()
# deduplicated_simhash = []

# for feed in feeds:
#     current_id = feed['id']
#     # Skip if its a known duplicate
#     if current_id in seen_ids:
#         continue

#     # Find all duplicates of current article
#     duplicates = index.get_near_dups(simhash_map[current_id])

#     # Mark all duplicates as seen
#     seen_ids.update(duplicates)

#     # Keep only the first occurrence (current article)
#     deduplicated_simhash.append(feed)

# print(f"Total newfeeds: {len(texts)}")
# print(f"Number of newsfeeds after deduplication: {len(deduplicated_simhash)}")

##### TF-IDF Vectorization

In [None]:
# Only Unigrams to reduce dimensionality
# Limit to top 50k terms
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df['clean_review'])

# TF-IDF Shape (n_samples, n_features)
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")


##### TRY THIS #####
# TF-IDF Vectorization (Unigrams + Bigrams)
# ngram_range=(1, 2)

TF-IDF Matrix Shape: (986418, 50000)


##### Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Recommend Books Based on User Input (Batch-wise Cosine Similarity)
def recommend_books(user_input, top_n=5, batch_size=5000):
    """Rank books by TF-IDF cosine similarity between a query and their reviews.

    The query is vectorized with the module-level ``vectorizer`` and compared
    with every row of the module-level ``tfidf_matrix``, which is expected to
    hold one row per row of ``df``. A book's score is the highest similarity
    reached by any of its reviews.

    Args:
        user_input: Free-text description of what the reader is looking for.
        top_n: Maximum number of books to return.
        batch_size: Number of review rows scored per ``cosine_similarity``
            call. Must be positive.

    Returns:
        pandas.DataFrame: Columns ``book_id`` and ``similarity``, sorted by
        ``similarity`` in descending order, with at most ``top_n`` rows.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``tfidf_matrix``
            and ``df`` do not have the same number of rows (for example when
            ``df`` was filtered after the matrix was fitted).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_reviews = tfidf_matrix.shape[0]
    if n_reviews != len(df):
        raise ValueError(
            f"tfidf_matrix has {n_reviews} rows but df has {len(df)}; "
            "refit the vectorizer on the current df"
        )

    # Vectorize user input
    user_vec = vectorizer.transform([user_input])

    # Compute cosine similarities in batches, keeping each batch as a NumPy
    # array instead of extending a Python list one float at a time.
    batch_scores = [
        cosine_similarity(user_vec, tfidf_matrix[start:start + batch_size]).ravel()
        for start in range(0, n_reviews, batch_size)
    ]
    similarities = np.concatenate(batch_scores) if batch_scores else np.zeros(0)

    # Score on a throwaway frame so that calling this function never adds or
    # overwrites a column on the shared `df`.
    scored = df[['book_id']].assign(similarity=similarities)

    # Aggregate per book (max similarity score per book)
    book_scores = scored.groupby('book_id')['similarity'].max().reset_index()

    # Get Top N recommended books
    return book_scores.sort_values('similarity', ascending=False).head(top_n)

##### Test

In [None]:
# Test

user_query = "A thrilling mystery novel with a strong female lead character"
top_recommendations = recommend_books(user_query, top_n=5)

print("\nTop Book Recommendations:")
print(top_recommendations)



Top Book Recommendations:
        book_id  similarity
29         8957    0.645870
1027     223822    0.611323
300       47763    0.558308
6126    7507908    0.531760
19769  20643052    0.530809


Part 3: BERTopic

In [None]:
!pip install -U bertopic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


In [None]:
sampled_texts = df['clean_review'].astype(str).sample(n=5000, random_state=42).tolist()


In [None]:
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer("all-MiniLM-L6-v2")  # already one of the fastest
sample_embeddings = sbert_model.encode(sampled_texts, show_progress_bar=True, convert_to_numpy=True)

from bertopic import BERTopic

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(sampled_texts, sample_embeddings)

topic_model.get_topic_info().head()
topic_model.visualize_topics()


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [None]:
topic_model.get_topic_info().head()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,35,-1_und_da_die_zu,"[und, da, die, zu, das, saya, ne, shche, biakh...",[gute sterne fur den abschluss die erste seite...
1,0,4292,0_book_read_like_love,"[book, read, like, love, story, character, thi...",[review find live book honestly don people lov...
2,1,187,1_que_de_la_el,"[que, de, la, el, en, es, lo, los, un, libro]",[lara jean es la hija mediana de tres hermanas...
3,2,74,2_smuh_bradwel_illegal_dead,"[smuh, bradwel, illegal, dead, oh, couldn, , ,...","[illegal, smuh, oh bradwel]"
4,3,66,3_wow_amazing_good_meh,"[wow, amazing, good, meh, sniff, great, nope, ...","[wow wow, wow wow wow need amazing, wow good]"


In [None]:
# Small stand-alone frame for the 5,000 sampled reviews: the text plus the
# topic id and topic probability BERTopic assigned to each one. The rows are
# in the same order as `sampled_texts`; the original `df` index is not kept.
df_sampled = pd.DataFrame({
    'clean_review': sampled_texts,
    'topic': topics,
    'topic_prob': probs,
})


In [None]:
df_sampled.head()


Unnamed: 0,clean_review,topic,topic_prob
0,edit add rating book drop star star read chall...,0,1.0
1,spoiler alert plow book enjoy world building o...,0,1.0
2,pandemonium actually well book series delirium...,0,1.0
3,fue un error leer este libro de partes pero la...,1,1.0
4,confused feel literally like neil matt dan was...,0,1.0


User/Book Profile Building

In [None]:
# Draw a reproducible 5,000-review sample and keep only the rows whose
# detected language code is 'en'.
df_sample = df.sample(n=5000, random_state=42).copy()

# NOTE(review): `safe_detect` is not defined or imported anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. It is
# presumably a wrapper around a language detector that returns a code such as
# 'en' -- confirm and restore its definition above this cell.
df_sample['lang'] = df_sample['clean_review'].apply(safe_detect)
df_english_sample = df_sample[df_sample['lang'] == 'en'].copy()

print(f"✅ English-only sample: {len(df_english_sample)} reviews out of 5000")


✅ English-only sample: 4128 reviews out of 5000


In [None]:
# User profile: how many reviews each user wrote and the mean rating they gave.
user_profile = (
    df_english_sample
    .groupby('user_id')
    .agg(
        review_count=('clean_review', 'count'),
        avg_rating=('rating', 'mean'),
    )
    .reset_index()
)

user_profile.head()


Unnamed: 0,user_id,review_count,avg_rating
0,000883382802f2d95a3dd545bb953882,1,1.0
1,00238d8a4c276c47f5d5e242f54a8f28,1,3.0
2,00268e2e7b05159626c6dfff078aa795,2,4.0
3,002eff40d3de8ff36174a48d26d93da7,1,3.0
4,00499ff082d215fa9114b19ed6060041,1,5.0


In [None]:
# Book profile: how many reviews each book received and its mean rating.
book_profile = (
    df_english_sample
    .groupby('book_id')
    .agg(
        review_count=('clean_review', 'count'),
        avg_rating=('rating', 'mean'),
    )
    .reset_index()
)

book_profile.head()


Unnamed: 0,book_id,review_count,avg_rating
0,4325,2,4.0
1,4580,1,4.0
2,8957,1,4.0
3,8960,1,2.0
4,8962,1,4.0


In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Re-create the sentence encoder here so this section can be re-run on its own
# after a kernel crash.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode English reviews
english_reviews = df_english_sample['clean_review'].tolist()
english_embeddings = sbert_model.encode(
    english_reviews,
    show_progress_bar=True,
    convert_to_numpy=True
)

# Create and fit BERTopic model.
# calculate_probabilities=True makes fit_transform return a probability for
# every topic for every document (n_docs x n_topics). With the default setting
# `probs` holds a single number per document -- the probability of its assigned
# topic -- so the per-user / per-book "topic distributions" built from it
# collapse to one value (the `[1.0]` entries seen in the profile tables).
topic_model = BERTopic(calculate_probabilities=True)
topics, probs = topic_model.fit_transform(english_reviews, english_embeddings)

# Add topic labels and probability vectors to DataFrame.
# `list(probs)` stores one 1-D array per row; a 2-D array cannot be assigned
# to a single column directly.
df_english_sample['topic'] = topics
df_english_sample['topic_probs_vector'] = list(probs)


Batches:   0%|          | 0/129 [00:00<?, ?it/s]

In [None]:
#average topic per user
import numpy as np

# Convert topic vector list to NumPy arrays
df_english_sample['topic_probs_vector'] = df_english_sample['topic_probs_vector'].apply(np.array)

# Average topic distribution per user
user_topic_profiles = df_english_sample.groupby('user_id')['topic_probs_vector'].apply(
    lambda x: np.mean(np.vstack(x), axis=0)
).reset_index()

user_topic_profiles.head()


Unnamed: 0,user_id,topic_probs_vector
0,000883382802f2d95a3dd545bb953882,[1.0]
1,00238d8a4c276c47f5d5e242f54a8f28,[1.0]
2,00268e2e7b05159626c6dfff078aa795,[1.0]
3,002eff40d3de8ff36174a48d26d93da7,[1.0]
4,00499ff082d215fa9114b19ed6060041,[1.0]


In [None]:
#average topic per book
book_topic_profiles = df_english_sample.groupby('book_id')['topic_probs_vector'].apply(
    lambda x: np.mean(np.vstack(x), axis=0)
).reset_index()

book_topic_profiles.head()


Unnamed: 0,book_id,topic_probs_vector
0,4325,[0.8809942672143469]
1,4580,[1.0]
2,8957,[1.0]
3,8960,[1.0]
4,8962,[1.0]


In [None]:
user_full_profile = user_profile.merge(user_topic_profiles, on='user_id', how='left')
book_full_profile = book_profile.merge(book_topic_profiles, on='book_id', how='left')


In [None]:
user_full_profile.head()

Unnamed: 0,user_id,review_count,avg_rating,topic_probs_vector
0,000883382802f2d95a3dd545bb953882,1,1.0,[1.0]
1,00238d8a4c276c47f5d5e242f54a8f28,1,3.0,[1.0]
2,00268e2e7b05159626c6dfff078aa795,2,4.0,[1.0]
3,002eff40d3de8ff36174a48d26d93da7,1,3.0,[1.0]
4,00499ff082d215fa9114b19ed6060041,1,5.0,[1.0]


In [None]:
book_full_profile.head()

Unnamed: 0,book_id,review_count,avg_rating,topic_probs_vector
0,4325,2,4.0,[0.8809942672143469]
1,4580,1,4.0,[1.0]
2,8957,1,4.0,[1.0]
3,8960,1,2.0,[1.0]
4,8962,1,4.0,[1.0]


#### ABSA

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Using cached spacy-3.8.7-cp310-cp310-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.13-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.10-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp310-cp310-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy

In [None]:
aspects = [
    "characters", "plot", "writing", "ending", "romance", "story",
    "pace", "world building", "main character", "love story"
]

In [None]:
import spacy
from collections import defaultdict
from tqdm import tqdm

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Make tqdm work with pandas
tqdm.pandas()

# VADER already loaded as: sia = SentimentIntensityAnalyzer()

def extract_aspect_sentiments(text, aspects):
    """Label every aspect mentioned in a review with sentence-level sentiment.

    The review is lowercased and split into sentences with the module-level
    spaCy pipeline ``nlp``. Each sentence that mentions at least one aspect is
    scored with ``sia.polarity_scores`` and its ``compound`` score is mapped to
    ``'positive'`` (> 0.2), ``'negative'`` (< -0.2) or ``'neutral'``; that
    label is appended to the list of every aspect found in the sentence.

    Aspects are matched as whole words or phrases, case-insensitively.
    Inflected forms are not matched (``"plot"`` does not match ``"plots"``),
    so list each form that should count.

    NOTE(review): ``sia`` is not defined in the visible notebook; per the
    comment above this function it is expected to be a VADER
    ``SentimentIntensityAnalyzer`` created in an earlier session -- confirm
    and define it before this cell.

    Args:
        text: Review text. Non-string or empty values yield ``{}``.
        aspects: Iterable of aspect terms (strings) to look for.

    Returns:
        dict: Maps each aspect that occurs in the review to a list with one
        sentiment label per sentence mentioning it. Aspects that never occur
        are absent.
    """
    if not isinstance(text, str) or not text or not aspects:
        return {}

    # Whole-word patterns, built once per call. Plain substring tests counted
    # "pace" inside "space" and "ending" inside "spending". Lookarounds are
    # used instead of \b so aspects that start or end with a non-word
    # character still match correctly.
    aspect_patterns = [
        (aspect, re.compile(r"(?<!\w)" + re.escape(aspect.lower()) + r"(?!\w)"))
        for aspect in aspects
    ]

    aspect_sentiments = defaultdict(list)
    doc = nlp(text.lower())  # lowercase for matching
    for sent in doc.sents:
        sent_text = sent.text
        mentioned = [
            aspect for aspect, pattern in aspect_patterns
            if pattern.search(sent_text)
        ]
        if not mentioned:
            continue  # no aspect here: skip the sentiment scoring entirely

        sentiment_score = sia.polarity_scores(sent_text)['compound']
        if sentiment_score > 0.2:
            sentiment = 'positive'
        elif sentiment_score < -0.2:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'

        for aspect in mentioned:
            aspect_sentiments[aspect].append(sentiment)
    return dict(aspect_sentiments)

In [None]:
# Run the aspect/sentiment extraction on the raw review text rather than on
# `clean_review`. The cleaned column has punctuation, stop words and
# inflections removed, so it cannot be split into real sentences, the
# sentiment scorer loses the punctuation and negation cues it relies on, and
# plural aspects such as "characters" almost never match the lemmatized text.
df['aspect_sentiments'] = (
    df['review_text']
    .astype(str)
    .progress_apply(lambda review: extract_aspect_sentiments(review, aspects))
)

100%|██████████| 986418/986418 [3:48:12<00:00, 72.04it/s]   


In [None]:
from collections import Counter

# Tally, for every aspect, how many sentences carried each sentiment label.
aspect_sentiment_counter = defaultdict(Counter)

for per_review in df['aspect_sentiments']:
    for aspect_name in per_review:
        aspect_sentiment_counter[aspect_name].update(per_review[aspect_name])

# One row per aspect, one column per sentiment label, plus a row total;
# most frequently mentioned aspects first.
df_aspect_summary = pd.DataFrame(aspect_sentiment_counter).T.fillna(0)
df_aspect_summary['total'] = df_aspect_summary.sum(axis=1)
df_aspect_summary = df_aspect_summary.sort_values(by='total', ascending=False)

df_aspect_summary.head(10)

Unnamed: 0,negative,positive,neutral,total
story,41432,374503,15555,431490
plot,16197,127380,5638,149215
romance,4450,102992,1619,109061
ending,9854,87460,4531,101845
writing,7571,77421,2586,87578
main character,8271,65694,2436,76401
pace,4994,50935,1975,57904
love story,969,30823,364,32156
world building,1145,15431,385,16961
characters,6,49,4,59


Goal:
To find out what readers are talking about (aspects like “plot”, “characters”) and how they feel about each one.

Steps:
Step 1: Define Aspect Terms
We used a curated list of common literary review topics, including:
["characters", "plot", "writing", "ending", "romance", "story", "pace", "world building", "main character", "love story"]
These were selected based on:
	•	Domain knowledge of book reviews
	•	The top TF-IDF unigrams and bigrams

Step 2: Sentence-Level Analysis
	•	We used spaCy to break each review into sentences.
	•	For each sentence, we:
	•	Checked if it mentioned an aspect (e.g., "plot").
	•	Scored its sentiment using VADER.
	•	Assigned that sentiment to the aspect(s) in the sentence.

Step 3: Aggregate Sentiment for Each Aspect
We counted how often each aspect was mentioned with positive, negative, or neutral sentiment.

Interpretation:
	•	Story, plot, and romance were the most discussed aspects.
	•	Most sentiments were positive, suggesting strong reader engagement and satisfaction.
	•	Aspects like ending and main character showed more sentiment polarization (a mix of love/hate).
