# Book Recommendation System - Data Exploration

This notebook explores the original Goodbooks 10K dataset to understand:
- Dataset structure and statistics
- Genre distributions
- User-book interaction patterns
- Identification of Adventure and Mystery books for bias experiments

In [10]:
import os
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Display configuration: never truncate DataFrame columns, start from
# matplotlib's default style, then enlarge the default figure size
pd.options.display.max_columns = None
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Load Original Data

In [11]:
# Read the four Goodbooks CSV files from the original-data directory
data_path = '../data/original/goodbooks-10k'

books, ratings, book_tags, tags = (
    pd.read_csv(f'{data_path}/{table}.csv')
    for table in ('books', 'ratings', 'book_tags', 'tags')
)

# Report (rows, columns) for each table as a quick sanity check
books_shape, ratings_shape = books.shape, ratings.shape
book_tags_shape, tags_shape = book_tags.shape, tags.shape
print(f"📚 Books: {books_shape}")
print(f"⭐ Ratings: {ratings_shape}")
print(f"🏷️  Book Tags: {book_tags_shape}")
print(f"🏷️  Tags: {tags_shape}")

📚 Books: (10000, 23)
⭐ Ratings: (5976479, 3)
🏷️  Book Tags: (999912, 3)
🏷️  Tags: (34252, 2)


## 2. Basic Dataset Statistics

In [12]:
# Headline counts, computed up front so the report below is just formatting
n_books = books['book_id'].nunique()
n_users = ratings['user_id'].nunique()
n_ratings = len(ratings)
n_tags = tags['tag_name'].nunique()

print("=== DATASET OVERVIEW ===")
print(f"📖 Total books: {n_books:,}")
print(f"👥 Total users: {n_users:,}")
print(f"⭐ Total ratings: {n_ratings:,}")
print(f"🏷️  Total tags: {n_tags:,}")

# Rating distribution: number of ratings per rating value, lowest value first
print("\n=== RATING DISTRIBUTION ===")
rating_dist = ratings['rating'].value_counts().sort_index()
for star, n_star in rating_dist.items():
    share = (n_star / n_ratings) * 100
    print(f"Rating {star}: {n_star:,} ({share:.1f}%)")

# Mean over every individual rating row
avg_rating = ratings['rating'].mean()
print(f"\n📊 Average rating: {avg_rating:.2f}")

=== DATASET OVERVIEW ===
📖 Total books: 10,000
👥 Total users: 53,424
⭐ Total ratings: 5,976,479
🏷️  Total tags: 34,252

=== RATING DISTRIBUTION ===
Rating 1: 124,195 (2.1%)
Rating 2: 359,257 (6.0%)
Rating 3: 1,370,916 (22.9%)
Rating 4: 2,139,018 (35.8%)
Rating 5: 1,983,093 (33.2%)

📊 Average rating: 3.92


## 3. Genre Analysis Using Tags

In [13]:
# Attach the human-readable tag_name to every (book, tag, count) row
book_tag_info = pd.merge(book_tags, tags, on='tag_id')

# Case-insensitive substring/regex match on the tag name; missing names never match
tag_names = book_tag_info['tag_name']
is_adventure_tag = tag_names.str.contains('adventure', case=False, na=False)
is_mystery_tag = tag_names.str.contains('mystery|detective|crime|thriller', case=False, na=False)

adventure_tags = book_tag_info.loc[is_adventure_tag]
mystery_tags = book_tag_info.loc[is_mystery_tag]

# Show the first 20 distinct tag names matched for each genre
print("=== ADVENTURE TAGS ===")
print(adventure_tags['tag_name'].unique()[:20])

print("\n=== MYSTERY/THRILLER TAGS ===")
print(mystery_tags['tag_name'].unique()[:20])

=== ADVENTURE TAGS ===
['adventure' 'action-adventure' 'travel-adventure' 'adventure-travel'
 'travel-and-adventure' 'fantasy-adventure' 'adventures' 'adventurers'
 'true-adventure' 'outdoor-adventure' 'adventure-books'
 'adventure-outdoors' 'adventure-survival' 'non-fiction-adventure'
 'adventure-nonfiction' 'outdoors-adventure' 'adventure-thriller'
 'mystery-adventure' 'adventure-spirituality' 'adventure-fiction']

=== MYSTERY/THRILLER TAGS ===
['mystery' 'thriller' 'crime' 'mystery-thriller' 'thrillers'
 'crime-thriller' 'detective' 'detectives' 'mystery-crime'
 'mystery-suspense' 'crime-fiction' 'crime-mystery' 'eco-thriller'
 'historical-mystery' 'mystery-and-suspense' 'suspense-thriller'
 'thriller-mystery' 'thriller-suspense' 'murder-mystery'
 'mystery-thrillers']


In [14]:
# Total the `count` column per tag name and rank tags from most to least used
popular_tags = (
    book_tag_info
    .groupby('tag_name')['count']
    .sum()
    .sort_values(ascending=False)
)

print("=== TOP 20 MOST POPULAR TAGS ===")
top_tags = popular_tags.head(20)
for rank, (tag_name, total) in enumerate(top_tags.items(), start=1):
    print(f"{rank:2d}. {tag_name}: {total:,}")

=== TOP 20 MOST POPULAR TAGS ===
 1. to-read: 140,718,761
 2. currently-reading: 7,507,958
 3. favorites: 4,503,173
 4. fiction: 3,688,819
 5. fantasy: 3,548,157
 6. young-adult: 1,848,306
 7. classics: 1,756,920
 8. books-i-own: 1,317,235
 9. romance: 1,231,926
10. owned: 1,224,279
11. ya: 898,334
12. mystery: 872,282
13. non-fiction: 857,901
14. historical-fiction: 815,421
15. series: 782,637
16. science-fiction: 703,866
17. sci-fi: 597,325
18. paranormal: 542,559
19. kindle: 506,882
20. contemporary: 486,001


## 4. Identify Adventure and Mystery Books

In [15]:
# Define Adventure and Mystery related keywords
adventure_keywords = ['adventure', 'quest', 'journey', 'exploration']
mystery_keywords = ['mystery', 'detective', 'crime', 'thriller', 'suspense', 'murder', 'investigation']


def get_genre_books(keywords, min_tag_count=5, tag_info=None):
    """Find books whose tags mention any of the given genre keywords.

    A tag matches when its name contains a keyword as a case-insensitive
    substring. Matching is by substring, not whole word, so e.g. 'quest'
    also matches a tag named 'request'. Keywords are treated as literal
    text, never as regular expressions.

    Parameters
    ----------
    keywords : iterable of str, or str
        Keywords to look for in tag names. A single string is treated as one
        keyword. Empty strings are ignored; with no usable keywords the
        result is empty.
    min_tag_count : int, default 5
        Minimum summed `count` over a book's matching tags for the book to
        be kept.
    tag_info : pandas.DataFrame, optional
        Frame with `goodreads_book_id`, `tag_name` and `count` columns.
        Defaults to the notebook-level `book_tag_info`.

    Returns
    -------
    pandas.DataFrame
        Columns `goodreads_book_id` and `count` (summed over matching tags),
        one row per book that meets `min_tag_count`.
    """
    if tag_info is None:
        tag_info = book_tag_info

    # A bare string would otherwise be split into single characters by join()
    if isinstance(keywords, str):
        keywords = [keywords]

    # Escape regex metacharacters so keywords such as 'c++' match literally
    escaped = [re.escape(kw) for kw in keywords if kw]

    if escaped:
        is_genre_tag = tag_info['tag_name'].str.contains(
            '|'.join(escaped), case=False, na=False, regex=True
        )
    else:
        # An empty pattern would match every tag; no keywords means no matches
        is_genre_tag = pd.Series(False, index=tag_info.index)

    # Group by book and sum tag counts
    genre_books = (
        tag_info[is_genre_tag]
        .groupby('goodreads_book_id')['count']
        .sum()
        .reset_index()
    )

    return genre_books[genre_books['count'] >= min_tag_count]

# Run the tag-based lookup once per genre, using the default tag-count threshold
adventure_books = get_genre_books(keywords=adventure_keywords)
mystery_books = get_genre_books(keywords=mystery_keywords)

n_adventure_found, n_mystery_found = len(adventure_books), len(mystery_books)
print(f"🗺️  Adventure books found: {n_adventure_found}")
print(f"🔍 Mystery books found: {n_mystery_found}")

🗺️  Adventure books found: 3525
🔍 Mystery books found: 4084


In [16]:
# Lookup table: goodreads_book_id -> book_id (the ID used in the ratings table)
book_id_mapping = books.set_index('goodreads_book_id')['book_id'].to_dict()

# Translate each genre's Goodreads IDs, dropping any ID without a mapping
adventure_book_ids = [
    book_id_mapping[gid]
    for gid in adventure_books['goodreads_book_id']
    if book_id_mapping.get(gid) is not None
]
mystery_book_ids = [
    book_id_mapping[gid]
    for gid in mystery_books['goodreads_book_id']
    if book_id_mapping.get(gid) is not None
]

print(f"🗺️  Adventure books with ratings: {len(adventure_book_ids)}")
print(f"🔍 Mystery books with ratings: {len(mystery_book_ids)}")

# Show the first ten matches per genre, in the order they appear in `books`
print("\n=== SAMPLE ADVENTURE BOOKS ===")
sample_adventure = books.loc[books['book_id'].isin(adventure_book_ids), ['title', 'authors']].head(10)
for title, authors in sample_adventure.itertuples(index=False, name=None):
    print(f"• {title} by {authors}")

print("\n=== SAMPLE MYSTERY BOOKS ===")
sample_mystery = books.loc[books['book_id'].isin(mystery_book_ids), ['title', 'authors']].head(10)
for title, authors in sample_mystery.itertuples(index=False, name=None):
    print(f"• {title} by {authors}")

🗺️  Adventure books with ratings: 3525
🔍 Mystery books with ratings: 4084

=== SAMPLE ADVENTURE BOOKS ===
• The Hunger Games (The Hunger Games, #1) by Suzanne Collins
• Harry Potter and the Sorcerer's Stone (Harry Potter, #1) by J.K. Rowling, Mary GrandPré
• The Hobbit by J.R.R. Tolkien
• Angels & Demons  (Robert Langdon, #1) by Dan Brown
• Divergent (Divergent, #1) by Veronica Roth
• The Girl with the Dragon Tattoo (Millennium, #1) by Stieg Larsson, Reg Keeland
• Catching Fire (The Hunger Games, #2) by Suzanne Collins
• Harry Potter and the Prisoner of Azkaban (Harry Potter, #3) by J.K. Rowling, Mary GrandPré, Rufus Beck
• The Fellowship of the Ring (The Lord of the Rings, #1) by J.R.R. Tolkien
• Mockingjay (The Hunger Games, #3) by Suzanne Collins

=== SAMPLE MYSTERY BOOKS ===
• The Hunger Games (The Hunger Games, #1) by Suzanne Collins
• Harry Potter and the Sorcerer's Stone (Harry Potter, #1) by J.K. Rowling, Mary GrandPré
• To Kill a Mockingbird by Harper Lee
• Angels & Demons  (R

## 5. Rating Patterns for Genre Books

In [17]:
# Flag each rating row by whether its book is in a genre ID list
rated_book = ratings['book_id']
is_adventure = rated_book.isin(adventure_book_ids)
is_mystery = rated_book.isin(mystery_book_ids)

adventure_ratings = ratings[is_adventure]
mystery_ratings = ratings[is_mystery]
# "Other" = ratings for books in neither list. A book may be in both lists,
# so the three shares printed below need not add up to 100%.
other_ratings = ratings[~(is_adventure | is_mystery)]

n_total = len(ratings)

print("=== RATING STATISTICS BY GENRE ===")
print(f"🗺️  Adventure ratings: {len(adventure_ratings):,} ({len(adventure_ratings) / n_total * 100:.1f}%)")
print(f"🔍 Mystery ratings: {len(mystery_ratings):,} ({len(mystery_ratings) / n_total * 100:.1f}%)")
print(f"📚 Other ratings: {len(other_ratings):,} ({len(other_ratings) / n_total * 100:.1f}%)")

# Mean rating value within each subset, plus the overall mean for comparison
adventure_mean = adventure_ratings['rating'].mean()
mystery_mean = mystery_ratings['rating'].mean()
other_mean = other_ratings['rating'].mean()
overall_mean = ratings['rating'].mean()

print("\n=== AVERAGE RATINGS BY GENRE ===")
print(f"🗺️  Adventure average: {adventure_mean:.2f}")
print(f"🔍 Mystery average: {mystery_mean:.2f}")
print(f"📚 Other average: {other_mean:.2f}")
print(f"🌐 Overall average: {overall_mean:.2f}")

=== RATING STATISTICS BY GENRE ===
🗺️  Adventure ratings: 2,490,467 (41.7%)
🔍 Mystery ratings: 2,438,080 (40.8%)
📚 Other ratings: 2,318,937 (38.8%)

=== AVERAGE RATINGS BY GENRE ===
🗺️  Adventure average: 3.98
🔍 Mystery average: 3.92
📚 Other average: 3.89
🌐 Overall average: 3.92


## 6. Save Processed Data for Next Steps

In [18]:
# Save the processed data for the following notebooks
output_path = '../data/processed'

# to_csv does not create missing directories; make sure the target exists so
# this cell also works on a fresh checkout (Restart Kernel -> Run All)
os.makedirs(output_path, exist_ok=True)

# Save book IDs for genres (one `book_id` column per file)
pd.DataFrame({'book_id': adventure_book_ids}).to_csv(f'{output_path}/adventure_book_ids.csv', index=False)
pd.DataFrame({'book_id': mystery_book_ids}).to_csv(f'{output_path}/mystery_book_ids.csv', index=False)

# Save the ratings and books tables as loaded (no rows or columns are altered in this notebook)
ratings.to_csv(f'{output_path}/clean_ratings.csv', index=False)
books.to_csv(f'{output_path}/clean_books.csv', index=False)

# Save summary statistics as a single-row CSV
summary_stats = {
    'total_books': books['book_id'].nunique(),
    'total_users': ratings['user_id'].nunique(),
    'total_ratings': len(ratings),
    'adventure_books': len(adventure_book_ids),
    'mystery_books': len(mystery_book_ids),
    'adventure_ratings': len(adventure_ratings),
    'mystery_ratings': len(mystery_ratings),
    'average_rating': ratings['rating'].mean(),
    'adventure_avg_rating': adventure_ratings['rating'].mean(),
    'mystery_avg_rating': mystery_ratings['rating'].mean()
}

pd.DataFrame([summary_stats]).to_csv(f'{output_path}/dataset_summary.csv', index=False)

print("✅ Data exploration complete!")
print(f"📁 Processed data saved to: {output_path}")
print("\n🔄 Next step: Run 02_baseline_svd.ipynb")

✅ Data exploration complete!
📁 Processed data saved to: ../data/processed

🔄 Next step: Run 02_baseline_svd.ipynb
