In [1]:
import pandas as pd
import numpy as np

# Load the data again
DATA_PATH = '../data/raw/'
books_df = pd.read_csv(f'{DATA_PATH}books.csv')
ratings_df = pd.read_csv(f'{DATA_PATH}ratings.csv')
tags_df = pd.read_csv(f'{DATA_PATH}tags.csv')
book_tags_df = pd.read_csv(f'{DATA_PATH}book_tags.csv')

print("Data loaded successfully!")

Data loaded successfully!


In [2]:
# Merge book_tags with tags to get tag names
book_tags_with_names = book_tags_df.merge(tags_df, on='tag_id')

# Group by book and combine the tags into a single string
# We'll sort by count to get the most important tags first
def combine_tags(group):
    return ' '.join(group.sort_values('count', ascending=False)['tag_name'])

book_content = book_tags_with_names.groupby('goodreads_book_id').apply(combine_tags).reset_index()
book_content.rename(columns={0: 'tags'}, inplace=True)

print("Combined tags for each book:")
display(book_content.head())

Combined tags for each book:


  book_content = book_tags_with_names.groupby('goodreads_book_id').apply(combine_tags).reset_index()


Unnamed: 0,goodreads_book_id,tags
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read currently-reading fantasy favorites ch...
2,3,to-read favorites fantasy currently-reading yo...
3,5,favorites fantasy currently-reading young-adul...
4,6,fantasy young-adult fiction harry-potter owned...


In [3]:
# The 'goodreads_book_id' in book_tags corresponds to 'book_id' in books_df
books_with_content = books_df.merge(book_content, left_on='book_id', right_on='goodreads_book_id')

# Let's create the final 'content' string for each book
# We'll fill missing values with empty strings to avoid errors
books_with_content['authors'].fillna('', inplace=True)
books_with_content['tags'].fillna('', inplace=True)

books_with_content['content'] = books_with_content['title'] + ' ' + books_with_content['authors'] + ' ' + books_with_content['tags']

print("Final DataFrame with content column:")
display(books_with_content[['book_id', 'title', 'content']].head())

Final DataFrame with content column:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books_with_content['authors'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books_with_content['tags'].fillna('', inplace=True)


Unnamed: 0,book_id,title,content
0,2767052,"The Hunger Games (The Hunger Games, #1)","The Hunger Games (The Hunger Games, #1) Suzann..."
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter and the Sorcerer's Stone (Harry P...
2,41865,"Twilight (Twilight, #1)","Twilight (Twilight, #1) Stephenie Meyer young-..."
3,2657,To Kill a Mockingbird,To Kill a Mockingbird Harper Lee classics favo...
4,4671,The Great Gatsby,The Great Gatsby F. Scott Fitzgerald classics ...


In [4]:
import os

# Create the processed directory if it doesn't exist
if not os.path.exists('../data/processed'):
    os.makedirs('../data/processed')

# Save the books dataframe with the new content column
books_with_content.to_csv('../data/processed/books_with_content.csv', index=False)

# Save the original ratings dataframe (it's already in the right format)
ratings_df.to_csv('../data/processed/ratings.csv', index=False)

print("✅ Processed data saved to 'data/processed/' directory!")

✅ Processed data saved to 'data/processed/' directory!
