# Step 2: Transformation (The Refiner)

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import ftfy
import os

# Input (enriched raw export) and output (cleaned table) locations, relative to
# the notebook's working directory.
INPUT_PATH = '../data/books_raw_enriched.csv'
OUTPUT_PATH = '../data/books_cleaned.csv'

# Read ISBNs as text: they are identifiers, not numbers. Letting pandas infer a
# numeric dtype would strip leading zeros and, when values are missing, turn the
# column into floats that get written back out with a trailing ".0".
df = pd.read_csv(INPUT_PATH, dtype={'clean_isbn': str})
print(f"Loaded {len(df)} rows.")

Loaded 36358 rows.


**Cleaning Descriptions**

In [6]:
# Tags that render as a line or paragraph break. Their text must stay separated
# from neighbouring text once the markup is stripped.
BLOCK_LEVEL_TAGS = [
    'br', 'p', 'div', 'li', 'ul', 'ol', 'tr', 'td', 'th', 'blockquote',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
]


def clean_description(text, min_length=5):
    """Return a plain-text version of a raw description, or None if unusable.

    The value is passed through ftfy to repair mojibake, stripped of HTML
    markup, and has all runs of whitespace collapsed to single spaces.

    Parameters
    ----------
    text : object
        Raw description value. Anything pandas treats as missing yields None;
        other values are converted with ``str`` before cleaning.
    min_length : int, default 5
        Cleaned strings shorter than this many characters are discarded.

    Returns
    -------
    str or None
        The cleaned description, or None when the input is missing, the
        cleaned text is shorter than ``min_length``, or it contains the
        placeholder phrase "description not available" (case-insensitive).
    """
    if pd.isna(text):
        return None

    # Fix encoding (mojibake)
    text = ftfy.fix_text(str(text))

    # Remove HTML. get_text() simply concatenates text nodes, so
    # "<p>One.</p><p>Two.</p>" would become "One.Two." and "a<br>b" would
    # become "ab". Pad block-level tags with spaces first; inline tags
    # (<b>, <i>, ...) are left alone so punctuation after them is not split off.
    soup = BeautifulSoup(text, 'html.parser')
    for tag in soup.find_all(BLOCK_LEVEL_TAGS):
        tag.insert_before(' ')
        tag.insert_after(' ')
    text = soup.get_text()

    # Collapse every run of whitespace (including the padding added above)
    # to a single space and trim both ends.
    text = ' '.join(text.split())

    # Handle "Description not available" or short/empty strings
    if len(text) < min_length or "description not available" in text.lower():
        return None

    return text

# Run the cleaner over every raw description; entries it rejects come back as None.
df['clean_description'] = df['description'].map(clean_description)

**Filtering Valid Data**

In [None]:
# Remember the starting size so the report below can show how many rows were removed.
initial_len = len(df)

# Keep only rows that still have a usable description, then keep the first row
# encountered for each ISBN.
# NOTE(review): drop_duplicates treats missing values as equal to each other, so
# rows with a null clean_isbn (if any exist) collapse into a single row here.
# Confirm whether such rows should instead be kept or dropped entirely.
df_clean = (
    df
    .dropna(subset=['clean_description'])
    .drop_duplicates(subset=['clean_isbn'], keep='first')
)

print(f"Rows before: {initial_len}, after filtering and deduplication: {len(df_clean)}")

Rows before: 36358, after filtering and deduplication: 10921


**Saving Cleaned Data**

In [9]:
# Persist the refined table without the pandas index so the file contains data columns only.
df_clean.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"Saved cleaned data to {OUTPUT_PATH}")

Saved cleaned data to ../data/books_cleaned.csv
