In [17]:
import math
import re

import pandas as pd

In [18]:
# Location of the raw book-summary CSV, relative to this notebook.
INPUT_PATH = '../dataset/book_summary.csv'
# Read the raw data into a DataFrame; later cells derive df_clean from it.
df = pd.read_csv(filepath_or_buffer=INPUT_PATH)

In [24]:
# Row-validation helper functions

def is_valid_id(value):
    """Return True when ``value`` is usable as a record id.

    Accepted values:
      * ``int`` instances (but not ``bool``, which is a subclass of ``int``);
      * finite ``float`` instances;
      * strings made up only of digit characters (``str.isdigit``).

    Everything else is rejected, including ``None``, the empty string and
    NaN / infinite floats.

    Args:
        value: The raw cell value taken from the ``id`` column.

    Returns:
        bool: Whether the value is an acceptable id.
    """
    # bool passes isinstance(..., int); True/False are not meaningful ids.
    if isinstance(value, bool):
        return False
    if isinstance(value, int):
        return True
    if isinstance(value, float):
        # A missing cell is float('nan'), which is still a float instance,
        # so the type check alone would let rows without an id through.
        return math.isfinite(value)
    return isinstance(value, str) and value.isdigit()

# A title is valid when it is a string containing at least one ASCII letter or digit
def is_valid_title(title):
    """Tell whether ``title`` is a string with at least one ASCII letter or digit.

    Args:
        title: The raw cell value taken from the ``title`` column.

    Returns:
        bool: False for non-string values and for strings that contain no
        character in ``[A-Za-z0-9]``; True otherwise.
    """
    # Short-circuit on non-strings so the regex only ever sees text.
    return isinstance(title, str) and (
        re.search(r"[A-Za-z0-9]", title.strip()) is not None
    )

# Define a function to check if author is valid
def is_valid_author(author):
    """Return True when ``author`` looks like a plain personal name.

    A valid author is a string that, after stripping surrounding whitespace:
      * does not start with ``{`` and contains neither ``:`` nor ``/``
        (values that look like JSON/dict dumps or path-like identifiers);
      * consists only of ASCII letters, whitespace, ``.``, ``'`` and ``-``;
      * contains at least one ASCII letter.

    NOTE(review): the allowed character set is ASCII-only, so names with
    accented or non-Latin letters are rejected - confirm this is intended.

    Args:
        author: The raw cell value taken from the ``author`` column.

    Returns:
        bool: Whether the value is an acceptable author name.
    """
    if not isinstance(author, str):
        return False
    author = author.strip()

    # Reject if it looks like a JSON/dictionary dump or a path-like identifier.
    if author.startswith("{") or ":" in author or "/" in author:
        return False

    # Only name-like characters are allowed anywhere in the value.
    if not re.fullmatch(r"[A-Za-z\s\.\'\-]+", author):
        return False

    # The character class alone also matches punctuation-only values such as
    # "-" or "...", so additionally require at least one actual letter.
    return bool(re.search(r"[A-Za-z]", author))

In [25]:
# Apply the filter

# Run the validators one column at a time, in the order id -> title -> author;
# each pass keeps only the rows whose value the validator accepts.
df_clean = df
for column, validator in (
    ('id', is_valid_id),
    ('title', is_valid_title),
    ('author', is_valid_author),
):
    df_clean = df_clean[df_clean[column].apply(validator)]

In [26]:
# Discard rows whose 'genres' value is missing or blank.
genres_present = df_clean['genres'].notna()  # False for NaN cells
df_clean = df_clean[genres_present]
genres_nonblank = df_clean['genres'].str.strip().ne('')  # False for empty / whitespace-only strings
df_clean = df_clean[genres_nonblank]

In [27]:
# Non-null cell count per column of the cleaned frame.
df_clean.count(axis="index")

id              11705
title           11705
author          11705
publish_year     9019
genres          11705
summary         11705
dtype: int64

In [28]:
# Destination of the cleaned dataset, relative to this notebook.
OUTPUT_PATH = "../dataset/preprocessed_data.csv"

# Write the cleaned frame as UTF-8 CSV, leaving out the DataFrame index.
df_clean.to_csv(path_or_buf=OUTPUT_PATH, encoding='utf-8', index=False)

# Report how many rows made it through all of the filters.
print(f"Remaining rows after dropping missing data: {len(df_clean)}")

Remaining rows after dropping missing data: 11705
