In [None]:
import kagglehub

# Download the latest version of the dataset. The value returned by
# dataset_download is kept in `path` and is used as the location of the files.
path = kagglehub.dataset_download("dylanjcastillo/7k-books-with-metadata")

# Print the location so it can be checked by eye.
print("Path to dataset files:", path)

In [None]:
import pandas as pd
import numpy as np
import time
import requests

In [None]:
# Load the raw table. ISBN-10 is an identifier, not a number: reading it as text
# keeps leading zeros and a trailing "X" check digit intact and rules out the
# mixed int/str column that chunked type inference can otherwise produce.
books = pd.read_csv(f"{path}/books.csv", dtype={"isbn10": str})

In [None]:
# Preview the first rows of the raw table (head() defaults to five rows).
books.head()

In [None]:
# Per-column descriptive statistics (one row per column of `books`), extended
# with the number of missing and non-missing entries in each column.
summary = (
    books.describe(include="all")
    .T
    .assign(
        missing_values=books.isna().sum(),
        non_missing_values=books.notna().sum(),
    )
)
summary

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Visualise where values are missing. Because the NA mask is transposed, each
# heatmap row is a column of `books` and each heatmap column is one book record:
# the x-axis therefore runs over rows of the table and the y-axis lists its columns.
fig, ax = plt.subplots()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)

ax.set_xlabel("Row (book record)")
ax.set_ylabel("Column")
ax.set_title("Missing values")

plt.show()

In [None]:
# Two helper features: a 0/1 flag marking rows without a description, and the
# age of each book measured from the year 2025.
books['missing_description'] = books['description'].isna().astype(int)
books['age_of_book'] = 2025 - books['published_year']

# Spearman (rank) correlation between the numeric fields and the missing flag.
columns_of_interest = ["num_pages", "age_of_book", "missing_description", "average_rating"]
correlation_matrix = books.loc[:, columns_of_interest].corr(method='spearman')

sns.set_theme(style='white')
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    cbar_kws={'label': "Spearman correlation"},
    ax=ax,
)
heatmap.set_title("Correlation matrix")
plt.show()

Filling in missing book descriptions via the Google Books API

In [None]:
def fetch_description(isbn):
    """Look up a book description on the Google Books API by ISBN.

    Parameters
    ----------
    isbn : str
        ISBN to search for; it is sent as the ``isbn:<value>`` query term.

    Returns
    -------
    str or None
        The ``description`` of the first matching volume. ``None`` when nothing
        matches, when the first match carries no description, or when the
        request fails (in which case the error is printed).
    """
    url = "https://www.googleapis.com/books/v1/volumes"
    try:
        # Pass the query through `params` so requests URL-encodes it.
        response = requests.get(url, params={"q": f"isbn:{isbn}"}, timeout=5)
        # An error response (e.g. rate limiting) may still carry a JSON body
        # that simply lacks "items"; raise so it is reported as an error
        # instead of being mistaken for "no description available".
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: network, HTTP and JSON-decoding problems are reported
        # and treated as "nothing found" so a bulk backfill keeps going.
        print(f"Error fetching ISBN {isbn}: {e}")
        return None

    items = data.get("items") if isinstance(data, dict) else None
    if items:
        volume_info = items[0].get("volumeInfo", {})
        return volume_info.get("description")
    return None


# Try to backfill every row whose description is missing. The selection is
# taken once up front, so writes into `books` do not affect the iteration.
rows_without_desc = books.loc[books['description'].isna(), ['isbn10', 'title']]
for i, (isbn, title) in rows_without_desc.iterrows():
    desc = fetch_description(isbn)
    if not desc:
        print(f"No description found for ISBN {isbn} and title {title}")
    else:
        books.at[i, 'description'] = desc
        print(f"Filled description for ISBN {isbn} and title {title}")
    # Short pause between consecutive API requests.
    time.sleep(0.1)

In [None]:
# Count the missing values in each column.
books.isna().sum()

In [None]:
# Collect every row that lacks a description, page count, rating or publication year.
books_missing = books[
    books['description'].isna()
    | books['num_pages'].isna()
    | books['average_rating'].isna()
    | books['published_year'].isna()
]

# Drop those rows (matched by ISBN-10). The explicit copy makes `books` an
# independent frame, so adding columns to it in later cells does not trigger
# SettingWithCopyWarning.
books = books[~books['isbn10'].isin(books_missing['isbn10'])].copy()

In [None]:
# Count the missing values in each column again.
books.isna().sum()

Distribution of book categories

In [None]:
# Count books per category, most frequent first. The index and the count column
# are named explicitly so the result has "categories"/"count" columns on
# pandas 1.x as well as 2.x (a bare reset_index() only yields a "count" column
# from pandas 2.0 onwards, so sorting by it would fail on older versions).
(
    books['categories']
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

Number of words in description

In [None]:
# Split each description on whitespace and store the number of resulting tokens.
description_tokens = books['description'].str.split()
books["words_in_desc"] = description_tokens.str.len()

In [None]:
# Histogram of description lengths (in words), drawn on an explicit Axes.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

sns.histplot(books['words_in_desc'], bins=100, ax=ax)
ax.set(
    xlabel="Number of words in description",
    ylabel="Number of books",
    title="Distribution of description lengths",
)
plt.show()


In [None]:
# Inspect descriptions that are 1 to 4 words long (both bounds included).
is_1_to_4_words = books['words_in_desc'].between(1, 4)
books.loc[is_1_to_4_words, "description"]

In [None]:
# Inspect descriptions that are 5 to 14 words long (both bounds included).
is_5_to_14_words = books['words_in_desc'].between(5, 14)
books.loc[is_5_to_14_words, "description"]

In [None]:
# Inspect descriptions that are 15 to 24 words long (both bounds included).
is_15_to_24_words = books['words_in_desc'].between(15, 24)
books.loc[is_15_to_24_words, "description"]

In [None]:
# Inspect descriptions that are 25 to 34 words long (both bounds included).
is_25_to_34_words = books['words_in_desc'].between(25, 34)
books.loc[is_25_to_34_words, "description"]

In [None]:
# Keep only books whose description has at least 25 words. The explicit copy
# makes `books` an independent frame, so the columns added in later cells are
# not assigned onto a slice (which would trigger SettingWithCopyWarning).
books = books[books['words_in_desc'] >= 25].copy()

In [None]:
# Number of rows currently in `books`.
books.shape[0]

Merging title and subtitle

In [None]:
# Use the bare title when there is no subtitle; otherwise join the two as
# "title: subtitle".
title_text = books["title"].astype(str)
subtitle_text = books["subtitle"].astype(str)
books["title_and_subtitle"] = np.where(
    books["subtitle"].isna(),
    books["title"],
    title_text + ": " + subtitle_text,
)

In [None]:
# Display the frame to inspect its current contents.
books

Tagged description: prefixing each description with its ISBN-13

In [None]:
# Prefix every description with its ISBN-13, separated by a single space.
isbn13_text = books['isbn13'].astype(str)
books['tagged_description'] = isbn13_text + " " + books['description'].astype(str)

In [None]:
# Display the frame to inspect its current contents.
books

Dropping helper columns and saving the cleaned dataset

In [None]:
# Remove the subtitle and the three helper columns, then write the result to
# CSV without the index.
columns_to_drop = ['subtitle', 'missing_description', "age_of_book", "words_in_desc"]
books_cleaned = books.drop(columns=columns_to_drop)
books_cleaned.to_csv("books_cleaned.csv", index=False)