In [None]:
import kagglehub

import pandas as pd
import numpy as np

import warnings
# Silence every warning category for the rest of the session to keep cell output quiet.
# NOTE(review): this also hides deprecation and pandas chained-assignment warnings,
# so genuine problems will not surface in the output — consider narrowing the filter.
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Fetch the Kaggle dataset through kagglehub; `path` is whatever location
# dataset_download() returns and is used below to find books.csv.
DATASET_HANDLE = "dylanjcastillo/7k-books-with-metadata"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
# Read the raw book metadata from the downloaded dataset directory.
books_csv = f"{path}/books.csv"
books = pd.read_csv(books_csv)

# Display the frame as the cell output for a first look at the data.
books

In [None]:
# Visualise where values are missing.
# books.isna() is transposed before plotting, so each dataset column becomes one
# horizontal band on the y-axis and each book record one position on the x-axis.
# The axis labels below reflect that orientation (the earlier labels named the
# x-axis "Columns", which did not match the transposed layout).
fig, ax = plt.subplots()
sns.heatmap(books.isna().transpose(), cbar=False, ax=ax)

ax.set_xlabel("Rows (book records)")
ax.set_ylabel("Columns")
ax.set_title("Missing Values")

plt.show()

In [None]:
# Indicator column: 1 where the description is missing, 0 where it is present.
books["missing_description"] = books["description"].isna().astype(int)

# Age of each book relative to a fixed reference year; a missing published_year
# propagates as a missing age.
REFERENCE_YEAR = 2025
books["age_of_book"] = REFERENCE_YEAR - books["published_year"]

In [None]:
# Numeric features whose pairwise rank correlation we want to inspect.
columns_of_interest = [
    "num_pages",
    "age_of_book",
    "missing_description",
    "average_rating",
]

# Spearman (rank-based) correlation between the selected columns.
correlation_matrix = books.loc[:, columns_of_interest].corr(method="spearman")

# Annotated heatmap of the correlation matrix on an 8x6 inch figure.
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar_kws={"label": "Spearman correlation"},
    ax=ax,
)
heatmap.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Show every row in which at least one of the four key fields is missing.
key_fields = ["description", "average_rating", "num_pages", "published_year"]
books[books[key_fields].isna().any(axis=1)]

In [None]:
# Keep only the rows where all four key fields are present.
# NOTE: despite its name, `books_missing` holds the rows WITHOUT missing values
# in these fields.
# .copy() makes the result an independent frame, so columns added to it later
# neither touch `books` nor rely on pandas' chained-assignment behaviour.
books_missing = books.dropna(
    subset=["description", "average_rating", "num_pages", "published_year"]
).copy()

In [None]:
# Inspect the rows kept after filtering on the four key fields.
books_missing

In [None]:
# Summary statistics of the filtered frame.
books_missing.describe()

In [None]:
# Number of books per category, most frequent first.
# The index and the counts are named explicitly so that the "count" column
# exists on every pandas version: before pandas 2.0, a plain
# value_counts().reset_index() produced no "count" column and the sort below
# would fail with a KeyError.
(
    books_missing["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [None]:
# Tally books per category as a two-column frame: "category" and "count".
categories_df = (
    books_missing["categories"]
    .value_counts()
    .rename_axis("category")
    .reset_index(name="count")
)

# Largest categories first.
sorted_categories = categories_df.sort_values(by="count", ascending=False)

# Interactive bar chart of the counts.
fig = px.bar(
    data_frame=sorted_categories,
    x="category",
    y="count",
    title="Count of Books per Category",
    template="plotly_white",
)

# Treat the x-axis as categorical, tilt the tick labels, and add a range slider
# so the reader can scroll horizontally through the categories.
fig.update_xaxes(
    title_text="Category",
    tickangle=-45,
    type="category",
    rangeslider_visible=True,
)
fig.update_yaxes(title_text="Count")

fig.show()

In [None]:
# Word count per description: split each text on whitespace, then take the
# length of the resulting token list.
description_tokens = books_missing["description"].str.split()
books_missing["words_in_description"] = description_tokens.str.len()

In [None]:
# Inspect the frame again, now including the words_in_description column.
books_missing

In [None]:
# Peek at descriptions that are 1 to 4 words long (both bounds inclusive);
# reset_index() turns the original row index into a regular column.
has_1_to_4_words = books_missing["words_in_description"].between(1, 4)
books_missing.loc[has_1_to_4_words, "description"].reset_index()

In [None]:
# Peek at descriptions that are 5 to 14 words long (both bounds inclusive);
# reset_index() turns the original row index into a regular column.
has_5_to_14_words = books_missing["words_in_description"].between(5, 14)
books_missing.loc[has_5_to_14_words, "description"].reset_index()

In [None]:
# Peek at descriptions that are 15 to 24 words long (both bounds inclusive);
# reset_index() turns the original row index into a regular column.
has_15_to_24_words = books_missing["words_in_description"].between(15, 24)
books_missing.loc[has_15_to_24_words, "description"].reset_index()

In [None]:
# Peek at descriptions that are 25 to 34 words long (both bounds inclusive);
# reset_index() turns the original row index into a regular column.
has_25_to_34_words = books_missing["words_in_description"].between(25, 34)
books_missing.loc[has_25_to_34_words, "description"].reset_index()

In [None]:
# Keep only the books whose description has at least 25 words.
# .copy() yields an independent frame, so the columns added to it afterwards
# are written to this frame alone and do not rely on pandas'
# chained-assignment behaviour for filtered slices.
book_missing_25_words = books_missing.loc[
    books_missing["words_in_description"] >= 25
].copy()

In [None]:
# Display the 25+-word subset with a fresh index; the previous index becomes a
# regular column. The result is not assigned, so the frame itself is unchanged.
book_missing_25_words.reset_index()

In [None]:
# Build a display title: "title: subtitle" when a subtitle exists, otherwise
# the bare title.
has_subtitle = book_missing_25_words["subtitle"].notna()
title_with_subtitle = (
    book_missing_25_words["title"].astype(str)
    + ": "
    + book_missing_25_words["subtitle"].astype(str)
)
book_missing_25_words["title_and_subtitle"] = np.where(
    has_subtitle,
    title_with_subtitle,
    book_missing_25_words["title"],
)

In [None]:
# Inspect the subset after adding the title_and_subtitle column.
book_missing_25_words

In [None]:
# Prefix every description with its ISBN-13, separated by ": ", so each text
# carries the identifier of the book it belongs to.
isbn_text = book_missing_25_words["isbn13"].astype(str)
description_text = book_missing_25_words["description"].astype(str)
book_missing_25_words["tagged_description"] = isbn_text + ": " + description_text

In [None]:
# Inspect the subset after adding the tagged_description column.
book_missing_25_words

In [None]:
# Remove the helper columns created during exploration (plus the raw subtitle)
# and write the cleaned dataset to disk without the index column.
helper_columns = ["subtitle", "missing_description", "age_of_book", "words_in_description"]
book_cleaned = book_missing_25_words.drop(columns=helper_columns)
book_cleaned.to_csv("book_cleaned.csv", index=False)