In [1]:
import os

import pandas as pd

from sklearn.impute import SimpleImputer

# Disable truncation when displaying DataFrames: show every column and every row
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Books table

# Books Table

This notebook contains exploratory data analysis (EDA) and preprocessing steps for the `books` table.

The main preprocessing tasks include:
- Cleaning and formatting the `price` and `pages` columns
- Calculating the age of each book based on its publication year

Other columns (such as authors, publisher and categories) are excluded from modeling due to high cardinality and limited feature utility.


## Load books

In [2]:
# Read the raw books table from CSV and preview its first five rows
file_path = 'data/books.csv'
books_df = pd.read_csv(filepath_or_buffer=file_path)
books_df.head(n=5)

Unnamed: 0,id,title,authors,publisher,publishedDate,categories,price,pages
0,hVFwAAAAQBAJ,Ogilvy on Advertising,['David Ogilvy'],Vintage,2013-09-11,['Social Science'],72.99,320
1,bRY9AAAAYAAJ,Foreign Publications for Advertising American ...,['United States. Bureau of Foreign and Domesti...,,1913,['Advertising'],469.99,654
2,ZapAAAAAIAAJ,Advertising and the Public Interest,"['John A. Howard', 'James Hulbert']",,1973,['Advertising'],372.0,784
3,A-HthMfF5moC,Profitable Advertising,,,1894,['Advertising'],240.99USD,559
4,4Z9JAAAAMAAJ,Report of the Federal Trade Commission on Dist...,['United States. Federal Trade Commission'],,1944,['Government publications'],539.0,757


In [3]:
# Check data types and number of missing values:
# prints each column with its non-null count and dtype
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             240 non-null    object
 1   title          240 non-null    object
 2   authors        173 non-null    object
 3   publisher      94 non-null     object
 4   publishedDate  238 non-null    object
 5   categories     201 non-null    object
 6   price          238 non-null    object
 7   pages          240 non-null    object
dtypes: object(8)
memory usage: 15.1+ KB


## Analyze authors, publishers and categories

In [4]:
# Count the distinct publisher values (missing entries are ignored).
# The cardinality is very high, so publisher is not a good feature candidate.
books_df['publisher'].nunique(dropna=True)

64

In [5]:
# Count the distinct `authors` values (missing entries are ignored).
# Each value appears to be a stringified list of names, so this counts distinct
# author lists rather than individual authors.
# The cardinality is too high, so authors is not a good feature candidate.
books_df['authors'].nunique(dropna=True)

171

In [6]:
# Count the distinct `categories` values (missing entries are ignored).
# Each value appears to be a stringified list, so this counts distinct category
# lists rather than individual categories.
# The cardinality is too high, so categories is not a good feature candidate.
books_df['categories'].nunique(dropna=True)

87

## Analyze prices and pages

In [7]:
# Format price and pages: cast price to float and pages to int.
# Some raw values carry a non-numeric suffix (e.g. "240.99USD"), so the first
# numeric token is extracted before casting.
# expand=False makes str.extract return a Series, which is what a single
# column assignment expects.
books_df['price'] = books_df['price'].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)
# Pages are whole numbers: capture only the leading integer digits so the int
# cast cannot fail on a decimal string such as "320.5".
books_df['pages'] = books_df['pages'].str.extract(r'(\d+)', expand=False).astype(int)

# Record which prices are missing *before* they are filled in
books_df['price_imputed'] = books_df['price'].isna()

# Impute missing prices with the median price
imp_median = SimpleImputer(strategy='median')
books_df[['price']] = imp_median.fit_transform(books_df[['price']])

## Analyze publish dates

In [8]:
# Summarise dtypes and non-null counts again to check the current state of the frame
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             240 non-null    object 
 1   title          240 non-null    object 
 2   authors        173 non-null    object 
 3   publisher      94 non-null     object 
 4   publishedDate  238 non-null    object 
 5   categories     201 non-null    object 
 6   price          240 non-null    float64
 7   pages          240 non-null    int64  
 8   price_imputed  240 non-null    bool   
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 15.4+ KB


In [None]:
# Calculate book age relative to the reference year and impute missing or
# invalid ages with the median.
# NOTE: `publishedDate` is overwritten with the publication year below, so
# re-running this cell without reloading the data would re-parse the numeric
# years instead of the original date strings.
REFERENCE_YEAR = 2019
OUTPUT_PATH = 'data_preprocessed/books_processed.csv'

# Unparseable dates are coerced to NaT, which leaves their year missing
books_df['publishedDate'] = pd.to_datetime(books_df['publishedDate'], errors='coerce', format='mixed').dt.year
books_df['book_age'] = REFERENCE_YEAR - books_df['publishedDate']

# Flag rows whose age is missing or negative (published after the reference year)
books_df['book_age_imputed'] = books_df['book_age'].isna() | (books_df['book_age'] < 0)

# Blank out the flagged ages so they do not influence the median, then fill them with it
books_df.loc[books_df['book_age_imputed'], 'book_age'] = None
imp_median = SimpleImputer(strategy='median')
books_df[['book_age']] = imp_median.fit_transform(books_df[['book_age']])

# Save preprocessed books dataset (the DataFrame index is written as the first column).
# Create the output directory first so the save works on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
books_df.to_csv(OUTPUT_PATH)