In [8]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns  
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt

# Scikit-Learn modules for machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Load the raw book records from the CSV file into a DataFrame.
books_csv_path = 'data/book_cover/books_data.csv'
books_data_df = pd.read_csv(books_csv_path)

In [10]:
# Check the structure of the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
books_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB
None


In [11]:
# Dataset shape and a count of columns per dtype
print("Dataset Shape:", books_data_df.shape)
print("\nData Types:")
print(books_data_df.dtypes.value_counts())

# Count unique values in each object-typed column
categorical_columns = books_data_df.select_dtypes(include='object').columns
for col in categorical_columns:
    print(f"{col}: {books_data_df[col].nunique()} unique values")

# Summary of numerical columns.
# Kept as the LAST expression of the cell on purpose: a notebook only
# auto-displays the value of a cell's final expression, so a describe() call
# placed mid-cell is computed and then silently thrown away.
books_data_df.describe()

Dataset Shape: (212404, 10)

Data Types:
object     9
float64    1
Name: count, dtype: int64
Title: 212403 unique values
description: 133226 unique values
authors: 127278 unique values
image: 149387 unique values
previewLink: 188099 unique values
publisher: 16016 unique values
publishedDate: 11582 unique values
infoLink: 184506 unique values
categories: 10883 unique values


In [6]:
# Tally the missing entries in each column and show the per-column totals
missing_per_column = books_data_df.isna().sum()
print(missing_per_column)

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
dtype: int64


In [16]:
# Keep only the rows that have a value in both 'image' and 'ratingsCount'.
# The result is rebound to the same name instead of using inplace=True, so the
# cell reads as an explicit transformation and later cells still see the
# filtered frame under books_data_df.
books_data_df = books_data_df.dropna(how='any', subset=['image', 'ratingsCount'])
print("\nDataFrame after dropping rows with nulls in 'image' & 'ratingsCount':")
# info() prints its own report and returns None; wrapping it in print() would
# add a stray "None" line to the output.
books_data_df.info()


DataFrame after dropping rows with nulls in 'image' & 'ratingsCount':
<class 'pandas.core.frame.DataFrame'>
Index: 47127 entries, 5 to 212402
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Title          47126 non-null  object 
 1   description    44217 non-null  object 
 2   authors        46503 non-null  object 
 3   image          47127 non-null  object 
 4   previewLink    47127 non-null  object 
 5   publisher      42643 non-null  object 
 6   publishedDate  46991 non-null  object 
 7   infoLink       47127 non-null  object 
 8   categories     45976 non-null  object 
 9   ratingsCount   47127 non-null  float64
dtypes: float64(1), object(9)
memory usage: 4.0+ MB
None


In [17]:
# Preview the dataset.
# Ending the cell with the bare expression lets the notebook render the frame
# as a table; print() forces the plain-text repr, which wraps wide frames
# across several backslash-continued chunks.
books_data_df.head()

                                                Title  \
5   The Church of Christ: A Biblical Ecclesiology ...   
31  Voices from the Farm: Adventures in Community ...   
33                            The Battleship Bismarck   
35                                     Mini-mysteries   
42                            Tess and the Highlander   

                                          description               authors  \
5   In The Church of Christ: A Biblical Ecclesiolo...  ['Everett Ferguson']   
31  Twenty-five years ago, at the height of the co...       ['Rupert Fike']   
33  The Bismarck is perhaps the most famous – and ...  ['Stefan Draminski']   
35                                                NaN                   NaN   
42  In 1543, on a windswept isle off of Scotland, ...    ['May Mcgoldrick']   

                                                image  \
5   http://books.google.com/books/content?id=kVqRa...   
31  http://books.google.com/books/content?id=IjTAB...   
33  http://