# Image Processing

### Downloading Book Cover Images Using Selenium

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import os
import time

# Load dataset from "processed_data.csv"
file_path = "processed_data.csv"  # Ensure this file exists
df = pd.read_csv(file_path)

# Directory to save images
image_dir = "book_covers"
os.makedirs(image_dir, exist_ok=True)

# Number of images to download
num_images = 339  # Adjust as needed

# Extract image URLs from the "Cover Image" column.
# The data is prepared before the browser is launched so that a missing file
# or column fails early, without leaving a Chrome process behind.
image_urls = df["Cover Image"].dropna().head(num_images).tolist()

# Set up Selenium options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Start WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# (index, url) pairs whose cover could not be saved
failed_downloads = []

try:
    # Download images using Selenium and save them directly.
    # The file name encodes the position of the URL among the non-null
    # "Cover Image" values: the idx-th URL is saved as "book_{idx}.png".
    for idx, url in enumerate(image_urls):
        try:
            driver.get(url)
            time.sleep(3)  # Wait for the page to load

            # Locate the image element
            image_element = driver.find_element("tag name", "img")

            # Save the image via Selenium
            image_path = os.path.join(image_dir, f"book_{idx}.png")
            image_element.screenshot(image_path)

        except Exception as exc:
            # Best effort: one bad URL must not stop the run, but record it
            # so missing covers can be explained afterwards.
            failed_downloads.append((idx, url))
            print(f"Skipping book_{idx}.png ({url}): {type(exc).__name__}")
finally:
    # Close the browser even if the loop is interrupted
    driver.quit()

print(f"Saved {len(image_urls) - len(failed_downloads)} of {len(image_urls)} covers")


We used Selenium to download book cover images from `processed_data.csv`, simulating real browsing to get past Cloudflare protection and dynamic loading. Running Chrome in headless mode, we stored the images in the `book_covers` directory, extracting 313 URLs from the "Cover Image" column. For each URL, we opened the page, located the first `<img>` element, and saved a screenshot of it as a PNG. Errors on individual URLs were caught so that one failed download did not stop the run. With the covers downloaded, we can now analyze them for classification using color extraction, contrast analysis, or machine learning.

### Analyzing Images and Extracting Key Features

Now, we will load the downloaded images and extract the following visual features:

- Mean Color: To determine the dominant colors in each cover.
- Contrast: To measure how clear and vibrant the cover is.
- Edge Complexity: To assess the level of detail and design complexity.

In [34]:
import cv2
import numpy as np
import pandas as pd
import os

# Folder that holds the downloaded cover screenshots
image_dir = "book_covers"
# Keep only the PNG entries, in the order os.listdir reports them
image_files = list(filter(lambda name: name.endswith(".png"), os.listdir(image_dir)))

# Function to extract visual features from images
def extract_visual_features(image_path):
    """Compute simple colour, contrast and edge statistics for one image.

    Args:
        image_path: Path of the image file to load with OpenCV.

    Returns:
        A tuple ``(mean_color, contrast, edge_complexity)`` where
        ``mean_color`` is the per-channel mean in RGB order, ``contrast`` is
        the standard deviation of the grayscale image, and
        ``edge_complexity`` is the mean of the Canny edge map computed with
        thresholds 100 and 200.

    Raises:
        ValueError: If OpenCV cannot read the file at ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Compute mean color (average over rows and columns, one value per channel)
    mean_color = np.mean(img, axis=(0, 1))

    # Convert the image to grayscale for contrast analysis
    gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    contrast = np.std(gray_img)

    # Use edge detection to measure design complexity
    edges = cv2.Canny(gray_img, 100, 200)
    edge_complexity = np.mean(edges)

    return mean_color, contrast, edge_complexity

# Build one (filename, mean colour, contrast, edge complexity) record per image
features = [
    (name, *extract_visual_features(os.path.join(image_dir, name)))
    for name in image_files
]

# Turn the records into a table with named columns
feature_columns = ["Filename", "Mean_Color", "Contrast", "Edge_Complexity"]
df_features = pd.DataFrame(features, columns=feature_columns)

# Persist the table, then read it back so later cells work from the CSV contents
features_csv = "book_cover_features.csv"
df_features.to_csv(features_csv, index=False)
df_features = pd.read_csv(features_csv)


### Classifying Covers as "Interesting" or "Not Interesting"

After extracting features, we classify a cover as "Interesting" if its contrast or its edge complexity exceeds a threshold, set to 80% of the median of that feature across all covers.

In [35]:
# Calculate thresholds for determining interesting covers
contrast_threshold = df_features["Contrast"].median()
edge_threshold = df_features["Edge_Complexity"].median()

# Adjust the threshold slightly to make classification more practical
contrast_threshold *= 0.8
edge_threshold *= 0.8

# Classify covers based on extracted features: a cover is interesting (1) when
# either feature exceeds its threshold, otherwise not (0). The comparison is
# done column-wise instead of with a per-row lambda, which is faster and also
# yields a well-formed (empty) column when there are no covers.
exceeds_contrast = df_features["Contrast"] > contrast_threshold
exceeds_edges = df_features["Edge_Complexity"] > edge_threshold
df_features["Interesting"] = (exceeds_contrast | exceeds_edges).astype(int)

# Save the classified covers to a new CSV file
df_features.to_csv("book_cover_features_with_classification.csv", index=False)


### Merging Classification Results with Original Book Data

Finally, we will add the new "Interesting" classification to the book data from processed_data.csv and save the result as processed_data_with_classification.csv.

In [None]:
# Load the original processed data
df_processed = pd.read_csv("processed_data.csv")

# Rebuild the file name each cover was saved under. The download step stored
# the i-th non-null "Cover Image" URL as "book_{i}.png", so the join key is the
# position among non-null URLs, not the basename of the URL (which never
# matches a "book_*.png" name and would leave every row unclassified).
cover_urls = df_processed["Cover Image"].dropna()
df_processed["Filename"] = pd.Series(
    [f"book_{i}.png" for i in range(len(cover_urls))],
    index=cover_urls.index,
    dtype=object,  # keep a string key even when there are no covers at all
)

# Merge the datasets using the filename as the key. Rows without a cover URL,
# or whose cover was never downloaded, keep a missing "Interesting" value.
df_processed = df_processed.merge(df_features[["Filename", "Interesting"]], on="Filename", how="left")

# Remove the redundant filename column
df_processed.drop(columns=["Filename"], inplace=True)

# Save the updated dataset
df_processed.to_csv("processed_data_with_classification.csv", index=False)
