In [1]:
!pip install pandas tensorflow tensorflow-hub tensorflow-text



In [2]:
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text
import requests
import re

In [3]:
import tensorflow as tf

# Ask TensorFlow which physical GPUs it can see and report the outcome.
gpu_devices = tf.config.list_physical_devices('GPU')
if not gpu_devices:
    print("❌ No GPU detected. Running on CPU.")
else:
    print(f"✅ GPU detected! Using: {gpu_devices[0]}")
    # Declare every detected GPU as visible to the TensorFlow runtime.
    tf.config.set_visible_devices(gpu_devices, 'GPU')



✅ GPU detected! Using: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [4]:
# Make the detected GPUs visible to TensorFlow, and only claim GPU execution
# when at least one GPU was actually found by the detection cell.
if gpu_devices:
    tf.config.set_visible_devices(gpu_devices, 'GPU')
    print("✅ TensorFlow now running on GPU!")
else:
    # With an empty device list there is nothing to enable; say so instead of
    # printing a misleading success message.
    print("❌ No GPU available. TensorFlow is running on CPU.")

✅ TensorFlow now running on GPU!


In [5]:
# Read the book table (the later cells use its "title" column) from the CSV file.
# The path points at the Colab working directory; adjust it for other setups.
df = pd.read_csv(filepath_or_buffer="/content/books_isbn_title.csv")

In [10]:
# Define genre mapping using keywords
GENRE_KEYWORDS = {
    "fantasy": ["magic", "dragon", "wizard", "elf", "fairy", "sorcery", "spell"],
    "science fiction": ["space", "robot", "alien", "cyberpunk", "dystopian"],
    "mystery": ["detective", "murder", "crime", "thriller", "suspense"],
    "romance": ["love", "romance", "affair", "passion", "relationship"],
    "horror": ["ghost", "vampire", "werewolf", "haunted", "supernatural"],
    "historical": ["history", "biography", "war", "medieval", "ancient"],
    "non-fiction": ["philosophy", "science", "self-help", "memoir", "psychology"],
    "adventure": ["journey", "expedition", "exploration", "survival"],
    "children's books": ["kids", "storybook", "nursery", "fairytale"],
    "humor": ["comedy", "satire", "parody", "funny"],
    "drama": ["family", "tragedy", "emotion", "conflict"],
    "self-improvement": ["motivation", "personal development", "growth"]
}

In [11]:
# Normalise titles in place: lowercase first, then strip every character that
# is not a lowercase ASCII letter, a digit, or whitespace.
df["title"] = (
    df["title"]
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)

In [12]:
# Fetch the Universal Sentence Encoder (version 4) from TF Hub.
# No device is specified here, so placement is left to TensorFlow.
use_model = hub.load(handle="https://tfhub.dev/google/universal-sentence-encoder/4")

In [13]:
import tensorflow as tf

# 1. EMBED THE GENRE LABELS A SINGLE TIME
# Only the dictionary keys (the genre names) are embedded; the keyword lists
# stored as values are not used in this step.
GENRE_NAMES = [genre_name for genre_name in GENRE_KEYWORDS]
# One embedding row per genre name (the original note gives the shape as [12, 512]).
genre_embeddings = use_model(GENRE_NAMES)

# 2. BATCH PROCESSING FUNCTION (Compiled for GPU)
@tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
def _classify_batch_graph(titles_tensor):
    """Graph-compiled core: map a 1-D string tensor of titles to genre indices.

    The fixed input signature means the function is traced once and reused for
    every batch length, instead of being re-traced per call.
    """
    title_embeddings = use_model(titles_tensor)  # [batch, embedding_dim]
    # Dot product of every title against every genre embedding:
    # [batch, dim] x [dim, n_genres] -> [batch, n_genres]
    similarity = tf.matmul(title_embeddings, genre_embeddings, transpose_b=True)
    return tf.argmax(similarity, axis=1)


def classify_batch(titles_batch):
    """Return, for each title, the index of the most similar genre embedding.

    Args:
        titles_batch: a sequence (or string tensor) of title strings.

    Returns:
        A 1-D integer tensor with one index per title; each index refers to a
        row of the module-level ``genre_embeddings``.

    Notes:
        * The input is converted to a string tensor *before* entering the
          compiled function. Passing a plain Python list straight into a
          ``tf.function`` makes TensorFlow treat the strings as Python
          constants, which forces a fresh trace (and a new graph with the
          strings baked in) for every distinct batch.
        * Out-of-memory errors are deliberately not caught here. They are
          raised when the compiled graph executes, not while its Python body
          is traced, so an in-function ``except`` cannot handle them; the
          previous handler also assigned to ``batch_size`` (making it an
          unbound local) and fell through returning ``None``. Callers that
          want to retry with smaller batches should catch
          ``tf.errors.ResourceExhaustedError`` around this call.
    """
    titles_tensor = tf.convert_to_tensor(titles_batch, dtype=tf.string)
    return _classify_batch_graph(titles_tensor)

# Convert the "title" column into a list for batch classification
titles = df["title"].tolist()

# 3. PROCESS IN BATCHES
batch_size = 4096  # Starting size; halved whenever the device runs out of memory
all_genres = []

i = 0
while i < len(titles):
    batch = titles[i:i+batch_size]
    try:
        batch_indices = classify_batch(batch).numpy()
    except tf.errors.ResourceExhaustedError:
        # The OOM surfaces here, at the call site. Retry the same slice with a
        # smaller batch; give up only when a single title still does not fit.
        if batch_size == 1:
            raise
        batch_size //= 2
        continue
    # Map each argmax index back to its genre name (GENRE_NAMES order).
    all_genres.extend(GENRE_NAMES[idx] for idx in batch_indices)
    i += len(batch)

# One label per input row, in the original row order.
df["genre"] = all_genres



In [None]:
#if i % (10*batch_size) == 0:
#    print(f"Processed {i}/{len(titles)} titles")

In [17]:
# Count missing values per column (column-wise sum of the null mask).
df.isna().sum(axis=0)

Unnamed: 0,0
ISBN,0
title,0
genre,0


In [21]:
# Write the labelled table to CSV without the row index and report the path.
output_path = "/content/genres.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"💾 Results saved to {output_path}")

# Colab-specific step: hand the saved file to the browser as a download.
from google.colab import files
files.download(output_path)

💾 Results saved to /content/genres.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>