# Task for Today  

***

## Bestseller Genre Prediction  

Given *data about Amazon's Top 50 best-selling books from 2009 to 2019*, let's try to predict the **genre** of a given book.  
  
We will use a TensorFlow ANN with two inputs (the tokenized book title and the remaining tabular features) to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
# Relative path to the bestsellers CSV, kept in a variable so the read call stays short.
csv_path = '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
data = pd.read_csv(csv_path)

In [None]:
# Show the loaded DataFrame as the cell output.
data

In [None]:
# Print the per-column summary (dtypes and non-null counts) of the DataFrame.
data.info()

# Preprocessing

In [None]:
# Keep the English stop words in a frozenset: process_name tests every word of
# every title against this collection, and set membership is O(1) whereas the
# list returned by NLTK would be scanned linearly on each test.
stop_words = frozenset(stopwords.words('english'))

def process_name(name, stop_word_set=None):
    """Strip digits and stop words from a book title.

    Every run of digits is replaced by a space, the title is split on
    whitespace, and words found in the stop-word collection are dropped.
    The comparison uses the lowercased word, so capitalised stop words such
    as "The" or "Of" are removed as well; the words that are kept retain
    their original casing.

    Args:
        name: The raw title string.
        stop_word_set: Optional collection of lowercase stop words. When
            omitted, the module-level ``stop_words`` is used.

    Returns:
        The remaining words joined by single spaces (possibly an empty
        string when nothing is left).
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    words = re.sub(r'\d+', ' ', name).split()
    # Lowercase only for the membership test; the stop-word entries are lowercase.
    return " ".join(word for word in words if word.lower() not in stop_word_set)

In [None]:
# Run process_name over every value of the Name column.
names = data['Name'].apply(process_name)

In [None]:
# Build the word -> integer-index vocabulary from the cleaned titles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(names)

# One more than the number of distinct words.
# NOTE(review): presumably because Tokenizer indices start at 1 and index 0 is
# left for padding — confirm against the Keras Tokenizer documentation.
vocab_length = len(tokenizer.word_index) + 1

print("Vocabulary length:", vocab_length)

In [None]:
# Replace each cleaned title by the sequence of integer indices of its words.
names = tokenizer.texts_to_sequences(names)

In [None]:
# Length of the longest tokenised title; used as the common sequence length.
max_seq_length = np.max([len(sequence) for sequence in names])

print("Max sequence length:", max_seq_length)

In [None]:
# Pad every sequence at the end ('post') up to max_seq_length so all titles share one width.
names = pad_sequences(names, maxlen=max_seq_length, padding='post')

In [None]:
# Show the padded index sequences as the cell output.
names

In [None]:
# The titles now live in `names`, so remove the raw text column from the table.
data = data.drop(columns='Name')

# Encoding Other Features

In [None]:
# Show the table without the Name column.
data

In [None]:
# Binary target encoding: Non Fiction -> 0, Fiction -> 1.
genre_mapping = {'Non Fiction': 0, 'Fiction': 1}

# Cast explicitly instead of relying on replace() to downcast the object column
# to integers: that implicit downcast is deprecated since pandas 2.2. The cast
# also fails loudly if a genre missing from the mapping is left behind as text.
data['Genre'] = data['Genre'].replace(genre_mapping).astype('int64')

In [None]:
# Number of distinct values in the Author column (a missing value, if present, counts once).
author_count = data['Author'].nunique(dropna=False)
print("Number of unique authors:", author_count)

In [None]:
def onehot_encode(df, column, prefix):
    """Return a new DataFrame with ``column`` replaced by one-hot indicator columns.

    Args:
        df: Source DataFrame; it is left unmodified.
        column: Name of the categorical column to expand.
        prefix: Prefix passed to ``pd.get_dummies`` for the new column names.

    Returns:
        A DataFrame holding the other columns of ``df`` followed by the
        indicator columns, with ``column`` itself removed.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=prefix)
    # concat builds a fresh frame, so the caller's df is never touched.
    combined = pd.concat([df, indicator_columns], axis=1)
    return combined.drop(columns=column)

In [None]:
# Replace Author by one indicator column per author, using the "auth" prefix.
data = onehot_encode(data, 'Author', 'auth')

In [None]:
# Show the table after the Author column has been one-hot encoded.
data

# Splitting/Scaling

In [None]:
# Separate the feature columns from the Genre target.
X = data.drop(columns='Genre')
y = data['Genre'].copy()

In [None]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [None]:
names_train, names_test, X_train, X_test, y_train, y_test = train_test_split(names, X, y, train_size=0.7, random_state=100)

# Modeling/Training

In [None]:
# Show the shape of the padded title array; its second dimension is the sequence length.
names.shape

In [None]:
# Show the shape of the non-text feature matrix; its second dimension is the feature count.
X.shape

In [None]:
embedding_dim = 64

# Name features: padded word-index sequences -> embedding -> flattened vector.
# The input width is taken from max_seq_length rather than a hard-coded 20, so it
# always agrees with the Embedding's input_length and the padded data.
name_input = tf.keras.Input(shape=(int(max_seq_length),), name="name_input")

embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    input_length=max_seq_length,
    name="name_embedding"
)(name_input)

name_flatten = tf.keras.layers.Flatten(name="name_flatten")(embedding)


# Other features: two dense layers over the scaled non-text columns.
# The width is read from the training matrix rather than a hard-coded 252, so the
# model still builds if the number of one-hot columns changes.
other_input = tf.keras.Input(shape=(X_train.shape[1],), name="other_input")

hidden_1 = tf.keras.layers.Dense(256, activation='relu', name="other_dense_1")(other_input)
hidden_2 = tf.keras.layers.Dense(256, activation='relu', name="other_dense_2")(hidden_1)

# Concatenate both branches and emit a single sigmoid probability
# (1 = Fiction under the genre mapping).
concat = tf.keras.layers.concatenate([name_flatten, hidden_2], name="concatenate")

outputs = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(concat)


model = tf.keras.Model(inputs=[name_input, other_input], outputs=outputs)


# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line.
model.summary()

tf.keras.utils.plot_model(model)

In [None]:
# Binary classification set-up: Adam optimiser, cross-entropy loss, accuracy and AUC tracked.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

batch_size = 32
epochs = 100

# Stop once val_loss has failed to improve for 3 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# 12% of the training rows are held out as the validation set.
history = model.fit(
    x=[names_train, X_train],
    y=y_train,
    validation_split=0.12,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping],
)

# Results

In [None]:
# Score the trained model on the held-out test split; the cell output is the loss
# followed by the metrics given to compile() (accuracy, auc).
model.evaluate([names_test, X_test], y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/FIY53JthQD0