Can we predict, with some degree of accuracy, which category or genre a book belongs to from its title alone?

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
nlp = spacy.load('en')
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Read the Goodreads export into a pandas DataFrame
raw_data_df = pd.read_csv('/kaggle/input/goodreads-10k-dataset-integrated/books_updated.csv')
# Keep only the original title and the tags column
columns = ['original_title', 'tag_name']
df = raw_data_df[columns].copy()
# Turn empty strings into NaN so a single dropna() removes rows with a missing
# or blank title/tag list. The result is assigned back rather than calling
# df[col].replace(..., inplace=True): an in-place call on a column selection is
# chained assignment, which warns on recent pandas and does not modify df
# under copy-on-write.
df[columns] = df[columns].replace('', np.nan)
df = df.dropna()
df.head()

In [None]:
# Flatten the comma-separated tag strings into one list. Despite its name,
# unique_tags holds every tag occurrence (a tag repeats once per book tagged with it).
unique_tags = [val.strip() for sublist in df['tag_name'].dropna().str.split(",").tolist() for val in sublist]
# Count the occurrences of each tag: one row per distinct tag. The rename covers
# pandas versions that label the count column 0 instead of 'count'.
tags_summary = pd.DataFrame(unique_tags,columns=['tag_name']).value_counts().reset_index().rename(columns={0:'count'})
# Report the number of distinct tags; len(unique_tags) would count every occurrence
print(f'No. of unique tags {len(tags_summary)}, first 10 entries {unique_tags[0:10]}')

In [None]:
# Show the first five rows of the tag count table
tags_summary.head(5)

In [None]:
# test = pd.DataFrame(unique_tags,columns=['tag_name'])
# sns.countplot(x='tag_name',data=test[0:50])


In [None]:
# Tags that describe shelving, format or reading status rather than a genre
# (e.g. audio, toread). 'fiction', 'fantasy' and 'nonfiction' are not in this
# list, so they stay.
to_remove = ['library','audio', 'books', 'audiobook', 'read', 'tobuy', 'ebook', 'ya', 'ownedbooks', 'default', 'readin', 'kindle', 'bookclub', 'series', 'booksiown', 'owned', 'currentlyreading', 'favourites', 'favorites', 'ebooks', 'childrens', 'toread', 'audiobooks']
# NOTE: 'ya' and 'childrens' are dropped along with the rest; nothing is merged here.
non_genre_rows = tags_summary.index[tags_summary['tag_name'].isin(to_remove)]
tags_summary.drop(non_genre_rows, inplace=True)

# The first 30 remaining rows of the summary become the genre labels
genres_as_list = tags_summary['tag_name'].iloc[:30].tolist()
print(genres_as_list)

In [None]:
# Bar chart of the counts for rows 50-149 of the tag summary
plotted_rows = tags_summary[50:150]
plt.figure(figsize=(30, 4))
plt.bar(plotted_rows['tag_name'], plotted_rows['count'])
# Draw the tag names vertically along the x axis
plt.xticks(rotation=90)
plt.show()

In [None]:
# Create one indicator column per genre: 1 if the book carries that exact tag, else 0.
# Each tag string is first split into a set of stripped tags so the check is an
# exact match; a substring test on the raw string would mark 'fiction' for a
# book that is only tagged 'nonfiction'.
tag_sets = df['tag_name'].map(lambda tags: {tag.strip() for tag in tags.split(',')})
for genre in genres_as_list:
    df[genre] = tag_sets.map(lambda tags, genre=genre: int(genre in tags))

In [None]:
# The per-genre columns have been built, so the raw tag string can go
df.drop(columns='tag_name', inplace=True)

df.head()

In [None]:
def load_data(df, split=0.9, random_state=7):
    """Shuffle ``df`` and split it into training and validation sets.

    Arguments
    ---------
    df: DataFrame with an 'original_title' column; every other column is
        treated as a category label.
    split: Fraction of the shuffled rows used for training.
    random_state: Seed for the shuffle. The default reproduces the fixed
        shuffle this function has always used.

    Returns
    -------
    (train_texts, train_labels, val_texts, val_labels), where the texts are
    slices of the shuffled 'original_title' column and every label is a
    ``{"cats": {column: value, ...}}`` dict.
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    texts = shuffled['original_title']
    # Every column other than the title is a category label
    cats = shuffled.drop(['original_title'], axis=1).to_dict('records')
    # Number of rows that go to the training set
    n_train = int(len(shuffled) * split)

    train_labels = [{"cats": row} for row in cats[:n_train]]
    val_labels = [{"cats": row} for row in cats[n_train:]]

    # iloc makes the positional slicing explicit (the index is shuffled)
    return texts.iloc[:n_train], train_labels, texts.iloc[n_train:], val_labels

In [None]:
# Shuffle the frame and split it using load_data's default split fraction
train_texts, train_labels, val_texts, val_labels = load_data(df)

In [None]:
# Print the first two training titles and their label dicts under a heading each
for heading, sample in (
    ('Texts from training data\n------', train_texts[:2]),
    ('\nLabels from training data\n------', train_labels[:2]),
):
    print(heading)
    print(sample)

In [None]:
# Build a TextCategorizer that uses the bag-of-words ("bow") architecture.
# exclusive_classes is not set here, so the library default applies.
textcat_config = {"architecture": "bow"}
textcat = nlp.create_pipe("textcat", config=textcat_config)

# Append the categorizer to the loaded pipeline
nlp.add_pipe(textcat)

In [None]:
# Register every selected genre as a label of the text classifier
for label in genres_as_list:
    textcat.add_label(label)

In [None]:
from spacy.util import minibatch
import random

def train(model, train_data, optimizer):
    """Run one pass over ``train_data`` and return the accumulated losses.

    ``train_data`` is a list of ``(text, label)`` tuples. It is shuffled in
    place with a fixed seed, then fed to ``model.update`` in batches of 8.
    """
    # Fixed seed: the shuffle order is the same on every run
    random.seed(1)
    random.shuffle(train_data)

    losses = {}
    for batch in minibatch(train_data, size=8):
        # Transpose [(text0, label0), (text1, label1), ...] into parallel tuples
        batch_texts, batch_labels = zip(*batch)
        model.update(batch_texts, batch_labels, sgd=optimizer, losses=losses)

    return losses

In [None]:
# Fix seed for reproducibility
spacy.util.fix_random_seed(1)
random.seed(1)

# This may take a while to run!
# begin_training() returns the optimizer that train() hands to model.update
optimizer = nlp.begin_training()
# Pair each title with its {"cats": ...} label dict, the shape train() expects
train_data = list(zip(train_texts, train_labels))
# A single pass over the training data
losses = train(nlp, train_data, optimizer)
# Loss accumulated under the 'textcat' key during that pass
print(losses['textcat'])

In [None]:
# Sanity check: run one hand-written title through the trained pipeline
text = "The girl with the dragon tattoo"
doc = nlp(text)
# Print the category scores attached to the processed doc
print(doc.cats)

In [None]:
def predict(nlp, texts):
    """Return, for each text, the index of the label with the highest score.

    Only the tokenizer and the 'textcat' pipe of ``nlp`` are used.
    """
    # Tokenize every input text with the model's tokenizer
    tokenized = list(map(nlp.tokenizer, texts))

    categorizer = nlp.get_pipe('textcat')
    # predict() returns a pair; only the score matrix (first item) is needed
    score_matrix, _ = categorizer.predict(tokenized)

    # Column index of the best score in each row
    return score_matrix.argmax(axis=1)

In [None]:
# Spot-check a few validation titles against their predicted label
texts = val_texts[34:38]
predictions = predict(nlp, texts)

for label_idx, title in zip(predictions, texts):
    print(f"{textcat.labels[label_idx]}: {title} \n")

In [None]:
# Predict a label index for every validation title
texts = val_texts
predictions = predict(nlp, texts)

# Collapse each multi-label dict to a single genre: max() returns the first key
# holding the largest value, i.e. the first genre marked 1 (or simply the first
# genre when none is marked)
true_classes = [
    max(entry['cats'], key=entry['cats'].get)
    for entry in val_labels
]

def get_accuracry(predictions, true_classes, labels=None):
    """Return the fraction of predictions whose label equals the true class.

    Arguments
    ---------
    predictions: Predicted label indices, one per sample.
    true_classes: True label names, one per sample.
    labels: Sequence mapping a label index to its name. Defaults to the
        labels of the module-level ``textcat`` pipe.

    Returns
    -------
    Accuracy as a float; 0.0 when there is nothing to compare.
    """
    if labels is None:
        labels = textcat.labels

    hits = [labels[p] == c for p, c in zip(predictions, true_classes)]
    # Guard against dividing by zero on empty input
    if not hits:
        return 0.0
    return sum(hits) / len(hits)


# Accuracy of the predictions against the single genre kept per book in true_classes
print(f'Accuracy: {get_accuracry(predictions, true_classes)}')

In [None]:
def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model on multi-label data.

        A prediction counts as correct when the predicted label is any one
        of the categories marked 1 for that sample.

        Arguments
        ---------
        model: spaCy model with a 'textcat' pipe
        texts: Text samples, from load_data function
        labels: True labels, from load_data function

        Returns
        -------
        Fraction of correct predictions; 0.0 when there are no samples.

    """
    # Get predictions from textcat model (using the predict function)
    predicted_class = predict(model, texts)
    # Resolve label names from the model being evaluated, not from a global
    # pipe, so the indices always map to this model's labels
    label_names = model.get_pipe('textcat').labels

    # For every sample, the set of categories flagged with 1
    true_class = [
        {cat for cat, flag in label['cats'].items() if flag == 1}
        for label in labels
    ]

    # One boolean per sample: is the predicted label among the true ones?
    correct_predictions = [
        label_names[p] in cats for p, cats in zip(predicted_class, true_class)
    ]

    # Guard against dividing by zero on empty input
    if not correct_predictions:
        return 0.0

    # The accuracy, number of correct predictions divided by all predictions
    return sum(correct_predictions) / len(correct_predictions)

TODO: clean rows that do not match any of the categories

In [None]:
# Score the validation split with evaluate() and print the result to four decimals
accuracy = evaluate(nlp, val_texts, val_labels)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# doc = nlp(train_texts[0])

# print(f"Token \t\tLemma \t\tStopword".format('Token', 'Lemma', 'Stopword'))
# print("-"*40)
# for token in doc:
#     print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")