In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from utilities import read_xml

In [27]:
# Import
PATH_DEV = os.path.join(os.getcwd(), 'origdata', 'BlurbGenreCollection_EN_dev.txt')
PATH_TEST = os.path.join(os.getcwd(), 'origdata', 'BlurbGenreCollection_EN_test.txt')
PATH_TRAIN = os.path.join(os.getcwd(), 'origdata', 'BlurbGenreCollection_EN_train.txt')

df_train = read_xml(PATH_TRAIN)
df_test = read_xml(PATH_TEST)
df_dev = read_xml(PATH_DEV)

frames = [df_train, df_test, df_dev]
df = pd.concat(frames)


In [28]:
df.head()

Unnamed: 0,TITLE,AUTHOR,PUBLISHED,ISBN,PAGE_NUM,URL,TOPICS,COPYRIGHT,DESCRIPTION,DATE,LANGUAGE
0,The New York Times Daily Crossword Puzzles: Th...,New York Times,"Dec 28, 1996",9780804115827,224 Pages,https://www.penguinrandomhouse.com/books/12309...,"Nonfiction, Games",(c) Penguin Random House,Monday’s Crosswords Do with EaseTuesday’s Cros...,2018-08-18,en
1,Creatures of the Night (Second Edition),Neil Gaiman,"Nov 29, 2016",9781506700250,48 Pages,https://www.penguinrandomhouse.com/books/53958...,"Fiction, Graphic Novels Manga",(c) Penguin Random House,Two of literary comics modern masters present ...,2018-08-18,en
2,Cornelia and the Audacious Escapades of the So...,Lesley M. M. Blume,"Jan 08, 2008",9780440421108,272 Pages,https://www.penguinrandomhouse.com/books/15160...,"Children’s Books, Children’s Middle Grade Books",(c) Penguin Random House,Eleven-year-old Cornelia is the daughter of tw...,2018-08-18,en
3,The Alchemist's Daughter,Katharine McMahon,"Oct 24, 2006",9780307335852,352 Pages,https://www.penguinrandomhouse.com/books/11223...,"Fiction, Historical Fiction",(c) Penguin Random House,"During the English Age of Reason, a woman cloi...",2018-08-18,en
4,Dangerous Boy,Mandy Hubbard,"Aug 30, 2012",9781101575017,272 Pages,https://www.penguinrandomhouse.com/books/30534...,"Teen Young Adult, Teen Young Adult Mystery ...",(c) Penguin Random House,A modern-day retelling of The Strange Case of ...,2018-08-18,en


In [29]:
# Basic Statistic

print('Number of samples: ', df.shape[0])
print('Number of columns: ', df.shape[1])
print('Columns: ', ', '.join(df.columns))

Number of samples:  91894
Number of columns:  11
Columns:  TITLE, AUTHOR, PUBLISHED, ISBN, PAGE_NUM, URL, TOPICS, COPYRIGHT, DESCRIPTION, DATE, LANGUAGE


In [30]:
df["AUTHOR"].value_counts()
# DK (Dorling Kindersley) - 1022 - British publishing company
# Golden Books - 445 - children's books series
# Random House - 234 - an imprint and publishing group of Penguin Random House
# Louis L'Amour - American author renowned for his western novels

AUTHOR
DK                                            1022
Various                                        445
Golden Books                                   251
Random House                                   234
Louis L'Amour                                  234
                                              ... 
Peter Mansfield                                  1
Jarett Kobek                                     1
Roger Fisher, William L. Ury, Bruce Patton       1
Amy Finley                                       1
Ches Schneider                                   1
Name: count, Length: 35928, dtype: int64

In [31]:
df.isnull().sum()
# 2364 Authors missing

TITLE             0
AUTHOR         2364
PUBLISHED         0
ISBN              0
PAGE_NUM          0
URL               0
TOPICS            0
COPYRIGHT         0
DESCRIPTION       0
DATE              0
LANGUAGE          0
dtype: int64

In [32]:
df["TOPICS"].value_counts()

TOPICS
Children’s Books                                                                                            11375
Children’s Books, Children’s Middle Grade Books                                                              5156
Fiction, Graphic Novels  Manga                                                                               3793
Fiction, Literary Fiction                                                                                    3098
Religion, Nonfiction, Religion  Philosophy                                                                   2172
                                                                                                            ...  
World History, Religion, Nonfiction, Religion  Philosophy, History                                              1
World History, Military History, World War II Military History, Asian World History, Nonfiction, History        1
Management, Economics, Technology, Nonfiction, Business, Popular Science         

In [33]:
all_topics = df["TOPICS"].str.split(", ").explode()
all_topics.value_counts()

TOPICS
Nonfiction                       34270
Fiction                          32812
Children’s Books                 19792
Mystery  Suspense                 8856
Children’s Middle Grade Books     7728
                                 ...  
Travel: Middle East                 17
Travel: Africa                      14
Travel: Caribbean  Mexico           14
Bibles                               5
Travel: Australia  Oceania           5
Name: count, Length: 146, dtype: int64

In [34]:
df.info() # all columns are objects...

<class 'pandas.core.frame.DataFrame'>
Index: 91894 entries, 0 to 14784
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   TITLE        91894 non-null  object
 1   AUTHOR       89530 non-null  object
 2   PUBLISHED    91894 non-null  object
 3   ISBN         91894 non-null  object
 4   PAGE_NUM     91894 non-null  object
 5   URL          91894 non-null  object
 6   TOPICS       91894 non-null  object
 7   COPYRIGHT    91894 non-null  object
 8   DESCRIPTION  91894 non-null  object
 9   DATE         91894 non-null  object
 10  LANGUAGE     91894 non-null  object
dtypes: object(11)
memory usage: 8.4+ MB


In [35]:
df['PAGE_NUM'] = df['PAGE_NUM'].str.extract(r'(\d+)').astype(int)
df['PUBLISHED'] = pd.to_datetime(df['PUBLISHED'], errors='coerce')
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91894 entries, 0 to 14784
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   TITLE        91894 non-null  object        
 1   AUTHOR       89530 non-null  object        
 2   PUBLISHED    91894 non-null  datetime64[ns]
 3   ISBN         91894 non-null  object        
 4   PAGE_NUM     91894 non-null  int64         
 5   URL          91894 non-null  object        
 6   TOPICS       91894 non-null  object        
 7   COPYRIGHT    91894 non-null  object        
 8   DESCRIPTION  91894 non-null  object        
 9   DATE         91894 non-null  datetime64[ns]
 10  LANGUAGE     91894 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 8.4+ MB


Try NLTK: https://www.nltk.org/

In [37]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lolo7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lolo7\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [38]:
print(set(stopwords.words('english')))

{"you'd", 'during', 'herself', 'in', "she'll", 'against', 'wouldn', 'them', 'wasn', 'will', 'mustn', "you're", "don't", "she'd", 'm', 'after', 'me', 'than', 'as', "shan't", 'that', 'itself', 'then', 'while', 'when', 'most', 'himself', "mustn't", 'being', 'whom', 'we', 'your', 'now', 'hasn', 's', "that'll", 'at', 'such', "they'll", "won't", "isn't", "wasn't", 'didn', 'how', "you'll", 'her', 'theirs', 'there', "i'm", "needn't", 've', 'so', 'until', "mightn't", 'themselves', 'have', 'each', 'which', 'but', 'both', "they're", 'who', 'yourselves', 'doesn', 'some', 'all', "he'll", 'where', 'and', 'my', 'off', 't', 'he', 'here', 'd', 'has', 'shouldn', 'out', 'don', "i'd", 'just', 'our', 'were', 're', 'once', "hasn't", 'ma', "shouldn't", "i've", 'needn', 'did', 'mightn', 'or', 'can', 'before', 'isn', 'over', "we've", 'any', 'hers', 'no', "doesn't", 'yours', "she's", "couldn't", 'nor', "haven't", 'was', 'y', 'their', 'had', 'an', 'its', "i'll", 'with', "aren't", 'am', 'this', 'aren', 'she', 'is

In [39]:
data = "Monday’s Crosswords Do with EaseTuesday’s Crosswords Not a BreezeWednesday’s Crosswords Harder StillThursday’s Crosswords Take Real SkillFriday’s Crosswords — You’ve Come This Far…Saturday’s Crosswords — You’re a Star!For millions of people, the New York Times crossword puzzles are as essential to each day as the first cup of coffee in the morning. Now, for the first time ever, these premier puzzles are available in six clever installments. With each day of the week, the puzzles increase gradually in skill level; Monday’s the easiest, but Saturday’s sure to challenge! Push your mental muscles a little harder each day with America’s favorite sophisticated — and fun — pastime: the New York Times crossword puzzles!The legendary Eugene T. Maleska was crossword editor of The New York Times from 1977 to 1993."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered_nltk = [w for w in words if w not in stopWords]

print(wordsFiltered_nltk)


['Monday', '’', 'Crosswords', 'Do', 'EaseTuesday', '’', 'Crosswords', 'Not', 'BreezeWednesday', '’', 'Crosswords', 'Harder', 'StillThursday', '’', 'Crosswords', 'Take', 'Real', 'SkillFriday', '’', 'Crosswords', '—', 'You', '’', 'Come', 'This', 'Far…Saturday', '’', 'Crosswords', '—', 'You', '’', 'Star', '!', 'For', 'millions', 'people', ',', 'New', 'York', 'Times', 'crossword', 'puzzles', 'essential', 'day', 'first', 'cup', 'coffee', 'morning', '.', 'Now', ',', 'first', 'time', 'ever', ',', 'premier', 'puzzles', 'available', 'six', 'clever', 'installments', '.', 'With', 'day', 'week', ',', 'puzzles', 'increase', 'gradually', 'skill', 'level', ';', 'Monday', '’', 'easiest', ',', 'Saturday', '’', 'sure', 'challenge', '!', 'Push', 'mental', 'muscles', 'little', 'harder', 'day', 'America', '’', 'favorite', 'sophisticated', '—', 'fun', '—', 'pastime', ':', 'New', 'York', 'Times', 'crossword', 'puzzles', '!', 'The', 'legendary', 'Eugene', 'T.', 'Maleska', 'crossword', 'editor', 'The', 'New', 

Try spacy: https://spacy.io/usage

In [40]:
import spacy
# spacy.cli.download("en_core_web_sm") # download once
nlp = spacy.load("en_core_web_sm")

In [60]:
print(set(nlp.Defaults.stop_words))

{'herself', 'during', 'namely', 'somehow', 'except', 'whereas', 'them', 'call', 'else', 'make', 'me', 'anyhow', 'than', 'serious', 'name', 'itself', 'amount', 'then', 'something', 'when', 'himself', 'every', 'whom', 'we', 'your', 'whose', 'now', 'noone', 'never', 'such', 'twenty', 'nine', 'may', 'how', 'eight', 'part', 'seemed', 'sixty', '‘re', 'there', 'nobody', 'latterly', 'themselves', 'three', 'former', 'which', 'since', 'some', 'hence', 'fifteen', 'all', 'where', "n't", 'off', 'seem', 'he', 'still', 'beyond', 'almost', 'became', 'therefore', 'has', 'out', 'whenever', 'forty', 'last', 'just', 'were', 're', 'well', 'sometime', 'whatever', 'along', 'using', 'even', 'herein', 'or', 'would', 'before', 'over', 'any', 'no', 'yours', '‘m', 'become', 'without', 'bottom', 'unless', 'first', 'ten', 'was', 'beforehand', 'keep', 'had', 'an', 'perhaps', 'thus', 'within', 'indeed', 'this', 'she', 'formerly', 'several', 'own', 'these', 'though', 'full', 'do', 'behind', 'ourselves', '‘d', 'might',

In [62]:
nltk_stopwords = set(stopwords.words('english'))
spacy_stopwords = set(nlp.Defaults.stop_words)

nlp_not_in_nltk = spacy_stopwords - nltk_stopwords  # SpaCy stopwords not in NLTK
nltk_not_in_nlp = nltk_stopwords - spacy_stopwords  # NLTK stopwords not in SpaCy
common_stopwords = nltk_stopwords & spacy_stopwords   # Common stopwords

print("Stopwords in NLTK but not in SpaCy:", sorted(nltk_not_in_nlp))
print("Stopwords in SpaCy but not in NLTK:", sorted(nlp_not_in_nltk))
print("Common stopwords:", sorted(common_stopwords))

Stopwords in NLTK but not in SpaCy: ['ain', 'aren', "aren't", 'couldn', "couldn't", 'd', 'didn', "didn't", 'doesn', "doesn't", 'don', "don't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'having', "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've", 'isn', "isn't", "it'd", "it'll", "it's", 'll', 'm', 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'o', 's', 'shan', "shan't", "she'd", "she'll", "she's", "should've", 'shouldn', "shouldn't", 't', "that'll", 'theirs', "they'd", "they'll", "they're", "they've", 've', 'wasn', "wasn't", "we'd", "we'll", "we're", "we've", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'y', "you'd", "you'll", "you're", "you've"]
Stopwords in SpaCy but not in NLTK: ["'d", "'ll", "'m", "'re", "'s", "'ve", 'across', 'afterwards', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'amount', 'another', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'around', 'back', 'became', '

In [42]:
doc = nlp("Monday’s Crosswords Do with EaseTuesday’s Crosswords Not a BreezeWednesday’s Crosswords Harder StillThursday’s Crosswords Take Real SkillFriday’s Crosswords — You’ve Come This Far…Saturday’s Crosswords — You’re a Star!For millions of people, the New York Times crossword puzzles are as essential to each day as the first cup of coffee in the morning. Now, for the first time ever, these premier puzzles are available in six clever installments. With each day of the week, the puzzles increase gradually in skill level; Monday’s the easiest, but Saturday’s sure to challenge! Push your mental muscles a little harder each day with America’s favorite sophisticated — and fun — pastime: the New York Times crossword puzzles!The legendary Eugene T. Maleska was crossword editor of The New York Times from 1977 to 1993.")

stopWords = set(nlp.Defaults.stop_words)
wordsFiltered_spacy = [w.text for w in doc if w.text.lower not in stopWords]
print(wordsFiltered_spacy)

['Monday', '’s', 'Crosswords', 'Do', 'with', 'EaseTuesday', '’s', 'Crosswords', 'Not', 'a', 'BreezeWednesday', '’s', 'Crosswords', 'Harder', 'StillThursday', '’s', 'Crosswords', 'Take', 'Real', 'SkillFriday', '’s', 'Crosswords', '—', 'You', '’ve', 'Come', 'This', 'Far', '…', 'Saturday', '’s', 'Crosswords', '—', 'You', '’re', 'a', 'Star!For', 'millions', 'of', 'people', ',', 'the', 'New', 'York', 'Times', 'crossword', 'puzzles', 'are', 'as', 'essential', 'to', 'each', 'day', 'as', 'the', 'first', 'cup', 'of', 'coffee', 'in', 'the', 'morning', '.', 'Now', ',', 'for', 'the', 'first', 'time', 'ever', ',', 'these', 'premier', 'puzzles', 'are', 'available', 'in', 'six', 'clever', 'installments', '.', 'With', 'each', 'day', 'of', 'the', 'week', ',', 'the', 'puzzles', 'increase', 'gradually', 'in', 'skill', 'level', ';', 'Monday', '’s', 'the', 'easiest', ',', 'but', 'Saturday', '’s', 'sure', 'to', 'challenge', '!', 'Push', 'your', 'mental', 'muscles', 'a', 'little', 'harder', 'each', 'day', 'w

Clean **Description** by lowercasing + remove special characters

In [43]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) # only keep a-z + whitespaces
    text = re.sub(r'\s+', ' ', text).strip() # remove consecutive whitespaces + leading/trailing
    return text

df['clean_description'] = df['DESCRIPTION'].astype(str).apply(clean_text)
df[['DESCRIPTION', 'clean_description']].head()

Unnamed: 0,DESCRIPTION,clean_description
0,Monday’s Crosswords Do with EaseTuesday’s Cros...,mondays crosswords do with easetuesdays crossw...
1,Two of literary comics modern masters present ...,two of literary comics modern masters present ...
2,Eleven-year-old Cornelia is the daughter of tw...,elevenyearold cornelia is the daughter of two ...
3,"During the English Age of Reason, a woman cloi...",during the english age of reason a woman clois...
4,A modern-day retelling of The Strange Case of ...,a modernday retelling of the strange case of d...


Tokenize **Description** + remove stopwords

In [44]:
stop_words = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

df['tokens'] = df['clean_description'].apply(tokenize_text)
df[['clean_description', 'tokens']].head()

Unnamed: 0,clean_description,tokens
0,mondays crosswords do with easetuesdays crossw...,"[mondays, crosswords, easetuesdays, crosswords..."
1,two of literary comics modern masters present ...,"[two, literary, comics, modern, masters, prese..."
2,elevenyearold cornelia is the daughter of two ...,"[elevenyearold, cornelia, daughter, two, world..."
3,during the english age of reason a woman clois...,"[english, age, reason, woman, cloistered, sinc..."
4,a modernday retelling of the strange case of d...,"[modernday, retelling, strange, case, dr, jeky..."


Vectorize using TF-IDF (maybe use something different later - word2vec)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer(stop_words='english') # english stopwords ignored
X = vectorizer.fit_transform(df['clean_description'])

print("TF-IDF matrix shape:", X.shape)
# 91894 book descriptions
# 343357 vocabulary size

TF-IDF matrix shape: (91894, 343357)


Cluster **Desription** using KMeans

In [63]:
true_k = 146 # two clusters
model = KMeans(n_clusters=true_k, random_state=42)
model.fit(X)

# Add cluster labels to the DataFrame
df['cluster'] = model.labels_
print(df[['TITLE', 'clean_description', 'cluster']])

                                                   TITLE  \
0      The New York Times Daily Crossword Puzzles: Th...   
1                Creatures of the Night (Second Edition)   
2      Cornelia and the Audacious Escapades of the So...   
3                               The Alchemist's Daughter   
4                                          Dangerous Boy   
...                                                  ...   
14780                                  Finding the Quiet   
14781            You Can Save the Earth, Revised Edition   
14782                                   The Holistic Cat   
14783                                Jigsaw Sticker Book   
14784                                          The Flame   

                                       clean_description  cluster  
0      mondays crosswords do with easetuesdays crossw...      106  
1      two of literary comics modern masters present ...       29  
2      elevenyearold cornelia is the daughter of two ...       18  
3      

Inspect Cluster Centers to get idea of cluster themes

In [47]:
def print_top_terms_per_cluster(model, vectorizer, n_terms=5):
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    for i in range(true_k):
        print(f"Cluster {i} top terms:")
        top_terms = [terms[ind] for ind in order_centroids[i, :n_terms]]
        print(", ".join(top_terms))
        print("")

print_top_terms_per_cluster(model, vectorizer)

Cluster 0 top terms:
book, new, world, guide, life

Cluster 1 top terms:
new, life, story, love, world



Connect to Neoj3 database

In [57]:
from neo4j import GraphDatabase
import requests
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neo4jtest1"))

try:
    response = requests.get("http://localhost:7474")
    if response.status_code != 200:
        raise SystemExit(f"Neo4j is unreachable. HTTP Status Code: {response.status_code}")
    else:
        print("Neo4j is up and reachable!")

except requests.exceptions.RequestException as e:
    raise SystemExit(f"ERROR: Could not reach Neo4j. Reason: {e}")

def create_book_node(tx, book):
    # Example: create a Book node with title and description properties.
    query = (
        "CREATE (b:Book {title: $title, description: $description})"
    )
    tx.run(query, title=book['TITLE'], description=book['clean_description'])

with driver.session() as session:
    for _, book in df.head(10).iterrows(): # test with only 10 books
        session.execute_write(create_book_node, book)


print("Book nodes have been created in Neo4j.")

Neo4j is up and reachable!
Book nodes have been created in Neo4j.


In [58]:
def get_books(tx):
    query = "MATCH (b:Book) RETURN b.title AS title, b.description AS description LIMIT 5"
    return list(tx.run(query))

with driver.session() as session:
    books = session.execute_read(get_books)
    print("Sample books from Neo4j:")
    for book in books:
        print(book)

Sample books from Neo4j:
<Record title='The New York Times Daily Crossword Puzzles: Thursday, Volume 1' description='mondays crosswords do with easetuesdays crosswords not a breezewednesdays crosswords harder stillthursdays crosswords take real skillfridays crosswords youve come this farsaturdays crosswords youre a starfor millions of people the new york times crossword puzzles are as essential to each day as the first cup of coffee in the morning now for the first time ever these premier puzzles are available in six clever installments with each day of the week the puzzles increase gradually in skill level mondays the easiest but saturdays sure to challenge push your mental muscles a little harder each day with americas favorite sophisticated and fun pastime the new york times crossword puzzlesthe legendary eugene t maleska was crossword editor of the new york times from to'>
<Record title='Creatures of the Night (Second Edition)' description='two of literary comics modern masters pre

In [59]:
# Remember to close the driver when you're done
driver.close()