In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Fetch the NLTK resources this notebook relies on (tokenizer models,
# English stopword list, WordNet for lemmatization). Already-present
# packages are reported as up-to-date and skipped.
nltk.download('punkt')
# Newer NLTK releases (3.9+) load the tokenizer models from 'punkt_tab'
# rather than 'punkt'; fetch both so word_tokenize works on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Lemmatizer shared by the tokenizer defined further down.
wordnet_lemmatizer = WordNetLemmatizer()

# Read one title per line, stripping trailing whitespace/newlines.
# The context manager guarantees the file handle is closed after reading.
# NOTE(review): no encoding is specified, so the platform default applies —
# confirm it matches how all_book_titles.txt was saved.
with open('all_book_titles.txt') as title_file:
    titles = [line.rstrip() for line in title_file]

# Standard English stopwords plus words that are common in book titles
# but carry no topical meaning (edition markers, packaging terms, ...).
stops = set(stopwords.words('english'))
stops = stops.union({
    'introduction','edition','series','application','approach','card','access','package','plus',
    'etext','brief','vol','fundamental','guide','essential','printed','third','second','fourth','volume'
})

In [5]:
def custom_tokenizer(s):
    """Convert a raw title string into a list of cleaned, lemmatized tokens.

    Steps, applied per token after lowercasing and word-tokenizing ``s``:
      1. drop tokens of length <= 2,
      2. drop tokens that are stopwords in their surface form,
      3. lemmatize with the module-level ``wordnet_lemmatizer``,
      4. drop tokens whose lemma is a stopword,
      5. drop tokens containing any digit.

    Parameters
    ----------
    s : str
        The text to tokenize.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order.
    """
    tokens = []
    for word in nltk.tokenize.word_tokenize(s.lower()):
        # remove short words
        if len(word) <= 2:
            continue
        # The stopword list holds surface forms; lemmatization can rewrite a
        # stopword into a string that is no longer in the list, so filter
        # once before lemmatizing ...
        if word in stops:
            continue
        # put word in base form
        lemma = wordnet_lemmatizer.lemmatize(word)
        # ... and once after, so inflected forms of listed words
        # (whose lemma is in the list) are removed too.
        if lemma in stops:
            continue
        # remove anything containing a digit
        if any(c.isdigit() for c in lemma):
            continue
        tokens.append(lemma)
    return tokens

In [6]:
# Binary bag-of-words over the titles: each entry records whether a term
# occurs in a title at all, not how often.
# token_pattern=None states explicitly that the default regex is unused
# because a custom tokenizer is supplied; this avoids scikit-learn's
# "token_pattern will not be used" warning.
vecorizer = CountVectorizer(binary=True, tokenizer=custom_tokenizer, token_pattern=None)
X = vecorizer.fit_transform(titles)

In [7]:
# Vocabulary terms learned by the vectorizer, as returned by
# get_feature_names_out(); used later as the text labels of the scatter plot.
index_word_map = vecorizer.get_feature_names_out()

In [8]:
# Transpose X so that rows are terms and columns are documents (titles).
# NOTE: this rebinds X in place, so the cell is not idempotent — running it
# a second time transposes X back. Re-run the vectorizer cell before
# re-executing this one.
X = X.T

In [9]:
# Reduce each row of X to two latent components for plotting.
# n_components=2 is the library default, spelled out because the plot below
# reads exactly Z[:,0] and Z[:,1].
# random_state pins the randomized solver so a fresh kernel run reproduces
# the same coordinates.
svd = TruncatedSVD(n_components=2, random_state=42)
Z = svd.fit_transform(X)

In [13]:
import plotly.express as px

In [15]:
# Scatter every row of Z at its two SVD coordinates, labelled with the
# corresponding entry of index_word_map.
fig = px.scatter(x=Z[:, 0], y=Z[:, 1], text=index_word_map)
# Place each label above its marker so the text does not cover the point.
fig.update_traces(textposition='top center')
# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='Book-title terms projected onto two SVD components',
    xaxis_title='SVD component 1',
    yaxis_title='SVD component 2',
)
fig.show()