<a href="https://colab.research.google.com/github/saishshinde15/NLP/blob/main/Latent__Semantic__Analysis__CountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# download the book-titles corpus; -nc leaves an already-downloaded copy in place
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt

--2024-05-08 13:23:02--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/all_book_titles.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 127992 (125K) [text/plain]
Saving to: ‘all_book_titles.txt’


2024-05-08 13:23:03 (11.3 MB/s) - ‘all_book_titles.txt’ saved [127992/127992]



In [2]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
# resources needed below: tokenizer model, English stopword list, WordNet
nltk.download('punkt')
# newer NLTK releases (3.9+) load word_tokenize's model from 'punkt_tab'
# instead of 'punkt'; fetch both so the tokenizer works on old and new versions
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# single shared lemmatizer instance; my_tokenizer uses it to reduce tokens to base form
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
# read one title per line, stripping trailing whitespace/newlines;
# the context manager closes the file handle, which a bare open() never did
with open('all_book_titles.txt') as f:
  titles = [line.rstrip() for line in f]

In [6]:
# NLTK's English stopword list as a set, for fast membership tests in my_tokenizer
stops = set(stopwords.words('english'))

In [7]:
# domain-specific stopwords: words that are common in book titles
# (e.g. "edition", "volume") and should be ignored like ordinary stopwords
stops = stops | {
  'access', 'application', 'approach', 'brief', 'card',
  'edition', 'essential', 'etext', 'fourth', 'fundamental',
  'guide', 'introduction', 'package', 'plus', 'printed',
  'second', 'series', 'third', 'vol', 'volume',
}

In [8]:
def my_tokenizer(s):
  """Turn a raw title string into a list of cleaned, lemmatized tokens.

  Steps, in order: lowercase, tokenize with NLTK's word_tokenize, drop
  tokens of length <= 2, drop stopwords, lemmatize, drop stopwords again,
  and finally drop any token containing a digit.

  Relies on the notebook-level `wordnet_lemmatizer` and `stops`.

  Args:
    s: the text to tokenize.

  Returns:
    A list of token strings (possibly empty).
  """
  # downcase
  s = s.lower()

  # split string into words (tokens)
  tokens = nltk.tokenize.word_tokenize(s)

  # remove short words, they're probably not useful
  tokens = [t for t in tokens if len(t) > 2]

  # remove stopwords BEFORE lemmatizing: the WordNet lemmatizer mangles some
  # of them ("was" -> "wa", "has" -> "ha"), and the mangled forms are not in
  # `stops`, so filtering only afterwards lets them leak into the vocabulary
  tokens = [t for t in tokens if t not in stops]

  # put words into base form
  tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]

  # remove stopwords again: the domain-specific entries are singular, so
  # plurals such as "applications" only match once they are lemmatized
  tokens = [t for t in tokens if t not in stops]

  # remove any digits, i.e. "3rd edition"
  tokens = [t for t in tokens if not any(c.isdigit() for c in t)]

  return tokens

In [9]:
# binary=True records whether a term appears in a title rather than how often.
# token_pattern=None: the default regex is ignored whenever a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer = CountVectorizer(binary=True, tokenizer=my_tokenizer, token_pattern=None)

In [10]:
# learn the vocabulary from the titles and build the sparse matrix
# with one row per title (document) and one column per term
X = vectorizer.fit_transform(titles)



In [11]:
# create index -> word map for plotting later

# conceptually what we want to do:
# index_word_map = [None] * len(vectorizer.vocabulary_)
# for word, index in vectorizer.vocabulary_.items():
#   index_word_map[index] = word

# but the fitted CountVectorizer already exposes exactly that array:
# position i holds the word for feature (column) i
index_word_map = vectorizer.get_feature_names_out()

In [12]:
# build the term-document matrix (rows = terms, cols = documents) straight
# from the fitted vectorizer; unlike `X = X.T`, re-running this cell cannot
# flip the matrix back to documents x terms
X = vectorizer.transform(titles).T

In [13]:
# project every term (row of X) onto 2 latent dimensions for plotting.
# n_components=2 is the default but is spelled out because the plot reads
# exactly Z[:,0] and Z[:,1]; the default solver is randomized, so the seed
# is fixed to make the embedding reproducible across runs.
svd = TruncatedSVD(n_components=2, random_state=0)
Z = svd.fit_transform(X)

In [14]:
!pip install plotly



In [15]:
import plotly.express as px

In [16]:
# scatter the terms in the 2-D SVD space, labelling each point with its word
fig = px.scatter(x=Z[:,0], y=Z[:,1], text=index_word_map, size_max=60)
fig.update_traces(textposition='top center')
# a figure should stand on its own: give it a title and label both axes
fig.update_layout(
  title='Book-title vocabulary projected onto 2 SVD components',
  xaxis_title='SVD component 1',
  yaxis_title='SVD component 2')
fig.show()