<a href="https://colab.research.google.com/github/richapatel93/NLP/blob/main/LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Latent Semantic Analysis over "drseuss.txt": TF-IDF features reduced with
# truncated SVD, then the highest-weighted words of each component printed.

# 1. Read the entire corpus into a single string
with open("drseuss.txt", "r", encoding="utf-8") as file:
    text = file.read()

# 2. Treat every chunk separated by a blank line as one document
documents = text.split('\n\n')

# 3. Build the TF-IDF document-term matrix (English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# 4. Fit a 5-component truncated SVD (LSA); the seed is fixed for repeatability
lsa = TruncatedSVD(n_components=5, random_state=42).fit(X)

# 5. Print the 10 highest-weighted words of every component.
#    argsort is ascending, so each line runs from the 10th-strongest word
#    up to the strongest one (the strongest word is printed last).
terms = vectorizer.get_feature_names_out()
for topic_no, weights in enumerate(lsa.components_, start=1):
    strongest = np.argsort(weights)[-10:]
    top_words = [terms[col] for col in strongest]
    print(f"Topic {topic_no}: {' | '.join(top_words)}")


Topic 1: mouse | house | fox | box | ham | eggs | green | eat | sam | like
Topic 2: chicks | say | sews | sue | mr | box | fox | sir | socks | knox
Topic 3: know | mother | did | oh | look | things | fish | hat | said | cat
Topic 4: shame | knox | ham | green | eggs | try | sam | like | mr | sir
Topic 5: rain | let | dark | tree | train | say | mr | car | sir | eat
