# TruncatedSVD (LSA)


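TruncatedSVD reduces high-dimensional, sparse TF-IDF vectors to a small number of latent semantic dimensions ("concepts" or "topics"). When applied to a term-document matrix like this, the technique is known as Latent Semantic Analysis (LSA).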

Useful for: clustering, visualization, topic modeling, and more.





# Imports

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Dummy Dataset

In [4]:
# Toy corpus: ten short sentences about machine learning, NLP and AI.
# Written one sentence per line and split into a list of strings.
documents = """
Machine learning is amazing
Deep learning and neural networks are part of machine learning
Natural language processing uses machine learning techniques
AI includes machine learning and deep learning
Language models are used in NLP
NLP stands for natural language processing
Transformers are powerful models for NLP
Deep learning is a subfield of machine learning
AI is the future of technology
Technology is evolving with AI and ML
""".strip().splitlines()
documents

['Machine learning is amazing',
 'Deep learning and neural networks are part of machine learning',
 'Natural language processing uses machine learning techniques',
 'AI includes machine learning and deep learning',
 'Language models are used in NLP',
 'NLP stands for natural language processing',
 'Transformers are powerful models for NLP',
 'Deep learning is a subfield of machine learning',
 'AI is the future of technology',
 'Technology is evolving with AI and ML']

# Vectorization

In [5]:
# Build the TF-IDF model with scikit-learn's built-in English stop-word
# list, then project the corpus into the learned term space.
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit(documents)                   # learn vocabulary and IDF weights
X_tfidf = vectorizer.transform(documents)   # document-term TF-IDF matrix

# Apply TruncatedSVD (LSA)

In [8]:
# Reduce the TF-IDF matrix to 3 latent topics/components.
# TruncatedSVD uses a randomized solver by default, so pin random_state to
# make the component weights reproducible across Restart & Run All.
svd = TruncatedSVD(n_components=3, random_state=42)
X_lsa = svd.fit_transform(X_tfidf)

# Display Results

In [10]:
# Feature names are paired positionally with the entries of each row of
# svd.components_.
terms = vectorizer.get_feature_names_out()

for i, loadings in enumerate(svd.components_):
    # Rank (term, loading) pairs from the highest loading to the lowest;
    # only the five strongest are printed for each component.
    ranked = sorted(zip(terms, loadings), key=lambda pair: pair[1], reverse=True)
    print(f"\nTop terms in component {i + 1}:")
    for term, weight in ranked[:5]:
        print(f"{term} ({weight:.4f})")



Top terms in component 1:
learning (0.7055)
machine (0.4330)
deep (0.3413)
amazing (0.1853)
subfield (0.1677)

Top terms in component 2:
nlp (0.4904)
language (0.4465)
models (0.3670)
natural (0.2955)
processing (0.2955)

Top terms in component 3:
technology (0.5796)
ai (0.5412)
future (0.3727)
evolving (0.3091)
ml (0.3091)
