In [1]:
from nltk.util import ngrams
from collections import Counter
import nltk

nltk.download('punkt')

# Sample corpus
corpus = [
    "the quick brown fox",
    "the slow brown dog",
    "the quick red dog",
    "the lazy yellow fox"
]

# Tokenize and build n-grams
def build_ngrams(corpus, n):
    all_ngrams = []
    for sentence in corpus:
        tokens = nltk.word_tokenize(sentence.lower())
        n_grams = list(ngrams(tokens, n))
        all_ngrams.extend(n_grams)
    return Counter(all_ngrams)

# Unigram, Bigram, Trigram
unigrams = build_ngrams(corpus, 1)
bigrams = build_ngrams(corpus, 2)
trigrams = build_ngrams(corpus, 3)

print("Unigrams:\n", unigrams)
print("\nBigrams:\n", bigrams)
print("\nTrigrams:\n", trigrams)

Unigrams:
 Counter({('the',): 4, ('quick',): 2, ('brown',): 2, ('fox',): 2, ('dog',): 2, ('slow',): 1, ('red',): 1, ('lazy',): 1, ('yellow',): 1})

Bigrams:
 Counter({('the', 'quick'): 2, ('quick', 'brown'): 1, ('brown', 'fox'): 1, ('the', 'slow'): 1, ('slow', 'brown'): 1, ('brown', 'dog'): 1, ('quick', 'red'): 1, ('red', 'dog'): 1, ('the', 'lazy'): 1, ('lazy', 'yellow'): 1, ('yellow', 'fox'): 1})

Trigrams:
 Counter({('the', 'quick', 'brown'): 1, ('quick', 'brown', 'fox'): 1, ('the', 'slow', 'brown'): 1, ('slow', 'brown', 'dog'): 1, ('the', 'quick', 'red'): 1, ('quick', 'red', 'dog'): 1, ('the', 'lazy', 'yellow'): 1, ('lazy', 'yellow', 'fox'): 1})


[nltk_data] Downloading package punkt to C:\Users\attar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

docs = [
    "the quick brown fox",
    "the slow brown dog",
    "the quick red dog",
    "the lazy yellow fox"
]

# ---------- LDA ----------
count_vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(docs)

lda_model = LatentDirichletAllocation(n_components=2, random_state=42)
lda_topics = lda_model.fit_transform(doc_term_matrix)

print("\nLDA Topics:")
terms = count_vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda_model.components_):
    top_words = [terms[i] for i in topic.argsort()[-3:]]
    print(f"Topic {idx+1}: {', '.join(top_words)}")

# ---------- LSA ----------
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

lsa_model = TruncatedSVD(n_components=2, random_state=42)
lsa_topics = lsa_model.fit_transform(tfidf_matrix)

print("\nLSA Topics:")
terms = tfidf_vectorizer.get_feature_names_out()
for idx, component in enumerate(lsa_model.components_):
    top_words = [terms[i] for i in component.argsort()[-3:]]
    print(f"Topic {idx+1}: {', '.join(top_words)}")



LDA Topics:
Topic 1: brown, quick, dog
Topic 2: yellow, lazy, fox

LSA Topics:
Topic 1: dog, quick, brown
Topic 2: lazy, yellow, fox
