<a href="https://colab.research.google.com/github/sheemapatel/nlp--/blob/main/24_10_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import wordnet
from nltk.util import ngrams
import nltk
# Fetch the WordNet corpus; wordnet.synsets() is queried further down in this file.
nltk.download('wordnet')

# Documents fed to the LDA topic model (one document per string).
corpus = [
    "economic stability is key to global peace and trade policy",
    "market analysis reveals new trade deals boost the economy",
    "historical accounts of the war show political instability and conflict",
    "military technology advances increase the danger of large scale conflict and war",
    "financial reports predict market downturn and instability worldwide",
]

# The two sentences compared with Jaccard similarity.
documents = [
    "The market analysis predicted a major downturn in financial reports.",
    "A major economic downturn in the global market was predicted by analysis.",
]

# Lower-cased copies of both documents, so token comparison ignores case.
doc1_text, doc2_text = (document.lower() for document in documents)

# Words removed by tokenize_and_filter before any comparison is made.
stopwords = {'a', 'the', 'in', 'of', 'and', 'by', 'was'}

def tokenize_and_filter(text, stop_words=None):
    """Split *text* into tokens, strip edge punctuation, and drop stop words.

    Splitting on whitespace alone leaves sentence punctuation attached to
    words ("reports." vs "reports"), so otherwise identical words fail to
    match when token sets are compared. Leading and trailing punctuation is
    therefore stripped from every token, and tokens that become empty
    (e.g. a lone "--") are discarded.

    Args:
        text: The string to tokenize. No case folding is done here; callers
            are expected to pass text in the case they want compared.
        stop_words: Optional collection of tokens to exclude. When omitted,
            the module-level ``stopwords`` set is used.

    Returns:
        A list of the remaining tokens in their original order.
    """
    import string  # local import: only this function needs it

    if stop_words is None:
        stop_words = stopwords

    cleaned = (token.strip(string.punctuation) for token in text.split())
    return [token for token in cleaned if token and token not in stop_words]


def run_lda_and_display(corpus, n_topics_list, n_top_words=5):
    """Fit one LDA model per requested topic count and collect each topic's top words.

    Despite the name, this function prints nothing; it only returns the
    results so the caller can format them.

    The corpus is vectorized once with a bag-of-words ``CountVectorizer``
    (English stop words removed) and the same document-term matrix is
    reused for every topic count.

    Args:
        corpus: Iterable of raw text documents.
        n_topics_list: Iterable of topic counts (``n_components``) to try.
        n_top_words: How many of the highest-weighted words to keep per
            topic. Defaults to 5.

    Returns:
        A dict mapping each topic count to a list of topics, where each
        topic is a list of words ordered from highest to lowest weight.

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    vectorizer = CountVectorizer(stop_words='english')
    data_vectorized = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    lda_results = {}
    for n_topics in n_topics_list:
        # Fixed seed so repeated runs give the same topics.
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=10)
        lda.fit(data_vectorized)

        # argsort is ascending; reverse it and keep the first n_top_words
        # indices to get the heaviest words first.
        lda_results[n_topics] = [
            [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
            for topic in lda.components_
        ]

    return lda_results

# Fit LDA with 3, 5 and 8 topics so the resulting topic lists can be compared.
lda_output = run_lda_and_display(corpus, [3, 5, 8])

# Header emoji restored: it had been mis-decoded (UTF-8 bytes read as cp1252).
print("## Task 1: Topic Number Tuning (LDA) Results 📊")
for k, topics in lda_output.items():
    print(f"\n### LDA with K = {k} Topics")
    for i, words in enumerate(topics):
        # Topics are numbered from 1 in the report.
        print(f"Topic {i+1}: {', '.join(words)}")



# Look up the keyword in WordNet, restricted to noun senses, and take the
# first synset returned as the sense to explore.
keyword = "war"
synsets = wordnet.synsets(keyword, pos=wordnet.NOUN)
target_synset = synsets[0]

# Flatten the lemma names of every directly related synset:
# hypernyms are the broader concepts, hyponyms the narrower ones.
hypernyms = [lemma.name() for hyper in target_synset.hypernyms() for lemma in hyper.lemmas()]
hyponyms = [lemma.name() for hypo in target_synset.hyponyms() for lemma in hypo.lemmas()]

print("\n" + "-"*50)
# Header emoji restored: it had been mis-decoded (UTF-8 bytes read as cp1252).
print("## Task 2: WordNet Hypernyms & Hyponyms 🌲")
print(f"Keyword: **{keyword}** ({target_synset.name()})")
# set() removes duplicate lemma names before printing.
print(f"Hypernyms (Broader): {set(hypernyms)}")
print(f"Hyponyms (Narrower): {set(hyponyms)}")
def calculate_jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    When both sets are empty the union is empty too, and 0.0 is returned
    instead of dividing by zero.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if not combined:
        return 0.0
    return len(shared) / len(combined)

# Distinct filtered tokens of each document, used for the unigram comparison.
doc1_unigrams, doc2_unigrams = (
    set(tokenize_and_filter(doc_text)) for doc_text in (doc1_text, doc2_text)
)

def get_bigram_set(text):
    """Return the set of distinct adjacent token pairs found in *text*.

    The text is first passed through tokenize_and_filter, so pairs are
    formed over the filtered token sequence.
    """
    return set(ngrams(tokenize_and_filter(text), 2))

# Distinct adjacent token pairs of each document.
doc1_bigrams = get_bigram_set(doc1_text)
doc2_bigrams = get_bigram_set(doc2_text)

# Compare the documents at both granularities: single words and word pairs.
jaccard_unigram = calculate_jaccard_similarity(doc1_unigrams, doc2_unigrams)
jaccard_bigram = calculate_jaccard_similarity(doc1_bigrams, doc2_bigrams)

print("\n" + "-"*50)
# Header emoji restored: it had been mis-decoded (UTF-8 bytes read as cp1252).
print("## Task 3: Jaccard Similarity with Bigrams 🔗")
print(f"Document 1 Unigrams: {doc1_unigrams}")
print(f"Document 2 Unigrams: {doc2_unigrams}")
print(f"Unigram-based Jaccard Similarity: **{jaccard_unigram:.4f}**")
print("---")
print(f"Document 1 Bigrams: {doc1_bigrams}")
print(f"Document 2 Bigrams: {doc2_bigrams}")
print(f"Bigram-based Jaccard Similarity: **{jaccard_bigram:.4f}**")

[nltk_data] Downloading package wordnet to /root/nltk_data...


## Task 1: Topic Number Tuning (LDA) Results 📊

### LDA with K = 3 Topics
Topic 1: war, conflict, scale, technology, military
Topic 2: trade, market, new, reveals, deals
Topic 3: instability, market, predict, worldwide, financial

### LDA with K = 5 Topics
Topic 1: technology, scale, military, large, danger
Topic 2: trade, war, conflict, instability, market
Topic 3: market, instability, predict, worldwide, financial
Topic 4: war, conflict, instability, political, historical
Topic 5: trade, reveals, new, economy, deals

### LDA with K = 8 Topics
Topic 1: trade, stability, policy, peace, global
Topic 2: instability, war, conflict, market, trade
Topic 3: instability, war, conflict, market, trade
Topic 4: war, conflict, instability, political, historical
Topic 5: reveals, new, analysis, economy, deals
Topic 6: instability, war, conflict, market, trade
Topic 7: technology, scale, military, large, danger
Topic 8: market, worldwide, predict, reports, downturn

-----------------------------