# In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import matplotlib.pyplot as plt

# Sample data
# One flat list of 15 short texts. The three tasks below each take a
# positional slice of it, so the ORDER of the entries matters:
#   articles[:5]   -> Task 1 (LDA)
#   articles[5:10] -> Task 2 (K-Means)
#   articles[10:]  -> Task 3 (hierarchical clustering)
articles = [
    # --- indices 0-4: economy / government / climate sentences ---
    "The economy is growing rapidly, with new jobs being created every day.",
    "The government has announced a new policy to boost economic growth.",
    "The stock market is experiencing a downturn, with many investors losing money.",
    "A new study has found that climate change is having a significant impact on the environment.",
    "The government has launched a new initiative to reduce carbon emissions.",
    # --- indices 5-9: machine learning / AI / NLP sentences ---
    "This book is about machine learning and artificial intelligence.",
    "Machine learning is a field of artificial intelligence that involves training models on data.",
    "Artificial intelligence is a broad field that includes machine learning and natural language processing.",
    "Natural language processing is a subfield of artificial intelligence that deals with text and language.",
    "Machine learning models can be trained on large datasets to make predictions and classify data.",
    # --- indices 10-14: contract law / court case sentences ---
    "This document is about contract law and breach of contract.",
    "Contract law is a branch of law that deals with agreements between parties.",
    "Breach of contract occurs when one party fails to fulfill their obligations.",
    "A new case has been filed in court regarding a breach of contract.",
    "The court has ruled in favor of the plaintiff in the breach of contract case."
]

# Task 1: Latent Dirichlet Allocation (LDA)
# Fit a 5-topic LDA model on bag-of-words counts of the first five articles
# and print the highest-weighted words of every topic.
n_top_words = 5  # how many words to print per topic

vectorizer = CountVectorizer()
article_vectors = vectorizer.fit_transform(articles[:5])

# LDA inference starts from a random initialisation; pin the seed so the
# printed topics are the same on every run.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(article_vectors)

feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so reverse it and keep the first n_top_words
    # indices to get the heaviest words in descending order of weight.
    top_word_ids = topic.argsort()[::-1][:n_top_words]
    print(f"Topic {topic_idx}:")
    print(" ".join(feature_names[i] for i in top_word_ids))

# Task 2: K-Means Clustering
# Cluster articles 5-9 into three groups using their TF-IDF vectors and
# print the cluster index assigned to each of them.
vectorizer = TfidfVectorizer()
book_vectors = vectorizer.fit_transform(articles[5:10])

# K-Means picks its initial centroids at random, so fix the seed to get
# reproducible labels. n_init is set explicitly because its default differs
# between scikit-learn releases (and warns on some of them).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(book_vectors)

print("Cluster labels:")
print(kmeans.labels_)

# Task 3: Hierarchical Clustering
# Build a Ward-linkage hierarchy over the TF-IDF vectors of articles 10
# onwards and draw it as a dendrogram.

# Create the vectorizer here instead of refitting whatever `vectorizer`
# happens to be bound to: when sections are run out of order that name may
# hold the CountVectorizer from the LDA section, which would silently switch
# the features from TF-IDF weights to raw counts.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(articles[10:])

# linkage() is given a dense array, hence the .toarray() conversion.
Z = linkage(document_vectors.toarray(), method='ward')

plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.show()
