In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import tkinter as tk
from tkinter.scrolledtext import ScrolledText
from datasets import load_dataset

# Sample data: five hand-written articles, listed one (title, excerpt,
# category) record at a time and pivoted into a column-oriented dict with
# 'Title', 'Excerpt' and 'Category' keys (each mapping to a list of five
# strings). Nothing in the visible cells reads this dict afterwards.
data = {
    column: list(values)
    for column, values in zip(
        ('Title', 'Excerpt', 'Category'),
        zip(*[
            (
                'Uefa Opens Proceedings against Barcelona, Juventus in Super League Row',
                'Uefa has opened disciplinary proceedings against Barcelona, Juventus and Real Madrid over their involvement in the proposed breakaway Super League.',
                'sports',
            ),
            (
                'Amazon Blames Inflation as It Increases Cost of Prime Subscription',
                'The increases are steeper than the 17 percent jump it implemented last year.',
                'business',
            ),
            (
                'Nigeria’s Parliament Passes Amended Electoral Bill amid Controversy',
                "Nigeria's Senate on Tuesday passed the harmonised Electoral Bill amid controversy.",
                'politics',
            ),
            (
                'Nigeria: Lagos Governor Tests Positive for Covid-19',
                'The Lagos State Governor, Mr. Babajide Sanwo-Olu, has tested positive for COVID-19.',
                'health',
            ),
            (
                'South Africa Calls For Calm as Electoral Reform Talks Stall',
                'South Africa has raised concerns about the deterioration of the political situation in Lesotho and called for calm.',
                'politics',
            ),
        ]),
    )
}

# Load the news corpus through `datasets.load_dataset`.
dataset = load_dataset("okite97/news-data")

# Build the working frame in one chain:
#   1. materialise the 'train' split as a DataFrame,
#   2. keep only the 'health', 'sports' and 'business' categories,
#   3. drop rows with missing values in 'Title' or 'Excerpt'.
# The original row labels are kept (the index is not reset).
df = (
    pd.DataFrame(dataset['train'])
    .loc[lambda frame: frame['Category'].isin(['health', 'sports', 'business'])]
    .dropna(subset=['Title', 'Excerpt'])
)
print(df.head())

# TF-IDF vectorization of each article's title and excerpt, joined with a
# single space into one document string per row.
# max_df=0.5 ignores terms that occur in more than half of the documents,
# min_df=2 ignores terms that occur in fewer than two, and English stop
# words are removed.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)
tfidf_matrix = vectorizer.fit_transform(df['Title'] + ' ' + df['Excerpt'])

# Clustering (K-means)
n_clusters = 3  # Number of clusters
# n_init is passed explicitly: relying on the implicit default makes
# scikit-learn emit a warning (visible in this cell's captured output) and
# the default differs between releases, which would change the fitted
# clusters. 10 is the value the warning reports as the current default.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

# Predict cluster for user input
def predict_cluster(input_text):
    """Return the K-means cluster label assigned to ``input_text``.

    The text is wrapped in a one-element list, vectorised with the
    module-level ``vectorizer`` and labelled by the module-level ``kmeans``;
    the single resulting label is returned.
    """
    labels = kmeans.predict(vectorizer.transform([input_text]))
    return labels[0]

# Sample user input: run one query through the fitted pipeline and report
# the cluster it lands in.
user_input = "Barcelona and Juventus disciplinary proceedings"
predicted_cluster = predict_cluster(user_input)
print("Predicted Cluster:", predicted_cluster)

# Display documents in the predicted cluster. `kmeans.labels_` holds one
# label per row that the model was fitted on, so comparing it with the
# predicted label gives a positional boolean mask over `df`.
clustered_documents = df.loc[kmeans.labels_ == predicted_cluster]
print(clustered_documents)


Found cached dataset csv (/Users/sanjokdangol/.cache/huggingface/datasets/okite97___csv/okite97--news-data-40e98cfd881f0955/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

                                               Title  \
0  Uefa Opens Proceedings against Barcelona, Juve...   
1  Amazon Blames Inflation as It Increases Cost o...   
3  Nigeria: Lagos Governor Tests Positive for Cov...   
5  Guardiola To Leave Man City When Contract Expi...   
9  Premier League Clubs Reject ‘Project Big Picture’   

                                             Excerpt  Category  
0  Uefa has opened disciplinary proceedings again...    sports  
1  The increases are steeper than the 17 percent ...  business  
3  The Lagos State Governor, Mr. Babajide Sanwo-O...    health  
5  Pep Guardiola has said that he will leave Manc...    sports  
9  Premier League clubs have "unanimously agreed"...    sports  
Predicted Cluster: 0
                                                  Title  \
1     Amazon Blames Inflation as It Increases Cost o...   
11    Old Trafford Modified for 23,500 Socially Dist...   
12           CBN Eyes $200bn from Fresh Non-Oil Exports   
15    Buhari Ret

  super()._check_params_vs_input(X, default_n_init=10)


In [17]:
# Function to predict cluster and category
def predict_cluster_and_category(input_text, text_vectorizer=None, cluster_model=None, documents=None):
    """Predict the cluster of ``input_text`` and the category that dominates it.

    The previous implementation looked up ``df.iloc[cluster]``, i.e. it used
    the cluster id (0, 1, 2, ...) as a row position and returned the category
    of that unrelated row. The category is now the most frequent 'Category'
    among the documents the model assigned to the predicted cluster.

    Args:
        input_text: Raw text to classify.
        text_vectorizer: Object with ``transform(list_of_texts)``; defaults to
            the module-level ``vectorizer``.
        cluster_model: Fitted model exposing ``predict`` and ``labels_``;
            defaults to the module-level ``kmeans``.
        documents: DataFrame with a 'Category' column whose rows line up
            positionally with ``cluster_model.labels_``; defaults to the
            module-level ``df``.

    Returns:
        ``(cluster, category)``. ``category`` is ``None`` when no document
        carries the predicted cluster label. When several categories tie for
        most frequent, the first one in sorted order is returned.
    """
    # Fall back to the notebook globals only for arguments not supplied.
    text_vectorizer = vectorizer if text_vectorizer is None else text_vectorizer
    cluster_model = kmeans if cluster_model is None else cluster_model
    documents = df if documents is None else documents

    input_vector = text_vectorizer.transform([input_text])
    cluster = cluster_model.predict(input_vector)[0]

    # Boolean mask over the fitted documents that belong to this cluster.
    member_categories = documents.loc[cluster_model.labels_ == cluster, 'Category']
    if member_categories.empty:
        return cluster, None
    # Series.mode() returns the most frequent value(s), sorted.
    return cluster, member_categories.mode().iloc[0]

In [18]:
# Tkinter GUI: a text box for the article text plus two labels that show the
# predicted cluster and category.
window = tk.Tk()
window.title("Document Clustering")
window.minsize(600, 400)

# Give row 0 / column 0 (the text box cell) all surplus space. Grid cells
# have weight 0 by default, so without this the sticky="nsew" below has no
# extra room to stretch into and the text box stays at its requested size
# when the window is resized.
window.columnconfigure(0, weight=1)
window.rowconfigure(0, weight=1)

text_box = ScrolledText(window)
text_box.grid(row=0, column=0, padx=5, pady=5, sticky="nsew")

predicted_cluster_label = tk.Label(window, text="Predicted Cluster:")
predicted_cluster_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")

predicted_category_label = tk.Label(window, text="Predicted Category:")
predicted_category_label.grid(row=2, column=0, padx=5, pady=5, sticky="w")

def process_input():
    """Button callback: classify the text box contents and show the result.

    Reads the text box, predicts its cluster and category with
    ``predict_cluster_and_category``, writes both into the two result labels
    and echoes them to stdout. Blank input resets the labels instead of
    reporting a prediction.
    """
    # "end-1c" leaves out the newline the Text widget always appends; strip()
    # additionally removes surrounding whitespace the user may have typed.
    input_text = text_box.get("1.0", "end-1c").strip()

    if not input_text:
        # Blank text would vectorise to all zeros, yet the model would still
        # assign it some cluster; show no prediction rather than a
        # meaningless one.
        predicted_cluster_label.config(text="Predicted Cluster:")
        predicted_category_label.config(text="Predicted Category:")
        return

    predicted_cluster, category = predict_cluster_and_category(input_text)
    predicted_cluster_label.config(text=f"Predicted Cluster: {predicted_cluster}")
    predicted_category_label.config(text=f"Predicted Category: {category}")
    print("Predicted Cluster:", predicted_cluster)
    print("Category Name:", category)

# Button that runs process_input on the current text box contents, placed on
# the row below the two result labels.
btn = tk.Button(
    window,
    text="Cluster Documents",
    command=process_input,
)
btn.grid(row=3, column=0, padx=5, pady=5)

# Enter Tk's event loop.
window.mainloop()

Predicted Cluster: 0
Category Name: sports


In [None]:


from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

# Generate sample data with varying densities: 1000 points around 3 centres
# whose standard deviations are 1.0, 2.0 and 3.0.
X, _ = make_blobs(n_samples=1000, centers=3, cluster_std=[1.0, 2.0, 3.0], random_state=42)

# Perform KMeans clustering.
# The estimator is deliberately NOT named `kmeans`: predict_cluster() and
# predict_cluster_and_category() look that global up at call time and expect
# the model fitted on the TF-IDF matrix. Rebinding it to this blob model
# would break them once this cell has run.
# n_init is explicit so the result does not depend on the scikit-learn
# version's default.
blob_kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = blob_kmeans.fit_predict(X)

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

# Plot results: one scatter panel per algorithm, side by side on a single
# 12x5 figure, each point coloured by the label its algorithm assigned.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for ax, labels, title in zip(
    axes,
    (kmeans_labels, dbscan_labels),
    ('KMeans Clustering', 'DBSCAN Clustering'),
):
    ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()


In [None]:
# Create sample data: 1000 points around 3 centres with standard deviations
# 1.0, 2.0 and 3.0.
X, _ = make_blobs(
    n_samples=1000,
    centers=3,
    cluster_std=[1.0, 2.0, 3.0],
    random_state=42,
)

# Perform DBSCAN clustering and read the per-point labels off the fitted
# estimator.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit(X).labels_

# Plot results: a single scatter coloured by DBSCAN label.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis', s=50, alpha=0.7)
ax.set_title('DBSCAN Clustering')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.show()