# Import Libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import tkinter as tk
from tkinter.scrolledtext import ScrolledText
from datasets import load_dataset

In [None]:
# data = {
#     'Title': [
#         'Uefa Opens Proceedings against Barcelona, Juventus in Super League Row',
#         'Amazon Blames Inflation as It Increases Cost of Prime Subscription',
#         'Nigeria’s Parliament Passes Amended Electoral Bill amid Controversy',
#         'Nigeria: Lagos Governor Tests Positive for Covid-19',
#         'South Africa Calls For Calm as Electoral Reform Talks Stall'
#     ],
#     'Excerpt': [
#         'Uefa has opened disciplinary proceedings against Barcelona, Juventus and Real Madrid over their involvement in the proposed breakaway Super League.',
#         'The increases are steeper than the 17 percent jump it implemented last year.',
#         "Nigeria's Senate on Tuesday passed the harmonised Electoral Bill amid controversy.",
#         'The Lagos State Governor, Mr. Babajide Sanwo-Olu, has tested positive for COVID-19.',
#         'South Africa has raised concerns about the deterioration of the political situation in Lesotho and called for calm.'
#     ],
#     'Category': ['sports', 'business', 'politics', 'health', 'politics']
# }

# Loading Dataset

In [None]:
# `load_dataset` is already imported in the imports cell at the top of the
# notebook, so it is not imported a second time here.
dataset = load_dataset("okite97/news-data")

# Convert the 'train' split straight to a pandas DataFrame instead of having
# pandas iterate over the dataset row by row.
df = dataset['train'].to_pandas()

# Optional: restrict the data to the 'health', 'sports', and 'business'
# categories by uncommenting the line below.
# df = df[df['Category'].isin(['health', 'sports', 'business'])]

# Data Cleaning

In [5]:
# Discard every article that lacks a headline or an excerpt, then preview
# the first rows of what is left.
required_text_columns = ['Title', 'Excerpt']
df = df.dropna(subset=required_text_columns)
print(df.head())

                                               Title  \
0  Uefa Opens Proceedings against Barcelona, Juve...   
1  Amazon Blames Inflation as It Increases Cost o...   
2  Nigeria’s Parliament Passes Amended Electoral ...   
3  Nigeria: Lagos Governor Tests Positive for Cov...   
4  South Africa Calls For Calm as Electoral Refor...   

                                             Excerpt  Category  
0  Uefa has opened disciplinary proceedings again...    sports  
1  The increases are steeper than the 17 percent ...  business  
2  Nigeria's Senate on Tuesday passed the harmoni...  politics  
3  The Lagos State Governor, Mr. Babajide Sanwo-O...    health  
4  South Africa has raised concerns about the det...  politics  


# Vectorize the Text

In [6]:
# Turn every article into a TF-IDF feature vector.
# The text of an article is its headline and its excerpt joined by one space.
vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
    min_df=2,
    max_df=0.5,
)
article_text = df['Title'] + ' ' + df['Excerpt']
tfidf_matrix = vectorizer.fit_transform(article_text)

# Cluster the Data

In [7]:
# Clustering (K-means)
n_clusters = 6  # Number of clusters

# `n_init` is set explicitly: leaving it out makes scikit-learn emit a
# FutureWarning (the default is changing from 10 to 'auto'), and the fitted
# clusters would then depend on the installed library version. 10 is the
# default that was in effect when this notebook was last run.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

  super()._check_params_vs_input(X, default_n_init=10)


# Predict Cluster

In [8]:
# Predict cluster for user input
def predict_cluster_and_category(input_text):
    """Predict the K-means cluster of a text and that cluster's category.

    The text is vectorized with the fitted module-level ``vectorizer`` and
    assigned to a cluster by the fitted module-level ``kmeans``. The category
    reported for the cluster is the most frequent ``'Category'`` value among
    the rows of ``df`` that K-means assigned to that same cluster during
    fitting.

    Parameters
    ----------
    input_text : str
        Raw text to classify.

    Returns
    -------
    tuple
        ``(cluster, category)`` where ``cluster`` is the predicted cluster id
        and ``category`` is the dominant category label of that cluster, or
        ``None`` if no labelled training row belongs to the cluster.

    Notes
    -----
    ``kmeans.labels_`` must line up row for row with ``df``, i.e. ``df`` must
    still be the frame whose text was used to fit ``kmeans``.
    """
    input_vector = vectorizer.transform([input_text])
    cluster = kmeans.predict(input_vector)[0]

    # A cluster id is NOT a row position: indexing `df.iloc[cluster]` would
    # just return the category of the row that happens to sit at that
    # position. Collect the categories of the training rows that were
    # actually assigned to this cluster instead.
    in_cluster = kmeans.labels_ == cluster
    member_categories = df.loc[in_cluster, 'Category']

    # `mode()` yields an empty Series when there is nothing to count; on a
    # tie the first value it returns is used.
    dominant = member_categories.mode()
    category = dominant.iloc[0] if not dominant.empty else None
    return cluster, category

# Predict the Cluster of User-Entered Text (Tkinter GUI)

In [9]:
# Main application window.
window = tk.Tk()
window.title("Document Clustering")
window.minsize(width=600, height=400)

# Row 0: scrollable text area where the user enters the document text.
text_box = ScrolledText(window)
text_box.grid(row=0, column=0, padx=5, pady=5, sticky="nsew")

# Rows 1-2: left-aligned labels that show the prediction results.
label_grid_options = {"column": 0, "padx": 5, "pady": 5, "sticky": "w"}

predicted_cluster_label = tk.Label(window, text="Predicted Cluster:")
predicted_cluster_label.grid(row=1, **label_grid_options)

predicted_category_label = tk.Label(window, text="Predicted Category:")
predicted_category_label.grid(row=2, **label_grid_options)

def process_input():
    """Predict the cluster and category of the text in the text box.

    Reads the text currently in ``text_box``, passes it to
    ``predict_cluster_and_category`` and writes the result to the two result
    labels as well as to standard output. If the box holds no text (or only
    whitespace) nothing is predicted; the labels show a placeholder instead.
    """
    # "end-1c" stops one character before the end of the widget content, which
    # leaves out the newline Tk always appends; strip() additionally removes
    # whitespace the user typed around the text.
    input_text = text_box.get("1.0", "end-1c").strip()

    if not input_text:
        # Nothing to vectorize: a prediction for empty text would be meaningless.
        predicted_cluster_label.config(text="Predicted Cluster: (no text entered)")
        predicted_category_label.config(text="Predicted Category: (no text entered)")
        return

    predicted_cluster, category = predict_cluster_and_category(input_text)
    predicted_cluster_label.config(text=f"Predicted Cluster: {predicted_cluster}")
    predicted_category_label.config(text=f"Predicted Category: {category}")
    print("Predicted Cluster:", predicted_cluster)
    print("Category Name:", category)

# Button in row 3 that runs `process_input` on the text currently entered.
btn = tk.Button(
    window,
    command=process_input,
    text="Cluster Documents",
)
btn.grid(row=3, column=0, padx=5, pady=5)

# Hand control to the Tk event loop; this call returns only once the window
# has been closed.
window.mainloop()

Predicted Cluster: 3
Category Name: health
Predicted Cluster: 3
Category Name: health
