# In [None]:
# Import Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import tkinter as tk
from tkinter.scrolledtext import ScrolledText

# Load only the columns the pipeline needs: the label and the two text fields.
df = pd.read_csv('./data/NewsCategorizer.csv', usecols=['category','headline','short_description'])

# A row without a headline has no primary text to cluster on.
df = df.dropna(subset=['headline'])

print(df.head())

# Build one document per row. A missing short_description would turn the
# concatenation into NaN, which TfidfVectorizer rejects, so treat it as empty.
documents = df['headline'] + ' ' + df['short_description'].fillna('')

# TF-IDF vectorization: ignore terms appearing in more than half of the
# documents or in fewer than two, and drop English stop words.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)
tfidf_matrix = vectorizer.fit_transform(documents)

n_clusters = 3  # Number of clusters
# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Predict cluster for user input
def predict_cluster_and_category(input_text):
    """Assign *input_text* to a K-Means cluster and name that cluster's category.

    The text is projected into the fitted TF-IDF space and passed to the
    fitted K-Means model. The reported category is the most frequent
    ``category`` value among the training rows that K-Means placed in the
    predicted cluster.

    Args:
        input_text: Raw text to classify.

    Returns:
        A ``(cluster, category)`` tuple: the cluster index as a plain ``int``
        and the dominant category label of that cluster, or ``None`` if the
        cluster has no labelled rows.
    """
    input_vector = vectorizer.transform([input_text])
    cluster = int(kmeans.predict(input_vector)[0])

    # kmeans.labels_ holds one entry per fitted document, in the same order as
    # df's rows, so a boolean mask picks out the rows belonging to this
    # cluster. Indexing df by the cluster id itself would just return the
    # category of row 0, 1 or 2.
    member_categories = df.loc[kmeans.labels_ == cluster, 'category']
    most_common = member_categories.mode()
    category = most_common.iloc[0] if not most_common.empty else None
    return cluster, category

# Main application window.
window = tk.Tk()
window.title("Document Clustering")
window.minsize(600, 400)

# Give the text box's grid cell all spare space; without a non-zero weight
# the sticky="nsew" below has nothing to stretch into on resize.
window.columnconfigure(0, weight=1)
window.rowconfigure(0, weight=1)

# Scrollable input area for the text to classify.
text_box = ScrolledText(window)
text_box.grid(row=0, column=0, padx=5, pady=5, sticky="nsew")

# Result labels; process_input() rewrites their text after each prediction.
predicted_cluster_label = tk.Label(window, text="Predicted Cluster:")
predicted_cluster_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")

predicted_category_label = tk.Label(window, text="Predicted Category:")
predicted_category_label.grid(row=2, column=0, padx=5, pady=5, sticky="w")

def process_input():
    """Classify the text currently in the text box and display the result.

    Reads the text box, runs ``predict_cluster_and_category`` on its content,
    writes the predicted cluster and category into the two result labels and
    echoes them to stdout. If the box contains only whitespace, the labels
    are reset to their initial text and no prediction is made.
    """
    # "end-1c" excludes the newline Tk always appends to a Text widget's
    # content; strip() removes any remaining surrounding whitespace.
    input_text = text_box.get("1.0", "end-1c").strip()
    if not input_text:
        # Nothing to classify: an empty document would still be assigned to
        # some cluster, which would be misleading.
        predicted_cluster_label.config(text="Predicted Cluster:")
        predicted_category_label.config(text="Predicted Category:")
        return

    print(input_text)
    predicted_cluster, category = predict_cluster_and_category(input_text)
    predicted_cluster_label.config(text=f"Predicted Cluster: {predicted_cluster}")
    predicted_category_label.config(text=f"Predicted Category: {category}")
    print("Predicted Cluster:", predicted_cluster)
    print("Category Name:", category)

# Trigger button: runs process_input() on the text currently in the box.
btn = tk.Button(
    master=window,
    text="Cluster Documents",
    command=process_input,
)
btn.grid(column=0, row=3, pady=5, padx=5)

# Hand control to Tk's event loop.
window.mainloop()