In [1]:
!pip install gradio
import pandas as pd
import numpy as np



In [2]:
import matplotlib.pyplot as plt
import seaborn as sn

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from sklearn.cluster import KMeans, AgglomerativeClustering

In [6]:
from sklearn.decomposition import TruncatedSVD

In [8]:
# Load the training split of 20 Newsgroups with headers, footers and quoted
# replies removed from each post.
newsgroups = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))

In [9]:
# Keep only the first 1000 documents as the working corpus.
text = newsgroups.data[:1000]

In [10]:
# TF-IDF features: English stop words removed, vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)

In [13]:
# Fit the vocabulary on the corpus and build the document-term TF-IDF matrix.
X_tfidf = vectorizer.fit_transform(text)

In [14]:
# Project the TF-IDF matrix down to 100 components (fixed seed for repeatability).
svd = TruncatedSVD(n_components=100, random_state=42)

In [15]:
# Reduced representation of the corpus; the clustering functions fit on this.
X_reduced = svd.fit_transform(X_tfidf)

In [16]:
def get_top_words_per_cluster(X_tfidf, labels, vectorizer, n_words=10):
    """Return the highest-weighted terms of every cluster.

    For each distinct label, the TF-IDF rows belonging to that cluster are
    summed column-wise and the ``n_words`` columns with the largest totals
    are reported, strongest first.

    Parameters
    ----------
    X_tfidf : sparse matrix, sparse array or ndarray, shape (n_docs, n_terms)
        Document-term weights, one row per document.
    labels : array-like of length n_docs
        Cluster label of each document.
    vectorizer : object
        Anything exposing ``get_feature_names_out()`` whose result is indexed
        by column position of ``X_tfidf``.
    n_words : int, default 10
        Number of terms to report per cluster; values <= 0 give empty lists.

    Returns
    -------
    dict
        Maps each label present in ``labels`` to a list of term strings.
    """
    feature_names = vectorizer.get_feature_names_out()
    labels = np.asarray(labels)  # lets callers pass a plain list of labels
    n_words = max(int(n_words), 0)
    top_words = {}

    # np.unique only yields labels that occur, so every cluster is non-empty.
    for cluster in np.unique(labels):
        cluster_indices = np.where(labels == cluster)[0]
        # np.asarray(...).ravel() flattens the column sums whether the input
        # is a scipy sparse matrix (np.matrix result), a sparse array or a
        # dense ndarray; ``.A1`` only exists on np.matrix.
        cluster_tfidf = np.asarray(X_tfidf[cluster_indices].sum(axis=0)).ravel()
        # Reverse first, then truncate: same order as [-n:][::-1] for n > 0,
        # but n == 0 correctly yields nothing instead of everything.
        top_idx = cluster_tfidf.argsort()[::-1][:n_words]
        top_words[cluster] = [str(feature_names[i]) for i in top_idx]

    return top_words

In [17]:
def cluster_text(method, n_clusters, user_text):
    """Cluster the corpus and assign ``user_text`` to one of the clusters.

    Parameters
    ----------
    method : str
        ``"KMeans"`` selects KMeans; any other value selects
        AgglomerativeClustering.
    n_clusters : int or float
        Number of clusters; coerced to ``int`` because UI sliders may deliver
        a float.
    user_text : str
        Text to place into a cluster.

    Returns
    -------
    tuple of (str, pandas.DataFrame)
        A summary string with the predicted cluster and its top words, and
        the first 10 rows of the corpus with their cluster labels. For blank
        input a prompt message and an empty frame are returned instead.
    """
    # A blank text vectorises to all zeros, so any "prediction" would be noise.
    if not (user_text or "").strip():
        return "Please enter some text to cluster.", pd.DataFrame(columns=["Text", "Cluster"])

    n_clusters = int(n_clusters)

    if method == "KMeans":
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(X_reduced)

    # The corpus variable defined alongside the vectorizer is `text`.
    df = pd.DataFrame({"Text": text, "Cluster": labels})

    # Project the user input into the same reduced space as the corpus.
    user_vec = vectorizer.transform([user_text])
    user_reduced = svd.transform(user_vec)

    if method == "KMeans":
        user_cluster = model.predict(user_reduced)[0]
    else:
        # AgglomerativeClustering has no predict(); approximate by assigning
        # the text to the cluster whose centroid is closest in Euclidean
        # distance (an unnormalised dot product would favour centroids with
        # a large norm rather than nearby ones).
        cluster_ids = np.unique(labels)
        centers = np.vstack([X_reduced[labels == c].mean(axis=0) for c in cluster_ids])
        dists = np.linalg.norm(centers - user_reduced, axis=1)
        user_cluster = cluster_ids[dists.argmin()]

    # Compute top words
    top_words_dict = get_top_words_per_cluster(X_tfidf, labels, vectorizer, n_words=10)
    cluster_keywords = top_words_dict.get(user_cluster, [])

    return f"Predicted Cluster: {user_cluster}\n\nTop Words: {', '.join(cluster_keywords)}", df.head(10)

In [18]:
def plot_clusters(n_clusters):
    """Render a 2-D scatter plot of the KMeans clusters as an RGBA image.

    KMeans is fitted on the reduced corpus representation (``X_reduced``),
    while the plotted coordinates come from a separate 2-component
    TruncatedSVD of the TF-IDF matrix (``X_tfidf``).

    Parameters
    ----------
    n_clusters : int or float
        Number of clusters; coerced to ``int`` because UI sliders may deliver
        a float.

    Returns
    -------
    numpy.ndarray
        The rendered figure as an array with 4 channels (RGBA).
    """
    # Local imports keep this function self-contained: the plotting alias
    # `sns` and the Agg canvas classes are only needed here.
    import seaborn as sns
    from matplotlib.backends.backend_agg import FigureCanvasAgg
    from matplotlib.figure import Figure

    n_clusters = int(n_clusters)
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(X_reduced)

    # Reduce to 2D for visualization
    svd_2d = TruncatedSVD(n_components=2, random_state=42)
    reduced_2d = svd_2d.fit_transform(X_tfidf)

    # Build the figure without pyplot's global state: the Agg canvas is
    # attached explicitly, so rendering does not depend on the active backend
    # and nothing is left registered that would need plt.close().
    fig = Figure(figsize=(6, 4))
    canvas = FigureCanvasAgg(fig)
    ax = fig.add_subplot(111)

    sns.scatterplot(
        x=reduced_2d[:, 0],
        y=reduced_2d[:, 1],
        hue=labels,
        palette="tab10",
        s=30,
        ax=ax,
    )
    ax.set_title(f"KMeans Clusters (n={n_clusters})")
    ax.set_xlabel("Topic Dimension 1")
    ax.set_ylabel("Topic Dimension 2")

    fig.tight_layout()
    canvas.draw()
    # Copy the pixel buffer so the returned array outlives the canvas.
    return np.array(canvas.buffer_rgba())

In [21]:
import gradio as gr

In [22]:
# Gradio UI: clustering controls and text input on top, then the prediction
# result with a sample table, then a button that renders the KMeans plot.
with gr.Blocks() as demo:
    gr.Markdown(value="## 📰 News Article Clustering (20 Newsgroups)")

    # Algorithm choice and cluster count share one row.
    with gr.Row():
        method = gr.Dropdown(
            choices=["KMeans", "Hierarchical"],
            value="KMeans",
            label="Clustering Method",
        )
        n_clusters = gr.Slider(
            minimum=2,
            maximum=15,
            step=1,
            value=5,
            label="Number of Clusters",
        )

    user_text = gr.Textbox(
        value="NASA launched a new space mission today.",
        lines=5,
        label="Enter a News Article or Text",
    )
    run_btn = gr.Button(value="Find Cluster")

    result = gr.Textbox(label="Cluster Result")
    table_output = gr.Dataframe(label="Sample Clustered Data")

    # cluster_text returns (summary string, DataFrame) in this output order.
    run_btn.click(
        fn=cluster_text,
        inputs=[method, n_clusters, user_text],
        outputs=[result, table_output],
    )

    gr.Markdown(value="### 🔍 Visualize Clusters (KMeans only)")
    viz_btn = gr.Button(value="Show Cluster Plot")
    viz_output = gr.Image(type="numpy")
    viz_btn.click(
        fn=plot_clusters,
        inputs=[n_clusters],
        outputs=viz_output,
    )

In [23]:
# Start the Gradio app with default launch settings.
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://287c18379826996513.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


