In [None]:
!pip install scikit-learn numpy pandas torch



In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read
# files stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    silhouette_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
)
import numpy as np

# Load the message dump from Google Drive into a DataFrame.
df = pd.read_json("/content/drive/MyDrive/messages.json")
# Seed passed as random_state to KMeans in generate_sillhoutte_for_N.
random_seed = 42
# Shared registry of clustering metrics, keyed by model name;
# log_cluster_results writes into it.
cluster_results: dict[str, dict] = {}
# Sanity check of the loaded data: (rows, columns).
print(df.shape)

def generate_sillhoutte_for_N(N: int):
    """
    Cluster the rows of the module-level ``df`` by timestamp with KMeans.

    The number of clusters is ``df.shape[0] // N``, so ``N`` is roughly the
    average number of rows per cluster. The "timestamp" column is standardized,
    clustered, and the resulting metrics are recorded through
    ``log_cluster_results`` under the key ``"KMeans(N=<N>)"``.

    Args:
        N: Divisor applied to the row count to obtain the cluster count.
            Must be at least 1 and no larger than the number of rows.

    Returns:
        The cluster labels produced by ``KMeans.fit_predict``, one per row of ``df``.

    Raises:
        ValueError: If ``N`` is smaller than 1, or so large that the computed
            cluster count would be zero.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (N == 0)
    # or an invalid n_clusters being handed to KMeans (N < 0).
    if N < 1:
        raise ValueError(f"N must be a positive integer, got {N}")

    row_count = df.shape[0]
    clusters_num = row_count // N
    if clusters_num < 1:
        raise ValueError(
            f"N={N} is larger than the number of rows ({row_count}); "
            "at least one cluster is required"
        )

    # Only the timestamp column is used as a feature; every other column is dropped.
    preprocessor = ColumnTransformer(
        transformers=[
            ("scale_numbers", StandardScaler(), ["timestamp"])
        ],
        remainder="drop"
    )
    classification_model = KMeans(n_clusters=clusters_num, random_state=random_seed, n_init="auto")

    # reshape(-1, 1) guarantees a single-feature column layout for KMeans.
    scaled_timestamps = preprocessor.fit_transform(df, y=None).reshape(-1, 1)
    predicted = classification_model.fit_predict(scaled_timestamps)
    log_cluster_results(f"KMeans(N={N})", scaled_timestamps, predicted)

    return predicted


def log_cluster_results(
    model_name: str,
    X: np.ndarray,
    labels: np.ndarray,
    y_true: np.ndarray | None = None,
):
    """
    Считаем базовые метрики кластеризации и сохраняем их
    в словарь cluster_results под ключом model_name.
    """

    metrics = {}

    unique_labels = np.unique(labels)
    metrics["n_clusters"] = len(unique_labels)

    if metrics["n_clusters"] > 1 and len(X) > metrics["n_clusters"]:
        metrics["silhouette"] = silhouette_score(X, labels)
    else:
        metrics["silhouette"] = np.nan

    if y_true is not None:
        metrics["ari"] = adjusted_rand_score(y_true, labels)
        metrics["nmi"] = normalized_mutual_info_score(y_true, labels)
    else:
        metrics["ari"] = np.nan
        metrics["nmi"] = np.nan

    cluster_results[model_name] = metrics
    return metrics

def results_to_dataframe(results: dict) -> pd.DataFrame:
    """
    Convert a results dict into a DataFrame for easier comparison.

    Each key of ``results`` becomes a row and each metric a column. Rows are
    sorted by key, so string keys are ordered lexicographically
    (e.g. "KMeans(N=8)" comes after "KMeans(N=16)"). An empty or falsy input
    yields an empty DataFrame.
    """
    if results:
        return pd.DataFrame.from_dict(results, orient="index").sort_index()
    return pd.DataFrame()


(126733, 3)


In [19]:
# Fit KMeans for several values of N (the row count is integer-divided by N to
# get the number of clusters); each run records its metrics in cluster_results.
for N in [8, 10, 12, 15, 16]:
    generate_sillhoutte_for_N(N)

# Display the collected metrics as a table, one row per model name.
results_to_dataframe(cluster_results)

Unnamed: 0,n_clusters,silhouette,ari,nmi
KMeans(N=10),12673,0.788752,,
KMeans(N=12),10561,0.796352,,
KMeans(N=15),8448,0.797094,,
KMeans(N=16),7920,0.792732,,
KMeans(N=8),15841,0.777139,,


In [20]:
# Refit with N = 15 (the highest silhouette among the values tried in the
# recorded run) and keep the per-row cluster labels for building the dataset.
N = 15
predicted = generate_sillhoutte_for_N(N)

In [23]:
import json

# Build a (context_1, context_2, context_3, response) table: for every message
# the response is its own text and the context columns hold the three most
# recent earlier texts from the same cluster (None where there are fewer).
data = {
    "context_1": [],
    "context_2": [],
    "context_3": [],
    "response": [],
}

# Rolling window of the last three texts seen for each cluster label.
topic = {}

# Explicit UTF-8 so decoding does not depend on the platform's default locale;
# the handle is closed as soon as the JSON has been parsed.
with open("/content/drive/MyDrive/messages.json", "r", encoding="utf-8") as f:
    current_json = json.load(f)

# The labels are matched to messages by position, so both must have the same length.
if len(current_json) != len(predicted):
    raise ValueError(
        f"Got {len(predicted)} cluster labels for {len(current_json)} messages; "
        "re-run the clustering cell on the same file"
    )

# Iterating with zip avoids a loop variable named `id`, which would shadow the
# builtin for the rest of the notebook session.
for message, topic_id in zip(current_json, predicted):
    text = message["text"]
    history = topic.setdefault(topic_id, [None, None, None])

    data["context_1"].append(history[0])
    data["context_2"].append(history[1])
    data["context_3"].append(history[2])
    data["response"].append(text)

    # Slide the window: drop the oldest text, append the current one.
    topic[topic_id] = history[1:] + [text]

In [24]:
# Write the context/response table to CSV; index=False omits the row-index column.
pd.DataFrame(data).to_csv("/content/dataframe.csv", index=False)