In [9]:
import sys
from pathlib import Path

REPO_ROOT = Path("..").resolve()
RAW_JSON = REPO_ROOT / "raw_data" / "messages.json"

if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

print("Repo root:", REPO_ROOT)
print("Raw JSON:", RAW_JSON, "exists:", RAW_JSON.exists())

from topic_segmentor import ReplyChainTopicSegmentor

topic_size = 4
reply_seg = ReplyChainTopicSegmentor(topic_size=topic_size, non_overlapping=True)
gold_topics = reply_seg.get_topics(str(RAW_JSON))

print("Gold topics:", len(gold_topics))
print("First 3 sizes:", [len(t) for t in gold_topics[:3]])
assert all(len(t) == topic_size for t in gold_topics), "gold_topics must be fixed size"

Repo root: /home/pppptttt/Desktop/study/ml/Intro_in_ML_project
Raw JSON: /home/pppptttt/Desktop/study/ml/Intro_in_ML_project/raw_data/messages.json exists: True
Gold topics: 5290
First 3 sizes: [4, 4, 4]


In [10]:
from dataclasses import dataclass


@dataclass
class WindowExampleRaw:
    msgs: list
    y: int
    ts_end: int

def window_to_text(msgs) -> str:
    return "\n".join([f"{m.user}: {m.text}" for m in msgs])

def ctx_to_text(msgs) -> str:
    return "\n".join([f"{m.user}: {m.text}" for m in msgs[:-1]])

def resp_to_text(msgs) -> str:
    m = msgs[-1]
    return f"{m.user}: {m.text}"

In [11]:
topic_size = 4
assert all(len(t) == topic_size for t in gold_topics)

pos_raw = [WindowExampleRaw(msgs=t, y=1, ts_end=t[-1].timestamp) for t in gold_topics]
print("Pos:", len(pos_raw))

Pos: 5290


In [None]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
import numpy as np

rng = random.Random(42)

contexts = [ctx_to_text(t) for t in gold_topics]
responses = [resp_to_text(t) for t in gold_topics]

tfidf_miner = TfidfVectorizer(lowercase=True, max_features=50000, ngram_range=(1,2))
tfidf_miner.fit(contexts + responses)

C = tfidf_miner.transform(contexts)
R = tfidf_miner.transform(responses)

S = cosine_similarity(C, R)

np.fill_diagonal(S, -1.0)

def mined_negative_window(i: int, topk: int = 50) -> List:
    best_idx = np.argpartition(S[i], -topk)[-topk:]
    best_idx = best_idx[np.argsort(S[i][best_idx])[::-1]]

    for j in best_idx:
        if j != i:
            return gold_topics[i][:-1] + [gold_topics[j][-1]]
    j = rng.randrange(len(gold_topics))
    while j == i:
        j = rng.randrange(len(gold_topics))
    return gold_topics[i][:-1] + [gold_topics[j][-1]]

neg_per_pos = 2
neg_raw = []
for i in range(len(gold_topics)):
    for _ in range(neg_per_pos):
        w = mined_negative_window(i, topk=50)
        neg_raw.append(WindowExampleRaw(msgs=w, y=0, ts_end=w[-1].timestamp))

print("Neg (hard-mined):", len(neg_raw))
print("\nExample HARD NEG window:\n", window_to_text(neg_raw[0].msgs))

Neg (hard-mined): 10580

Example HARD NEG window:
 user24: Да, с первого августа открылся набор
user20: круто!) а реферал есть ? :D
user5: в мс передумал?
user5: +


In [13]:
dataset_raw = sorted(pos_raw + neg_raw, key=lambda e: e.ts_end)

split = int(len(dataset_raw) * 0.8)
train_raw = dataset_raw[:split]
test_raw  = dataset_raw[split:]

print("Train:", len(train_raw), "Test:", len(test_raw))
print("Train pos rate:", sum(e.y for e in train_raw)/len(train_raw))
print("Test  pos rate:", sum(e.y for e in test_raw)/len(test_raw))

Train: 12696 Test: 3174
Train pos rate: 0.33341209829867674
Test  pos rate: 0.33301827347195967


In [None]:
train_text_for_tfidf = [window_to_text(e.msgs) for e in train_raw]
tfidf_feat = TfidfVectorizer(lowercase=True, max_features=50000, ngram_range=(1,2))
tfidf_feat.fit(train_text_for_tfidf)

def tfidf_cos(a: str, b: str) -> float:
    X = tfidf_feat.transform([a, b])
    return float(cosine_similarity(X[0], X[1])[0,0])

def featurize_window(ex: WindowExampleRaw) -> np.ndarray:
    msgs = ex.msgs
    ctx = msgs[:-1]
    resp = msgs[-1]

    ctx_text = "\n".join([f"{m.user}: {m.text}" for m in ctx])
    resp_text = f"{resp.user}: {resp.text}"

    sim_ctx_resp = tfidf_cos(ctx_text, resp_text)

    dt12 = max(0, msgs[1].timestamp - msgs[0].timestamp)
    dt23 = max(0, msgs[2].timestamp - msgs[1].timestamp)
    dt34 = max(0, msgs[3].timestamp - msgs[2].timestamp)

    max_dt_neighbors = max(dt12, dt23, dt34)
    log_dt34 = float(np.log1p(dt34))

    users = [m.user for m in msgs]
    num_unique_users = len(set(users))
    resp_user_seen = 1.0 if resp.user in [m.user for m in ctx] else 0.0

    question_in_last_context = 1.0 if "?" in ctx[-1].text else 0.0

    return np.array([
        sim_ctx_resp,
        float(np.log1p(max_dt_neighbors)),
        log_dt34,
        float(num_unique_users),
        resp_user_seen,
        question_in_last_context
    ], dtype=np.float32)

X_train = np.vstack([featurize_window(e) for e in train_raw])
y_train = np.array([e.y for e in train_raw], dtype=np.int64)

X_test  = np.vstack([featurize_window(e) for e in test_raw])
y_test  = np.array([e.y for e in test_raw], dtype=np.int64)

feature_names = [
    "tfidf_sim_ctx_resp",
    "log1p_max_dt_neighbors",
    "log1p_dt34",
    "num_unique_users",
    "resp_user_seen_in_context",
    "question_in_last_context",
]

X_train.shape, y_train.mean()

((12696, 6), np.float64(0.33341209829867674))

In [15]:
from sklearn.ensemble import HistGradientBoostingClassifier

gbdt = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.08,
    max_iter=400,
    random_state=0
)

gbdt.fit(X_train, y_train)

proba = gbdt.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
auc = roc_auc_score(y_test, proba)

print({"Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "ROC_AUC": auc})
print("\nReport:\n", classification_report(y_test, pred, zero_division=0))

{'Accuracy': 0.9678638941398866, 'Precision': 0.9192273924495171, 'Recall': 0.9905392620624409, 'F1': 0.953551912568306, 'ROC_AUC': 0.9937568961271752}

Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98      2117
           1       0.92      0.99      0.95      1057

    accuracy                           0.97      3174
   macro avg       0.96      0.97      0.96      3174
weighted avg       0.97      0.97      0.97      3174



довольно высокий результат, но на деле модель решает задачу проще поставленной "определи, являются ли 4 сообщения из одного топика"

In [19]:

import joblib

joblib.dump(tfidf_feat, REPO_ROOT / "models/tfidf_feat.joblib")
joblib.dump(gbdt, REPO_ROOT / "models/gbdt_topic_window.joblib")

['/home/pppptttt/Desktop/study/ml/Intro_in_ML_project/models/gbdt_topic_window.joblib']