# Import libraries

In [13]:
import numpy as np
import pandas as pd
import nltk
import re
import json
from pathlib import Path
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fetch the NLTK resources this notebook asks for: the punkt / punkt_tab
# tokenizer data and the stopwords corpus.
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\–º–∞—à–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\–º–∞—à–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\–º–∞—à–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading the files into a DataFrame

In [None]:
def load_all_jsons(root_dir: str, include_long_sc: bool = False):
    """Collect the records stored in every JSON file under ``<root_dir>/main``.

    Each file is expected to hold a JSON array of objects. A file whose top
    level is a single JSON object is treated as one record. Every record is
    copied into a new dict and tagged with ``_source_file``: the path of the
    file it came from, relative to ``root_dir``.

    Files that cannot be opened, decoded or parsed are reported on stdout and
    skipped. Array items that are not JSON objects, and files whose top level
    is neither an array nor an object, are skipped.

    Args:
        root_dir: Directory that contains the ``main`` sub-tree to scan.
        include_long_sc: Accepted for backward compatibility; it currently has
            no effect on which files are read.

    Returns:
        A list of dicts ordered by file path, then by position inside the file.
    """
    root = Path(root_dir)
    patterns = ["main/**/*.json"]
    records = []
    for pat in patterns:
        # glob() yields files in filesystem order; sorting keeps the row order
        # (and therefore any seeded train/test split) stable across machines.
        for fp in sorted(root.glob(pat)):
            try:
                with open(fp, "r", encoding="utf-8") as f:
                    data = json.load(f)
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            except (OSError, ValueError) as e:
                print(f"–û—à–∏–±–∫–∞ —á—Ç–µ–Ω–∏—è {fp}: {e}")
                continue
            if isinstance(data, dict):
                # A lone object: iterating it would yield its keys, not records.
                data = [data]
            elif not isinstance(data, list):
                continue
            source_file = str(fp.relative_to(root))
            for it in data:
                if not isinstance(it, dict):
                    continue
                rec = dict(it)
                rec["_source_file"] = source_file
                records.append(rec)
    return records

# Matches any run of one or more whitespace characters.
RE_WS = re.compile(r"\s+", flags=re.UNICODE)
def clean_text(s):
    """Return *s* trimmed, with every whitespace run collapsed to one space.

    Anything that is not a ``str`` yields the empty string.
    """
    return RE_WS.sub(" ", s.strip()) if isinstance(s, str) else ""


In [46]:
# Read every record found under data/main and report how many were loaded.
records = load_all_jsons('data')
print("–ó–∞–≥—Ä—É–∂–µ–Ω–æ –∑–∞–ø–∏—Å–µ–π:", len(records))

df = pd.DataFrame(records)
# Make sure the expected columns exist even when no record carried them.
for col in ("id", "text", "dataset", "source", "model", "paraphrasing_type"):
    if col in df.columns:
        continue
    df[col] = None

# labels: human = 0, ai and ai+rew = 1
# (any source value other than "human", including a missing one, becomes 1)
df["label"] = df["source"].map(lambda src: int(src != "human"))

–ó–∞–≥—Ä—É–∂–µ–Ω–æ –∑–∞–ø–∏—Å–µ–π: 4317


In [47]:
# Model input (raw text column) and target (0 = human, 1 = AI).
X, y = df['text'], df['label']

# Stylometric feature extraction function

In [48]:
def extract_stylo_features(text, language="english"):
    """Compute four stylometric features for one text.

    Features:
        avg_sent_len: mean number of tokens per sentence.
        sent_len_var: population variance (``np.var``) of tokens per sentence.
        ttr: type/token ratio, distinct tokens divided by all tokens
            (case-sensitive; tokens are compared as produced by the tokenizer).
        markdown_bold: number of non-overlapping ``**`` occurrences.

    Args:
        text: The text to analyse. ``None``, NaN, any other non-string value
            and the empty string all produce an all-zero feature vector.
        language: Punkt language passed to ``sent_tokenize`` /
            ``word_tokenize``. Defaults to NLTK's own default, ``"english"``.

    Returns:
        A ``pd.Series`` indexed by the four feature names above.
    """
    if not isinstance(text, str) or not text:
        # Missing text (e.g. a None-filled column) carries no signal; the
        # tokenizers would raise on non-string input.
        return pd.Series({
            "avg_sent_len": 0.0,
            "sent_len_var": 0.0,
            "ttr": 0.0,
            "markdown_bold": 0
        })

    sentences = sent_tokenize(text, language=language)
    words = word_tokenize(text, language=language)

    # Tokenize each sentence once and reuse the lengths for both statistics.
    sent_lengths = [len(word_tokenize(s, language=language)) for s in sentences]

    avg_sent_len = np.mean(sent_lengths) if sent_lengths else 0
    sent_len_var = np.var(sent_lengths) if sent_lengths else 0
    ttr = len(set(words)) / len(words) if words else 0
    markdown_bold = text.count("**")

    return pd.Series({
        "avg_sent_len": avg_sent_len,
        "sent_len_var": sent_len_var,
        "ttr": ttr,
        "markdown_bold": markdown_bold
    })

# One row of stylometric features per text, then placed next to the raw text
# column so a single frame feeds the ColumnTransformer.
df_stylo = X.apply(extract_stylo_features)
df_features = pd.concat(objs=[df[['text']], df_stylo], axis="columns")


# Model construction


In [51]:
# TF-IDF over unigrams and bigrams, capped at 30k features, with
# sublinear (1 + log tf) term-frequency scaling.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=30000,
    sublinear_tf=True,
)

# The "text" column goes through TF-IDF; the four stylometric columns are
# standardized. Outputs are concatenated in the order listed here.
ct = ColumnTransformer(
    transformers=[
        ("tfidf", tfidf, "text"),
        (
            "stylo",
            StandardScaler(),
            ["avg_sent_len", "sent_len_var", "ttr", "markdown_bold"],
        ),
    ]
)

# Feature extraction followed by a class-weighted logistic regression.
model = Pipeline(
    steps=[
        ('features', ct),
        ('clf', LogisticRegression(class_weight="balanced", max_iter=400)),
    ]
)


# Training and quality evaluation

In [52]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified by label; consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    y,
    test_size=0.2,
    random_state=42,
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Headline metrics first, then the per-class breakdown.
for metric_title, metric_fn in (("Accuracy:", accuracy_score), ("F1-score:", f1_score)):
    print(metric_title, metric_fn(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8206018518518519
F1-score: 0.8541862652869238
              precision    recall  f1-score   support

           0       0.65      0.93      0.77       273
           1       0.96      0.77      0.85       591

    accuracy                           0.82       864
   macro avg       0.81      0.85      0.81       864
weighted avg       0.86      0.82      0.83       864



In [53]:
# TF-IDF columns come first in the ColumnTransformer output, so the leading
# coefficients of the classifier line up with the vocabulary.
feature_names = model.named_steps['features'].named_transformers_['tfidf'].get_feature_names_out()
coefs = model.named_steps['clf'].coef_[0][:len(feature_names)]

# Sort once: the tail holds the strongest AI markers (largest weights),
# the head the strongest human markers (most negative weights).
ranking = np.argsort(coefs)
top_ai = ranking[-20:][::-1]
top_human = ranking[:20]

for heading, indices in (
    ("\nüîπ –ú–∞—Ä–∫–µ—Ä—ã AI:", top_ai),
    ("\nüîπ –ú–∞—Ä–∫–µ—Ä—ã Human:", top_human),
):
    print(heading)
    for i in indices:
        print(feature_names[i], round(coefs[i], 4))



üîπ –ú–∞—Ä–∫–µ—Ä—ã AI:
–æ–¥–Ω–∞–∫–æ 1.7649
–≤–∫–ª—é—á–∞—è 1.6421
–µ—ë 1.5101
–∏–Ω—Ñ–æ—Ä–º–∏—Ä—É–µ—Ç 1.4952
—ç—Ç–æ—Ç 1.4837
–¥–ª—è 1.4335
–Ω–µ —Ç–æ–ª—å–∫–æ 1.2585
—ç—Ç–æ–º –∏–Ω—Ñ–æ—Ä–º–∏—Ä—É–µ—Ç 1.2147
–∫–∞–∫ 1.1527
–æ—Å–æ–±–µ–Ω–Ω–æ 1.1425
–Ω–µ—Å–º–æ—Ç—Ä—è –Ω–∞ 1.1242
–Ω–µ—Å–º–æ—Ç—Ä—è 1.1023
–≤–∞–∂–Ω–æ 1.0976
–¥–µ–º–æ–Ω—Å—Ç—Ä–∏—Ä—É–µ—Ç 1.0758
–ø–æ–¥—á–µ—Ä–∫–∏–≤–∞–µ—Ç 1.0614
–≤–æ–ø—Ä–æ—Å 1.0315
—ç—Ç–æ 1.0279
—ç—Ç–∏ 1.007
—Å—Å—ã–ª–∞—è—Å—å 0.9596
—Å—Å—ã–ª–∞—è—Å—å –Ω–∞ 0.9596

üîπ –ú–∞—Ä–∫–µ—Ä—ã Human:
—Å–æ–æ–±—â–∞–µ—Ç -2.1748
—ç—Ç–æ–º —Å–æ–æ–±—â–∞–µ—Ç -1.6725
–æ–± -1.6064
–æ—á–µ–Ω—å -1.4899
—Å–æ —Å—Å—ã–ª–∫–æ–π -1.4605
—Å—Å—ã–ª–∫–æ–π -1.4431
—Å—Å—ã–ª–∫–æ–π –Ω–∞ -1.4153
–æ–± —ç—Ç–æ–º -1.4138
—á–∏—Å–ª–µ -1.3866
–ø–∏—à–µ—Ç -1.3734
—Ç–æ–º —á–∏—Å–ª–µ -1.3657
–ø–æ—Ç–æ–º—É -1.3353
–ø—Ä–æ–±–ª–µ–º—É -1.2053
—Å–∫–∞–∑–∞–ª -1.1995
–ø–æ—Ç–æ–º—É —á—Ç–æ -1.1925
—Å–æ -1.1635
–ø–æ -1.1587
–≥–≥ -1.1545
—Å–æ–æ–±—â–∞–µ—Ç—Å—è -1.1541
–Ω–∞ —Å–∞–π—Ç–µ -1.1127


In [54]:
stylo_feats = ["avg_sent_len", "sent_len_var", "ttr", "markdown_bold"]
# The stylometric columns are the last block of the feature matrix, so their
# weights are the trailing coefficients, one per name in stylo_feats.
stylo_importance = model.named_steps['clf'].coef_[0][-len(stylo_feats):]

print("–°—Ç–∏–ª–æ–º–µ—Ç—Ä–∏—è (–≤–ª–∏—è–Ω–∏–µ –Ω–∞ –∫–ª–∞—Å—Å AI):")
for f, w in zip(stylo_feats, stylo_importance):
    print(f"{f}: {round(w, 4)}")


–°—Ç–∏–ª–æ–º–µ—Ç—Ä–∏—è (–≤–ª–∏—è–Ω–∏–µ –Ω–∞ –∫–ª–∞—Å—Å AI):
avg_sent_len: 0.4925
sent_len_var: -0.8599
ttr: 0.2969
markdown_bold: 6.9352


# Function for checking the model

In [56]:
# Human-readable names for the numeric class labels used in training.
label_map = {1: "AI-generated", 0: "Human-written"}

def classify_text(text: str):
    """Classify one text with the fitted ``model`` pipeline.

    Args:
        text: The text to classify.

    Returns:
        A dict with ``"prediction"`` (a ``label_map`` value) and
        ``"confidence"`` (the predicted class's probability, as a float).

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be a str, got {type(text).__name__}")

    # Same column layout as the training frame: raw text plus stylometry.
    sample = pd.DataFrame([{
        "text": text,
        **extract_stylo_features(text)
    }])
    # A single pass through the pipeline: the predicted class is the most
    # probable one, looked up through classes_ rather than a fixed column.
    probabilities = model.predict_proba(sample)[0]
    best = int(np.argmax(probabilities))
    pred = int(model.classes_[best])
    return {"prediction": label_map[pred], "confidence": float(probabilities[best])}

# Example
example = '''

**–ú–æ—Å–∫–≤–∞, 15 –º–∞—Ä—Ç–∞ 2024 –≥.** ‚Äî –ì—Ä—É–ø–ø–∞ –∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª–µ–π –∏–∑ –ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–≥–æ –∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å—Å–∫–æ–≥–æ —Ü–µ–Ω—Ç—Ä–∞ ¬´–ö—É—Ä—á–∞—Ç–æ–≤—Å–∫–∏–π –∏–Ω—Å—Ç–∏—Ç—É—Ç¬ª –∏ –ú–ò–§–ò –ø—Ä–µ–¥—Å—Ç–∞–≤–∏–ª–∞ –ø—Ä–æ—Ç–æ—Ç–∏–ø –∫–≤–∞–Ω—Ç–æ–≤–æ–≥–æ –∫–æ–º–ø—å—é—Ç–µ—Ä–∞, –∫–ª—é—á–µ–≤—ã–µ –∫–æ–º–ø–æ–Ω–µ–Ω—Ç—ã –∫–æ—Ç–æ—Ä–æ–≥–æ ‚Äî –∫—É–±–∏—Ç—ã –∏ —É–ø—Ä–∞–≤–ª—è—é—â–∞—è —ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∞ ‚Äî –ø–æ–ª–Ω–æ—Å—Ç—å—é —Ä–∞–∑—Ä–∞–±–æ—Ç–∞–Ω—ã –∏ –ø—Ä–æ–∏–∑–≤–µ–¥–µ–Ω—ã –≤ –†–æ—Å—Å–∏–∏.

–£—Å—Ç—Ä–æ–π—Å—Ç–≤–æ, –ø–æ–ª—É—á–∏–≤—à–µ–µ –Ω–∞–∑–≤–∞–Ω–∏–µ ¬´–ö–≤–∞–Ω—Ç-1¬ª, –ø–æ–∫–∞ –æ–±–ª–∞–¥–∞–µ—Ç —Å–∫—Ä–æ–º–Ω–æ–π –º–æ—â–Ω–æ—Å—Ç—å—é –≤ 8 –∫—É–±–∏—Ç–æ–≤, –Ω–æ –µ–≥–æ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ –ø–æ–∑–≤–æ–ª—è–µ—Ç –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞—Ç—å —Å–∏—Å—Ç–µ–º—É –¥–æ —Å–æ—Ç–µ–Ω –∫—É–±–∏—Ç–æ–≤ –≤ –±–ª–∏–∂–∞–π—à–∏–µ —Ç—Ä–∏ –≥–æ–¥–∞. –ì–ª–∞–≤–Ω—ã–º –ø—Ä–æ—Ä—ã–≤–æ–º —É—á–µ–Ω—ã–µ –Ω–∞–∑—ã–≤–∞—é—Ç —Å–æ–∑–¥–∞–Ω–∏–µ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ–π —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–∏ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–∞ —Å–≤–µ—Ä—Ö–ø—Ä–æ–≤–æ–¥—è—â–∏—Ö –∫—É–±–∏—Ç–æ–≤ –Ω–∞ –æ—Å–Ω–æ–≤–µ –∞–ª—é–º–∏–Ω–∏–µ–≤—ã—Ö —Å—Ç—Ä—É–∫—Ç—É—Ä, –∫–æ—Ç–æ—Ä—ã–µ —Å–ø–æ—Å–æ–±–Ω—ã —Å–æ—Ö—Ä–∞–Ω—è—Ç—å –∫–≤–∞–Ω—Ç–æ–≤–æ–µ —Å–æ—Å—Ç–æ—è–Ω–∏–µ —Ä–µ–∫–æ—Ä–¥–Ω–æ–µ –¥–ª—è –æ—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω—ã—Ö —Ä–∞–∑—Ä–∞–±–æ—Ç–æ–∫ –≤—Ä–µ–º—è ‚Äî –±–æ–ª–µ–µ 100 –º–∏–∫—Ä–æ—Å–µ–∫—É–Ω–¥.

¬´–≠—Ç–æ –Ω–µ –ø—Ä–æ—Å—Ç–æ –ª–∞–±–æ—Ä–∞—Ç–æ—Ä–Ω—ã–π –º–∞–∫–µ—Ç. –ú—ã –ø–æ—Å—Ç—Ä–æ–∏–ª–∏ –ø–æ–ª–Ω—ã–π —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–∏–π —Ü–∏–∫–ª: –æ—Ç –º–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏—è –∏ –ø—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–∞ —á–∏–ø–æ–≤ –¥–æ —Å–∏—Å—Ç–µ–º—ã —É–ø—Ä–∞–≤–ª–µ–Ω–∏—è –∏ –ø—Ä–æ–≥—Ä–∞–º–º–Ω–æ–≥–æ —Å—Ç–µ–∫–∞. –≠—Ç–æ —Ñ—É–Ω–¥–∞–º–µ–Ω—Ç –¥–ª—è –±—É–¥—É—â–µ–≥–æ –ø–æ–ª–Ω–æ—Ü–µ–Ω–Ω–æ–≥–æ –∫–≤–∞–Ω—Ç–æ–≤–æ–≥–æ –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–∞¬ª, ‚Äî –∑–∞—è–≤–∏–ª —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—å –ø—Ä–æ–µ–∫—Ç–∞, –∞–∫–∞–¥–µ–º–∏–∫ –ò–≤–∞–Ω –ü–µ—Ç—Ä–æ–≤.

–†–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –≤–µ–ª–∞—Å—å –≤ —Ä–∞–º–∫–∞—Ö –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–≥–æ –ø—Ä–æ–µ–∫—Ç–∞ ¬´–ù–∞—É–∫–∞ –∏ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç—ã¬ª. –ü–µ—Ä–≤—ã–º–∏ –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏–º–∏ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏—è–º–∏ ¬´–ö–≤–∞–Ω—Ç–∞-1¬ª —Å—Ç–∞–Ω—É—Ç –∑–∞–¥–∞—á–∏ –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–∏ –ª–æ–≥–∏—Å—Ç–∏–∫–∏ –∏ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –Ω–æ–≤—ã—Ö –º–∞—Ç–µ—Ä–∏–∞–ª–æ–≤. –î–æ –∫–æ–Ω—Ü–∞ –≥–æ–¥–∞ –∫ —Å–∏—Å—Ç–µ–º–µ –ø–æ–ª—É—á–∞—Ç –¥–æ—Å—Ç—É–ø –∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª–∏ –∏–∑ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–æ–≤ –∏ –∫–æ–º–ø–∞–Ω–∏–π-–ø–∞—Ä—Ç–Ω–µ—Ä–æ–≤.

–≠–∫—Å–ø–µ—Ä—Ç—ã –æ—Ç–º–µ—á–∞—é—Ç, —á—Ç–æ, –Ω–µ—Å–º–æ—Ç—Ä—è –Ω–∞ –æ—Ç—Å—Ç–∞–≤–∞–Ω–∏–µ –æ—Ç –ª–∏–¥–µ—Ä–æ–≤ –≤ –æ—Ç—Ä–∞—Å–ª–∏ (–∫–æ–º–ø–∞–Ω–∏–π Google –∏ IBM), —Å–æ–∑–¥–∞–Ω–∏–µ –ø–æ–ª–Ω–æ—Å—Ç—å—é –æ—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω–æ–π –ø–ª–∞—Ç—Ñ–æ—Ä–º—ã —è–≤–ª—è–µ—Ç—Å—è –∫—Ä–∏—Ç–∏—á–µ—Å–∫–∏ –≤–∞–∂–Ω—ã–º —à–∞–≥–æ–º –¥–ª—è —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—á–µ—Å–∫–æ–≥–æ —Å—É–≤–µ—Ä–µ–Ω–∏—Ç–µ—Ç–∞ –∏ –±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç–∏ —Å—Ç—Ä–∞–Ω—ã –≤ –¥–æ–ª–≥–æ—Å—Ä–æ—á–Ω–æ–π –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤–µ.'''
# Run the classifier on the sample text defined above and print the
# resulting dict with its "prediction" and "confidence" entries.
print(classify_text(example))


{'prediction': 'AI-generated', 'confidence': 0.8843162495168472}


In [57]:
import joblib
# Write the fitted pipeline to "model.pkl" (path is relative to the current
# working directory).
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, "model.pkl")

['model.pkl']