# Import libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fetch the NLTK data packages used by this notebook.
# NOTE(review): 'punkt' / 'punkt_tab' are presumably the models behind
# sent_tokenize / word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# NOTE(review): 'stopwords' is not referenced by any code visible in this file.
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\–º–∞—à–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\–º–∞—à–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\–º–∞—à–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the files into a DataFrame

In [2]:
# Read each corpus, expose its text under a shared "text" column and attach
# the class label: 0 for original news, 1 for generated news.
df_orig = pd.read_json("original_news.json").rename(columns={"original_news": "text"})
df_orig["label"] = 0

df_gen = pd.read_json("generated_news.json").rename(columns={"generated_news": "text"})
df_gen["label"] = 1

# Stack both sources, keep only the modelling columns, shuffle the rows with a
# fixed seed so the order is reproducible, and renumber the index from 0.
df = (
    pd.concat([df_orig, df_gen], ignore_index=True)[["text", "label"]]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,text,label
0,"–í –ì—Ä—É–∑–∏–∏ –±—ã–ª –∑–∞–¥–µ—Ä–∂–∞–Ω –≥—Ä–∞–∂–¥–∞–Ω–∏–Ω –†–æ—Å—Å–∏–∏, –ø–æ–¥–æ–∑—Ä...",1
1,–ó–∞–∫–æ–Ω –æ–± –∏—Å–∫–ª—é—á–∏—Ç–µ–ª—å–Ω–æ—Å—Ç–∏ —É–∫—Ä–∞–∏–Ω—Å–∫–æ–≥–æ —è–∑—ã–∫–∞ –º–æ...,0
2,–†–æ—Å—Å–∏–π—Å–∫–æ–π –≤–∞–ª—é—Ç–µ –Ω–µ —Å—Ç–æ–∏—Ç –æ–∂–∏–¥–∞—Ç—å —É–∫—Ä–µ–ø–ª–µ–Ω–∏—è ...,0
3,4 —Å–µ–Ω—Ç—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ —Å–æ—Å—Ç–æ—è–ª–æ—Å—å —Ç–æ—Ä–∂–µ—Å—Ç–≤–µ–Ω–Ω–æ–µ –æ...,1
4,–í 2022 –≥–æ–¥—É –≤ –†–æ—Å—Å–∏–∏ –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–Ω—ã –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω—ã...,1


In [3]:
# Model input is the raw text column; the target is the 0/1 source label.
X, y = df["text"], df["label"]

# Function for extracting stylometric features

In [4]:
def extract_stylo_features(text, language="english"):
    """Compute four stylometric features for a single document.

    Parameters
    ----------
    text : str
        Raw document text.
    language : str, default "english"
        Language name forwarded to NLTK's ``sent_tokenize`` and
        ``word_tokenize``. The default reproduces the behaviour of calling the
        tokenizers without a language argument. Whatever value is chosen must
        be the same at training and at inference time, otherwise the feature
        distributions will not match.

    Returns
    -------
    pandas.Series
        ``avg_sent_len``  – mean number of tokens per sentence,
        ``sent_len_var``  – variance of tokens per sentence (``np.var``),
        ``ttr``           – distinct tokens / total tokens (case-sensitive),
        ``markdown_bold`` – number of ``**`` occurrences in the text.
        Every value is 0 when the text yields no sentences / tokens.
    """
    # Fast path: empty or whitespace-only text has nothing to tokenize; return
    # the all-zero row directly instead of going through the tokenizers.
    if isinstance(text, str) and not text.strip():
        return pd.Series({
            "avg_sent_len": 0,
            "sent_len_var": 0,
            "ttr": 0,
            "markdown_bold": 0,
        })

    sentences = sent_tokenize(text, language=language)
    words = word_tokenize(text, language=language)

    # Tokenize each sentence exactly once; the mean and the variance are both
    # derived from this single list of lengths.
    sent_lengths = [len(word_tokenize(s, language=language)) for s in sentences]

    avg_sent_len = np.mean(sent_lengths) if sent_lengths else 0
    sent_len_var = np.var(sent_lengths) if sent_lengths else 0
    ttr = len(set(words)) / len(words) if words else 0
    markdown_bold = text.count("**")

    return pd.Series({
        "avg_sent_len": avg_sent_len,
        "sent_len_var": sent_len_var,
        "ttr": ttr,
        "markdown_bold": markdown_bold,
    })

# Run the extractor over every document (one feature row per text), then put
# the raw text and the numeric features side by side in a single frame.
df_stylo = X.apply(extract_stylo_features)
df_features = pd.concat((df.loc[:, ["text"]], df_stylo), axis="columns")


# Build the model


In [5]:
# TF-IDF over unigrams and bigrams of the raw text, capped at 30,000 features,
# with sublinear term-frequency scaling enabled.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=30000, sublinear_tf=True)

# "tfidf" is listed first, so its columns come first in the combined feature
# matrix and the four standardised stylometric columns come last.
ct = ColumnTransformer(
    transformers=[
        ("tfidf", tfidf, "text"),
        (
            "stylo",
            StandardScaler(),
            ["avg_sent_len", "sent_len_var", "ttr", "markdown_bold"],
        ),
    ]
)

# Feature extraction followed by a class-weighted logistic regression.
model = Pipeline(
    steps=[
        ("features", ct),
        ("clf", LogisticRegression(class_weight="balanced", max_iter=400)),
    ]
)


# Training and quality evaluation

In [6]:
# Hold out 20 % of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit on the training part and predict labels for the held-out part.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Headline metrics first, then the per-class breakdown.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("F1-score:", f1)
print(classification_report(y_test, y_pred))


Accuracy: 0.9270833333333334
F1-score: 0.9195402298850575
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       106
           1       0.91      0.93      0.92        86

    accuracy                           0.93       192
   macro avg       0.93      0.93      0.93       192
weighted avg       0.93      0.93      0.93       192



In [7]:
# Vocabulary learned by the TF-IDF step. Its columns come first in the
# ColumnTransformer output, so the leading coefficients belong to these terms.
feature_names = (
    model.named_steps["features"]
    .named_transformers_["tfidf"]
    .get_feature_names_out()
)
coefs = model.named_steps["clf"].coef_[0][: len(feature_names)]

# Sort once (ascending): the tail holds the strongest positive weights
# (class 1 = AI), the head holds the strongest negative ones (class 0 = human).
coef_order = np.argsort(coefs)
top_ai = coef_order[::-1][:20]
top_human = coef_order[:20]

print("\nüîπ –ú–∞—Ä–∫–µ—Ä—ã AI:")
for i in top_ai:
    print(feature_names[i], round(coefs[i], 4))

print("\nüîπ –ú–∞—Ä–∫–µ—Ä—ã Human:")
for i in top_human:
    print(feature_names[i], round(coefs[i], 4))



üîπ –ú–∞—Ä–∫–µ—Ä—ã AI:
–∏–Ω—Ü–∏–¥–µ–Ω—Ç 0.6905
–æ–¥–Ω–∞–∫–æ 0.6749
—ç—Ç–æ—Ç 0.6451
—ç—Ç–æ 0.6394
–¥–ª—è 0.6372
–Ω–µ —Ç–æ–ª—å–∫–æ 0.5968
–º—ã 0.5765
–±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç–∏ 0.5508
–Ω–µ—Å–º–æ—Ç—Ä—è 0.538
–Ω–µ—Å–º–æ—Ç—Ä—è –Ω–∞ 0.538
–≤–∫–ª—é—á–∞—è 0.5303
–æ—Å—Ç–∞–µ—Ç—Å—è 0.5263
—ç–∫—Å–ø–µ—Ä—Ç—ã 0.5118
—Ç–æ–ª—å–∫–æ 0.5086
–ø–æ–¥–æ–±–Ω—ã–µ 0.5075
–æ—Å–æ–±–µ–Ω–Ω–æ 0.4755
–≤–Ω–∏–º–∞–Ω–∏–µ 0.4708
–ø—Ä–æ–∏–∑–æ—à–µ–¥—à–µ–≥–æ 0.4668
–¥–∞–Ω–Ω—ã–π 0.4623
–ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç 0.4407

üîπ –ú–∞—Ä–∫–µ—Ä—ã Human:
—Å–æ–æ–±—â–∞–µ—Ç -0.7498
–æ–± -0.6957
–æ–± —ç—Ç–æ–º -0.6913
—ç—Ç–æ–º -0.661
–ø—Ä–æ—Ü–µ–Ω—Ç–æ–≤ -0.6268
–ø—Ä–æ—Ü–µ–Ω—Ç–∞ -0.5356
–≥–æ–¥–∞ -0.4561
—Å–æ —Å—Å—ã–ª–∫–æ–π -0.4047
—Å—Å—ã–ª–∫–æ–π –Ω–∞ -0.4041
—Å—Å—ã–ª–∫–æ–π -0.4035
—Ä–∏–∞ –Ω–æ–≤–æ—Å—Ç–∏ -0.4011
—Ä–∏–∞ -0.4011
–ø—Ä–∏ —ç—Ç–æ–º -0.3787
–Ω–æ–≤–æ—Å—Ç–∏ -0.3766
the -0.3752
–ø–∏—à–µ—Ç -0.37
—ç—Ç–æ–º —Å–æ–æ–±—â–∞–µ—Ç -0.37
—Å–æ–æ–±—â–∞–ª–æ—Å—å -0.3636
—Å–∫–∞–∑–∞–ª -0.3621
–ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–∞ -0.3596


In [8]:
# The stylometric columns are the last block of the feature matrix, so their
# weights are the trailing coefficients — one per name, in the same order as
# the column list given to the "stylo" transformer.
stylo_feats = ["avg_sent_len", "sent_len_var", "ttr", "markdown_bold"]
stylo_importance = model.named_steps["clf"].coef_[0][-len(stylo_feats):]

print("–°—Ç–∏–ª–æ–º–µ—Ç—Ä–∏—è (–≤–ª–∏—è–Ω–∏–µ –Ω–∞ –∫–ª–∞—Å—Å AI):")
for feat, weight in zip(stylo_feats, stylo_importance):
    print(f"{feat}: {round(weight, 4)}")


–°—Ç–∏–ª–æ–º–µ—Ç—Ä–∏—è (–≤–ª–∏—è–Ω–∏–µ –Ω–∞ –∫–ª–∞—Å—Å AI):
avg_sent_len: 1.0644
sent_len_var: -1.7677
ttr: -0.8524
markdown_bold: 2.5615


# Function for checking the model

In [9]:
# Display names for the integer class labels used during training.
label_map = {1: "AI-generated", 0: "Human-written"}


def classify_text(text: str):
    """Classify a single text as AI-generated or human-written.

    A one-row frame is built with the raw text plus the stylometric features
    from ``extract_stylo_features`` (the same columns the pipeline was fitted
    on) and scored with the fitted module-level ``model``.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    dict
        ``"prediction"`` – the ``label_map`` name of the most probable class,
        ``"confidence"`` – that class's predicted probability as a float.
    """
    sample = pd.DataFrame([{
        "text": text,
        **extract_stylo_features(text),
    }])

    # Score once: the pipeline (including the TF-IDF transform) runs a single
    # time, and the label is read from model.classes_ rather than assuming the
    # positive class sits at a fixed column of the probability array.
    class_probs = model.predict_proba(sample)[0]
    best = int(np.argmax(class_probs))
    pred = int(model.classes_[best])

    return {
        "prediction": label_map[pred],
        "confidence": float(class_probs[best]),
    }

# Example
example = '''–ú–æ—Å–∫–≤–∞, 30 –Ω–æ—è–±—Ä—è 2025 –≥–æ–¥–∞. –†–æ—Å–∞—Ç–æ–º –æ–±—ä—è–≤–∏–ª –æ –≤–≤–æ–¥–µ –≤ –ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω—É—é —ç–∫—Å–ø–ª—É–∞—Ç–∞—Ü–∏—é –ø–ª–∞–≤—É—á–µ–π –∞—Ç–æ–º–Ω–æ–π —Ç–µ–ø–ª–æ—ç–ª–µ–∫—Ç—Ä–æ—Å—Ç–∞–Ω—Ü–∏–∏ (–ü–ê–¢–≠–°) –ø—Ä–æ–µ–∫—Ç–∞ 2025 ¬´–ê–∫–∞–¥–µ–º–∏–∫ –õ–æ–º–æ–Ω–æ—Å–æ–≤-2¬ª –≤ –∞–∫–≤–∞—Ç–æ—Ä–∏–∏ –ü–µ–≤–µ–∫–∞ (–ß—É–∫–æ—Ç—Å–∫–∏–π –ê–û).
–ù–æ–≤–∞—è —Å—Ç–∞–Ω—Ü–∏—è –æ—Å–Ω–∞—â–µ–Ω–∞ –¥–≤—É–º—è —Ä–µ–∞–∫—Ç–æ—Ä–∞–º–∏ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –ø–æ–∫–æ–ª–µ–Ω–∏—è –†–ò–¢–ú-200–ú –º–æ—â–Ω–æ—Å—Ç—å—é –ø–æ 60 –ú–í—Ç –∫–∞–∂–¥—ã–π –∏ —Å–ø–æ—Å–æ–±–Ω–∞ –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ –≤—ã—Ä–∞–±–∞—Ç—ã–≤–∞—Ç—å –¥–æ 120 –ú–í—Ç —ç–ª–µ–∫—Ç—Ä–æ—ç–Ω–µ—Ä–≥–∏–∏ –∏ –¥–æ 100 –ì–∫–∞–ª/—á —Ç–µ–ø–ª–∞. –ü–æ —Å—Ä–∞–≤–Ω–µ–Ω–∏—é —Å –ø–µ—Ä–≤–æ–π –ü–ê–¢–≠–° (¬´–ê–∫–∞–¥–µ–º–∏–∫ –õ–æ–º–æ–Ω–æ—Å–æ–≤¬ª, –∑–∞–ø—É—â–µ–Ω –≤ 2019 –≥–æ–¥—É), –Ω–æ–≤–∞—è —Å—Ç–∞–Ω—Ü–∏—è –Ω–∞ 30 % –∫–æ–º–ø–∞–∫—Ç–Ω–µ–µ, –Ω–∞ 20 % —ç–∫–æ–Ω–æ–º–∏—á–Ω–µ–µ –ø–æ —Ç–æ–ø–ª–∏–≤—É –∏ –∏–º–µ–µ—Ç —É–≤–µ–ª–∏—á–µ–Ω–Ω—ã–π —Å—Ä–æ–∫ —ç–∫—Å–ø–ª—É–∞—Ç–∞—Ü–∏–∏ –¥–æ 60 –ª–µ—Ç –±–µ–∑ –ø–µ—Ä–µ–∑–∞—Ä—è–¥–∫–∏.'''
print(classify_text(example))


{'prediction': 'Human-written', 'confidence': 0.9510883089432539}


In [None]:
import joblib
# Persist the pipeline object to "model.pkl" in the working directory.
# NOTE: the pipeline takes the stylometric columns as input, and those are
# computed outside of it by extract_stylo_features, so that function must also
# be available wherever this file is loaded for inference.
joblib.dump(model, "model.pkl")

['model.pkl']