In [339]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np


In [305]:
df = pd.read_csv(r"C:\Users\Roman Shypka\OneDrive\–†–æ–±–æ—á–∏–π —Å—Ç—ñ–ª\news_data.csv")


In [306]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50030 entries, 0 to 50029
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  50030 non-null  int64 
 1   Text        50027 non-null  object
 2   Label       50030 non-null  bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 830.7+ KB


In [413]:
## clean data from diferent http
df["Text"] = df["Text"].astype(str).str.replace(r'https?://\S+|www\.\S+|bit\.ly\S+',"", regex=True).str.strip()

In [412]:
df[df["Text"].str.contains("https",na=False)]

Unnamed: 0.1,Unnamed: 0,Text,Label
12115,12115,7 –Ω–∞–π—Ü—ñ–∫–∞–≤—ñ—à–∏—Ö —Ç–µ–∫—Å—Ç—ñ–≤ –¢–°–ùua –∑–∞ —Ç–∏–∂–¥–µ–Ω—å–°–ø–∞–ª–∞—Ö ...,1


In [309]:
# conversion from bool type in int 
le = LabelEncoder()
df["Label"] = le.fit_transform(df["Label"])

In [414]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Text,Label
0,0,–í –£–∫—Ä–∞—ó–Ω—ñ –∑–∞–ø—É—Å—Ç—è—Ç—å –º–µ—Ö–∞–Ω—ñ–∑–º –ø–µ—Ä–µ–≤—ñ—Ä–∫–∏ –Ω–∞—è–≤–Ω–æ—Å...,1
1,1,–£ –§–ª–æ—Ä–∏–¥—ñ —á–µ—Ä–µ–∑ —Ä–µ–∫–æ—Ä–¥–Ω–æ –Ω–∏–∑—å–∫—ñ —Ç–µ–º–ø–µ—Ä–∞—Ç—É—Ä–∏ –Ω–∞...,1
2,2,–¢–ì-–∫–∞–Ω–∞–ª FUCK–¢–ò –ø–æ–≤—ñ–¥–æ–º–∏–≤ –ø—Ä–æ –±–æ–º–±–∞—Ä–¥—É–≤–∞–Ω–Ω—è –ö–∏...,0
3,3,–û–∫—É–ø–æ–≤–∞–Ω–∏–π –ú–∞—Ä—ñ—É–ø–æ–ª—å –Ω–µ –∑–¥–∞—î—Ç—å—Å—è. –Æ–Ω–∞–∫ –≤–∏–π—à–æ–≤ ...,1
4,4,"–ß–µ—Ä–Ω—ñ–≥—ñ–≤, –ö—Ä–æ–ø–∏–≤–Ω–∏—Ü—å–∫–∏–π —ñ –æ–±–ª–∞—Å—Ç—å ‚Äî –ø–æ–≤—ñ—Ç—Ä—è–Ω–∞ ...",1


In [415]:
df["Label"].value_counts()

Label
1    48006
0     2024
Name: count, dtype: int64

## create balance data

In [312]:
fakes = df[df["Label"] == 0] 

In [313]:
trues = df[df["Label"] == 1 ].sample(n=7000,random_state=42)


In [416]:
df_balanced = pd.concat([trues,fakes]).sample(frac=1,random_state=42).reset_index(drop=True)
df_balanced["Label"].value_counts()

Label
1    7000
0    2024
Name: count, dtype: int64

In [383]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

## Train

In [384]:
X = (df_balanced["Text"])
y = df_balanced["Label"]

In [385]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [386]:
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

In [387]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [388]:
predictions = model.predict(X_test)

## Metrics

In [389]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [390]:
metrics = ["accuracy","precision","recall","f1"]

In [394]:
print(classification_report(y_test, predictions , target_names = ["Fake(0)","True(1)"]))

              precision    recall  f1-score   support

     Fake(0)       0.99      0.63      0.77       437
     True(1)       0.89      1.00      0.94      1368

    accuracy                           0.91      1805
   macro avg       0.94      0.81      0.86      1805
weighted avg       0.92      0.91      0.90      1805



In [441]:
my_text = "–ó–µ–ª–µ–Ω—Å—å–∫–∏–π –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç —É–∫—Ä–∞—ó–Ω–∏ —Ç–µ–ª–µ–≥—Ä–∞–º " 

In [442]:
my_vector = vectorizer.transform([my_text])

In [443]:
prediction = model.predict(my_vector)

In [444]:
# model primitive and not smart 
prediction

array([0])

In [445]:
pd.Series(predictions).value_counts()

1    1529
0     276
Name: count, dtype: int64

In [411]:
y_train.value_counts()

Label
1    5632
0    1587
Name: count, dtype: int64

In [419]:
# 1. Retrieve all features (vocabulary) learned by the vectorizer
feature_names = vectorizer.get_feature_names_out()

# 2. Extract weights (coefficients) from the trained logistic regression model
# model.coef_[0] contains the importance values for each feature
coefficients = model.coef_[0]

# 3. Pair each feature name with its corresponding coefficient
word_weights = list(zip(feature_names, coefficients))

# 4. Sort the list by weights (from the most negative to the most positive)
sorted_words = sorted(word_weights, key=lambda x: x[1])

# --- DISPLAYING THE RESULTS ---

print("üõë TOP-20 words that strongly indicate 'FAKE' (pulling towards 0):")
print("-" * 50)
# Slice the first 20 words with the lowest (negative) coefficients
for word, weight in sorted_words[:20]:
    print(f"{word}: {weight:.4f}")

print("\n ‚úÖ TOP-20 words that strongly indicate 'TRUE' (pulling towards 1):")
print("-" * 50)
# Slice the last 20 words with the highest (positive) coefficients and reverse them
for word, weight in reversed(sorted_words[-20:]):
    print(f"{word}: {weight:.4f}")

üõë TOP-20 words that strongly indicate 'FAKE' (pulling towards 0):
--------------------------------------------------
—Ç–µ–ª–µ–≥—Ä–∞–º: -7.7016
–ø—Ä–æ —Ü–µ: -6.0191
–∫–∞–Ω–∞–ª–∏: -4.7637
–∑–º—ñ: -4.5522
—É–∫—Ä–∞—ó–Ω—Å—å–∫—ñ: -4.3787
—Ä–æ—Å—ñ—ó: -4.3354
—Ç–µ–ª–µ–≥—Ä–∞–º –∫–∞–Ω–∞–ª–∏: -3.6781
–∫–∞–Ω–∞–ª–∞—Ö: -3.6043
–ø–æ–≤—ñ–¥–æ–º–ª–µ–Ω–Ω—è: -3.3272
–ø—Ä–æ: -3.0683
—Ü–µ: -3.0306
telegram: -2.9411
—Ä–æ—Å—ñ–π—Å—å–∫—ñ: -2.9208
–ø–æ—à–∏—Ä—é—î—Ç—å—Å—è: -2.8752
—Ç–∞–∫—ñ: -2.6878
—Ç–µ–ª–µ–≥—Ä–∞–º –∫–∞–Ω–∞–ª–∞—Ö: -2.6540
—Ç–∞–∫—ñ –ø–æ–≤—ñ–¥–æ–º–ª–µ–Ω–Ω—è: -2.5574
–ø–∏—à—É—Ç—å: -2.4398
—É–∫—Ä–∞—ó–Ω—Å—å–∫—ñ —Ç–µ–ª–µ–≥—Ä–∞–º: -2.3926
–ø–æ–≤—ñ–¥–æ–º–ª—è—é—Ç—å: -2.2967

 ‚úÖ TOP-20 words that strongly indicate 'TRUE' (pulling towards 1):
--------------------------------------------------
—Ç—Å–Ω: 4.2081
—á–∏—Ç–∞–π—Ç–µ: 3.1856
–¥–∞–ª—ñ: 3.1262
—á–∏—Ç–∞–π—Ç–µ –¥–∞–ª—ñ: 3.0597
–æ–∫—É–ø–∞–Ω—Ç–∏: 2.4218
—â–æ: 2.2923
ua: 2.1572
—Ç—Å–Ω ua: 2.0880
—Ç—Ä–∏–≤–æ–≥–∞: 2.0733
–∑–∞: 2.0688
—è–∫: 1.8887
–ø–æ–≤—ñ—Ç—