In [1]:
# %pip install torch pandas transformers scikit-learn matplotlib nltk pymorphy2 pymorphy2-dicts-uk

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from preprocessing import (get_x, 
                           get_y, 
                           tokenize_x, 
                           tokenize_titles,
                           balance_data, 
                           get_x1, 
                           cat_titles_and_texts)

In [3]:
# import nltk
# nltk.download('punkt')

In [4]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Using CPU instead.")

Using GPU: NVIDIA A10G


In [5]:
# First corpus: read the CSV, then extract texts, labels and titles with the
# project's preprocessing helpers.
df = pd.read_csv("translated.csv")
x, y, x_titles = get_x(df), get_y(df), get_x1(df)


In [6]:
# Second corpus: read the CSV and pass it through balance_data before
# extracting texts, labels and titles.
zepopo_raw = pd.read_csv("src/data/data_set_4.csv")
df_zepopo = balance_data(zepopo_raw)
print(len(df_zepopo))
x1, y1, x1_titles = get_x(df_zepopo), get_y(df_zepopo), get_x1(df_zepopo)

Int64Index([ 706, 5325, 1665, 6675, 5605, 2317, 1691, 5030,  318, 3745,
            ...
            6500,  422, 1220, 3439, 3877, 7724, 6732,  102, 2418, 7796],
           dtype='int64', length=2498)
4913


In [7]:
# Third corpus: read the CSV as-is, report its size, then extract texts,
# labels and titles.
df_realdata = pd.read_csv("src/data/new_real_news.csv")
print(len(df_realdata))
x2, y2, x2_titles = get_x(df_realdata), get_y(df_realdata), get_x1(df_realdata)

1188


In [8]:
# merging two datasets together
x += x1
x += x2

y += y1
y += y2

x_titles += x1_titles
x_titles += x2_titles

x = cat_titles_and_texts(titles=x_titles, texts=x)

In [9]:
# Labels as a float32 column vector of shape (N, 1), matching the model's single output.
y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

In [10]:
import json

# Read the previously built dictionary (later passed to the tokenize_* helpers).
with open("src/misc/dictionary.json", mode="r", encoding="utf-8") as dictionary_file:
    dictionary = json.load(dictionary_file)

In [11]:
# Two-stage split with a fixed seed: hold out 30% of the data first ...
X_train, X_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)
# ... then cut the held-out part in half to get validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
# Number of samples in the training split.
len(X_train)

9069

In [None]:
# Tokenize each split with the shared dictionary and wrap the result as integer tensors.
X_train_tensor = torch.LongTensor(tokenize_x(X_train, dictionary=dictionary))
X_val_tensor = torch.LongTensor(tokenize_x(X_val, dictionary=dictionary))
X_test_tensor = torch.LongTensor(tokenize_x(X_test, dictionary=dictionary))


In [None]:
# Show which device was selected above.
print(device)

In [None]:
# Spot-check: the first raw test sample next to its tokenized tensor.
print(X_test[0], X_test_tensor[0])

In [None]:
# making sure the dataset is balanced
# `y` is an (N, 1) label tensor at this point, so count each class with a
# vectorized comparison instead of iterating over it element by element.
print("real news")
print(int((y == 0).sum()))
print("fake news")
print(int((y == 1).sum()))

In [None]:
# Move the feature tensors to the selected device ...
X_train_tensor, X_val_tensor, X_test_tensor = (
    features.to(device) for features in (X_train_tensor, X_val_tensor, X_test_tensor)
)

# ... and the label tensors alongside them.
y_train, y_val, y_test = (
    labels.to(device) for labels in (y_train, y_val, y_test)
)


In [None]:
class LogisticRegression(nn.Module):
    """Single linear layer followed by a sigmoid.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of outputs per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return sigmoid(linear(x)), i.e. values in (0, 1)."""
        return self.linear(x).sigmoid()

In [None]:
# Model size and optimisation settings.
input_dim = X_train_tensor.shape[1]  # feature count per sample
output_dim = 1
learning_rate = 0.005
num_epochs = 5000

In [None]:
# Build the model on the target device, with binary cross-entropy loss and Adam.
model = LogisticRegression(input_dim=input_dim, output_dim=output_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
import numpy as np

# Full-batch training with early stopping on the validation loss.
best_loss = float('inf')
best_epoch = 0
best_weights = None  # set up front so the restore step never sees an undefined name
accuracies_train = []
accuracies_val = []
losses_train = []
losses_val = []

# Stop after this many consecutive epochs without a validation-loss improvement.
patience = 50
patience_counter = 0

for epoch in range(num_epochs):
    # --- training step on the whole training set ---
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor.float())

    # loss
    loss = criterion(outputs, y_train)
    losses_train.append(loss.item())  # Logging the loss value
    loss.backward()
    optimizer.step()

    # accuracy (need to move tensors to CPU before converting to NumPy)
    with torch.no_grad():
        predicted = (outputs >= 0.5).squeeze().long()
        accuracy = accuracy_score(y_train.cpu().numpy(), predicted.cpu().numpy())
        accuracies_train.append(accuracy)

    # --- validation: no gradients needed, so skip building the autograd graph ---
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor.float())
        val_loss = criterion(val_outputs, y_val)
        predicted_val = (val_outputs >= 0.5).squeeze().long()
        accuracy_val = accuracy_score(y_val.cpu().numpy(), predicted_val.cpu().numpy())

    val_loss_value = val_loss.item()
    losses_val.append(val_loss_value)  # Logging the validation loss value
    accuracies_val.append(accuracy_val)

    if val_loss_value < best_loss:
        best_loss = val_loss_value
        best_epoch = epoch
        # state_dict() returns references to the live parameters, which the
        # optimizer keeps updating in place; clone them to get a real snapshot.
        best_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        patience_counter = 0  # reset patience counter on improvement
    else:
        patience_counter += 1

    print(f"Epoch: {epoch+1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss_value}")

    if patience_counter >= patience:
        print("Early stopping triggered")
        break

# best_epoch is 0-based (it indexes the history lists), unlike the 1-based log line.
print("Best epoch: ", best_epoch)


In [None]:
# Load the state recorded when the validation loss was lowest, if one was recorded.
if best_weights is not None:
    model.load_state_dict(best_weights)

In [None]:
# Metrics at the best epoch: train accuracy, val accuracy, train loss, val loss.
best_metrics = (
    accuracies_train[best_epoch],
    accuracies_val[best_epoch],
    losses_train[best_epoch],
    losses_val[best_epoch],
)
print(*best_metrics)

In [None]:
# lists to tensors
losses_train = torch.tensor(losses_train)
losses_val = torch.tensor(losses_val)
accuracies_train = torch.tensor(accuracies_train)
accuracies_val = torch.tensor(accuracies_val)

# training and validation losses
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(losses_train.detach().numpy(), label='Training Loss')
plt.plot(losses_val.detach().numpy(), label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# training and validation accuracies
plt.subplot(1, 2, 2)
plt.plot(accuracies_train.detach().numpy(), label='Training Accuracy')
plt.plot(accuracies_val.detach().numpy(), label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import (precision_score, recall_score, f1_score)

In [None]:
# Inference only: make sure the model is in eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor.float())
# Threshold the probabilities at 0.5 to get 0/1 class predictions.
test_outputs = (test_outputs >= 0.5).squeeze().long()

In [None]:
# scikit-learn metrics need host-side NumPy arrays.
# NOTE: both names are rebound from tensors to arrays, so this cell runs only once per kernel state.
test_outputs, y_test = test_outputs.cpu().numpy(), y_test.cpu().numpy()

In [None]:
# Score the held-out split with each metric, in a fixed order.
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, test_outputs)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the held-out metrics, one per line.
for metric_label, metric_value in (
    ("Accuracy:", test_accuracy),
    ("Precision:", test_precision),
    ("Recall:", test_recall),
    ("f1:", test_f1),
):
    print(metric_label, metric_value)

Evaluation on the separate test dataset (87 texts)

In [None]:
import json

# Load the separate evaluation set; each entry is read for "text", "title" and "label".
with open("src/data/test_set.json", "r", encoding="utf-8") as f:
    test_texts = json.load(f)

# Split texts and titles by label; within each label the two lists stay index-aligned.
fake_texts = [t["text"] for t in test_texts if t["label"] == "Fake"]
fake_titles = [t["title"] for t in test_texts if t["label"] == "Fake"]
real_texts = [t["text"] for t in test_texts if t["label"] == "Real"]
# Fixed: this previously read t["text"], so real articles got their body as the title.
real_titles = [t["title"] for t in test_texts if t["label"] == "Real"]

In [None]:
# Number of fake and real articles in the evaluation set.
print(len(fake_texts), len(real_texts))

In [None]:
# Build the evaluation frame in one go from a list of records: fake articles
# first, then real ones. No row offset is hard-coded, so this works for any
# number of fake/real items, and the index is simply 0..n-1.
test_records = [
    {"ukr_text": text, "label": "Fake", "title_ukr": title}
    for text, title in zip(fake_texts, fake_titles)
] + [
    {"ukr_text": text, "label": "Real", "title_ukr": title}
    for text, title in zip(real_texts, real_titles)
]
df = pd.DataFrame(test_records, columns=["ukr_text", "label", "title_ukr"])

In [None]:
# Build evaluation features the same way as the training features: the model
# was fit on cat_titles_and_texts(...) output passed through tokenize_x, so
# the titles are combined with the texts here as well, not tokenized text-only.
test_inputs = cat_titles_and_texts(titles=get_x1(df), texts=get_x(df))
test_data_x = torch.LongTensor(tokenize_x(test_inputs, dictionary=dictionary))
# Separately tokenized titles (kept because a later cell moves this tensor to the device).
test_data_x_titles = torch.LongTensor(tokenize_titles(get_x1(df), dictionary=dictionary))
# Labels as a float column vector of shape (N, 1); a new tensor is already on the CPU.
test_data_y = torch.tensor(get_y(df), dtype=torch.float).view(-1, 1)

In [None]:
# Move the evaluation inputs to the model's device; the labels stay on the CPU.
test_data_x, test_data_x_titles = (
    features.to(device) for features in (test_data_x, test_data_x_titles)
)

In [None]:
# Inference only: make sure the model is in eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    test_outputs = model(test_data_x.float())
# Threshold the probabilities at 0.5 to get 0/1 class predictions.
test_outputs = (test_outputs >= 0.5).squeeze().long()

In [None]:
# Display the thresholded 0/1 predictions for the evaluation set.
test_outputs

In [None]:
# scikit-learn metrics need host-side NumPy arrays.
# NOTE: both names are rebound from tensors to arrays, so this cell runs only once per kernel state.
test_outputs, test_data_y = test_outputs.cpu().numpy(), test_data_y.cpu().numpy()

In [None]:
# Score the evaluation set with each metric, in a fixed order.
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(test_data_y, test_outputs)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Accuracy, precision, recall and F1 on the evaluation set, in that order.
print(test_accuracy, test_precision, test_recall, test_f1)