## Логістична регресія 

In [1]:
import os
# Route outbound HTTP(S) traffic through the proxy.
# Each value must be just the proxy URL: no "NAME=" prefix and no trailing
# whitespace, otherwise the URL cannot be parsed by HTTP clients.
# NOTE(review): the proxy address is hardcoded; consider reading it from the
# environment or a config cell so the notebook runs on other networks.
os.environ['http_proxy'] = 'http://10.144.1.10:8080/'
os.environ['https_proxy'] = 'http://10.144.1.10:8080/'

In [2]:
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from preprocessing import (get_x, get_y, tokenize_x)

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


### Setup

In [3]:
# Load the raw inputs, tokenize them into features, and load the labels.
# Evaluation order is kept: get_x -> tokenize_x -> get_y.
X, y = tokenize_x(get_x()), get_y()

In [4]:
# Convert features and labels to float32 tensors (torch.float is float32).
X = torch.tensor(X, dtype=torch.float32)
# Labels become a column vector (N, 1) so they line up with the model output.
y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

In [5]:
# Split into train / validation / test.
# First hold out 30 % of the samples, then halve that hold-out into
# validation and test, giving roughly 70 / 15 / 15.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [6]:
# Model dimensions.
# input_dim must equal the number of feature columns fed to the linear layer
# (assumes tokenize_x yields 25 features per sample -- TODO confirm).
input_dim = 25
output_dim = 1

# Optimisation settings.
num_epochs = 100
learning_rate = 1e-2

In [7]:
class LogisticRegression(torch.nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: size of the last dimension of the input features.
        output_dim: number of outputs produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return sigmoid(linear(x)), i.e. values in [0, 1]."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [8]:
# Seed PyTorch's global RNG before the model is built so the randomly
# initialised linear-layer weights (and every metric derived from them) are
# the same on each Restart & Run All. 42 matches the random_state used for
# the data split.
torch.manual_seed(42)
model = LogisticRegression(input_dim=input_dim, output_dim=output_dim)

In [9]:
# Binary cross-entropy on probabilities (the model already applies the
# sigmoid), averaged over all elements -- "mean" is BCELoss's default.
criterion = torch.nn.BCELoss(reduction="mean")

In [10]:
# Adam over all trainable parameters of the model, with the configured step size.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
)

### Тренування

In [11]:
from sklearn.metrics import accuracy_score

In [12]:
# Full-batch training with per-epoch validation.
#
# Tracked per epoch:
#   losses_train / losses_val         -- BCE loss as plain Python floats
#   accuracies_train / accuracies_val -- accuracy at a 0.5 threshold
# The weights with the lowest validation loss are snapshotted in best_weights.
best_loss = float('inf')
best_epoch = 0  # zero-based index, while the progress line below prints epoch+1
best_weights = None
accuracies_train = []
accuracies_val = []
losses_train = []
losses_val = []

for epoch in range(num_epochs):
    # ---- training step ----
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)

    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    # Store a float, not the tensor: keeping the tensor would keep every
    # epoch's autograd graph alive for as long as the list exists.
    losses_train.append(loss.item())

    # ---- metrics (no gradients needed from here on) ----
    model.eval()
    with torch.no_grad():
        # Training accuracy uses the outputs computed before this epoch's
        # optimizer step. reshape(-1) keeps a 1-D shape even for one sample
        # (squeeze() would collapse it to a 0-d tensor).
        predicted = (outputs >= 0.5).reshape(-1).long()
        accuracy = accuracy_score(y_train.reshape(-1), predicted)
        accuracies_train.append(accuracy)

        # Validation forward pass and loss.
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
        losses_val.append(val_loss.item())

        # Validation accuracy.
        predicted_val = (val_outputs >= 0.5).reshape(-1).long()
        accuracy_val = accuracy_score(y_val.reshape(-1), predicted_val)
        accuracies_val.append(accuracy_val)

    if val_loss.item() < best_loss:
        best_loss = val_loss.item()
        best_epoch = epoch
        # state_dict() returns references to the live parameters, so clone
        # them; otherwise the "best" weights keep changing as training goes on.
        best_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    print(f"Epoch: {epoch+1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}")
print("Best epoch: ", best_epoch)


Epoch: 1/100, Training Loss: 37.56047439575195, Validation Loss: 33.027069091796875
Epoch: 2/100, Training Loss: 34.56238555908203, Validation Loss: 29.333694458007812
Epoch: 3/100, Training Loss: 31.857528686523438, Validation Loss: 27.703554153442383
Epoch: 4/100, Training Loss: 29.49394416809082, Validation Loss: 25.809852600097656
Epoch: 5/100, Training Loss: 27.81968116760254, Validation Loss: 23.443328857421875
Epoch: 6/100, Training Loss: 26.00849723815918, Validation Loss: 22.17647361755371
Epoch: 7/100, Training Loss: 24.736053466796875, Validation Loss: 21.478273391723633
Epoch: 8/100, Training Loss: 23.189212799072266, Validation Loss: 19.96458625793457
Epoch: 9/100, Training Loss: 22.026399612426758, Validation Loss: 18.40545654296875
Epoch: 10/100, Training Loss: 20.717071533203125, Validation Loss: 17.837322235107422
Epoch: 11/100, Training Loss: 19.581762313842773, Validation Loss: 17.029876708984375
Epoch: 12/100, Training Loss: 18.4271297454834, Validation Loss: 16.216

In [None]:
# Convert the per-epoch histories to tensors.
losses_train = torch.tensor(losses_train)
losses_val = torch.tensor(losses_val)
accuracies_train = torch.tensor(accuracies_train)
accuracies_val = torch.tensor(accuracies_val)

# One figure, two side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training and validation losses.
loss_ax.plot(losses_train.numpy(), label='Training Loss')
loss_ax.plot(losses_val.numpy(), label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()

# Right panel: training and validation accuracies.
acc_ax.plot(accuracies_train.numpy(), label='Training Accuracy')
acc_ax.plot(accuracies_val.numpy(), label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()


### Метрики на тестових даних

*Метрики*:
- Accuracy
- Precision
- Recall 
- f1

In [14]:
from sklearn.metrics import (precision_score, recall_score, f1_score)

In [15]:
# Predict on the held-out test set.
# NOTE(review): best_weights captured during training is not restored here,
# so the metrics below reflect the final-epoch weights.
model.eval()  # set inference mode explicitly rather than relying on the training cell
with torch.no_grad():  # inference only: do not build an autograd graph
    test_probs = model(X_test)
# Threshold at 0.5; reshape(-1) keeps a 1-D label vector even for a single sample.
test_outputs = (test_probs >= 0.5).reshape(-1).long()


In [16]:
# Score the thresholded test predictions against the true test labels.
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_outputs)
test_precision = precision_score(y_true=y_test, y_pred=test_outputs)
test_recall = recall_score(y_true=y_test, y_pred=test_outputs)
test_f1 = f1_score(y_true=y_test, y_pred=test_outputs)

In [17]:
# Report the test metrics, one "<label> <value>" line each.
for metric_label, metric_value in (
    ("Accuracy:", test_accuracy),
    ("Precision:", test_precision),
    ("Recall:", test_recall),
    ("f1:", test_f1),
):
    print(metric_label, metric_value)

Accuracy: 0.8564593301435407
Precision: 0.21428571428571427
Recall: 0.026785714285714284
f1: 0.047619047619047616


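Recall (≈ 0.03) та F1 (≈ 0.05) близькі до 0, а precision низький (≈ 0.21). Це означає, що модель майже не знаходить true positives: вона майже завжди передбачає негативний клас. Висока accuracy (≈ 0.86) пояснюється дисбалансом класів (позитивних прикладів у тестовій вибірці лише близько 13 %), тому сама по собі accuracy тут не є показовою метрикою.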