## Логістична регресія 

In [None]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from preprocessing import (get_x, get_y, tokenize_x)

### Setup

In [42]:
# Load the dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv", encoding="utf-8")

In [4]:
# Shuffle all rows. The shuffle is seeded with the same value the later
# train/val/test splits use, so the whole pipeline is reproducible run to run.
df = df.sample(frac=1, random_state=42)

In [5]:
# Build the model inputs and targets from the dataframe using the helpers
# imported from the project-local `preprocessing` module.
# NOTE(review): presumably tokenize_x yields fixed-length numeric rows and
# get_y yields 0/1 labels -- confirm against preprocessing.py.
X = tokenize_x(get_x(df))
y = get_y(df)

In [6]:
# Convert features and labels to float32 tensors. Labels are reshaped into a
# single column so they line up with the model's one-output-per-row result.
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

In [43]:
# Train / validation / test split: 30% of the rows are held out first, then
# that hold-out is cut in half (roughly 70% / 15% / 15% overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [52]:
# Training length and model dimensions.
num_epochs = 1000
# Read the feature width from the data rather than hard-coding it: the linear
# layer is applied directly to X_train, so its input size must equal the last
# dimension of X_train or the forward pass fails.
input_dim = X_train.shape[-1]
# One sigmoid output per sample (binary classification).
output_dim = 1

In [85]:
class LogisticRegression(torch.nn.Module):
    """Logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: Size of each input feature vector.
        output_dim: Number of outputs produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the element-wise sigmoid of the linear projection of ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [86]:
# Instantiate the classifier with the dimensions configured above.
model = LogisticRegression(input_dim, output_dim)

In [87]:
# Binary cross-entropy on probabilities (the model already applies a sigmoid),
# averaged over all elements.
criterion = torch.nn.BCELoss(reduction="mean")

In [88]:
# Step size passed to the optimizer.
learning_rate = 1e-3

In [89]:
# Adam over every trainable parameter of the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

### Тренування

In [90]:
from sklearn.metrics import accuracy_score

In [91]:
# Full-batch training loop with a validation pass after every epoch.
#
# Per epoch it records training/validation loss and accuracy, and remembers
# the epoch (0-based) with the lowest validation loss together with a snapshot
# of the weights at that point. The snapshot is only recorded here; call
# model.load_state_dict(best_weights) afterwards to evaluate that checkpoint
# instead of the last-epoch weights.
import copy

best_loss = float('inf')
best_epoch = 0
best_weights = None  # stays None only if validation loss never improves (e.g. NaN)
accuracies_train = []
accuracies_val = []
losses_train = []
losses_val = []

for epoch in range(num_epochs):
    # ---- training step on the whole training set ----
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)

    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    # Store a plain float: appending the tensor itself would keep every
    # epoch's autograd graph alive for the lifetime of the list.
    losses_train.append(loss.item())

    # Training accuracy, from the predictions made before this step's update.
    with torch.no_grad():
        predicted = (outputs >= 0.5).squeeze().long()
        accuracy = accuracy_score(y_train, predicted)
        accuracies_train.append(accuracy)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
        losses_val.append(val_loss.item())

        predicted_val = (val_outputs >= 0.5).squeeze().long()
        accuracy_val = accuracy_score(y_val, predicted_val)
        accuracies_val.append(accuracy_val)

    if val_loss.item() < best_loss:
        best_loss = val_loss.item()
        best_epoch = epoch
        # state_dict() returns references to the live parameters, which the
        # optimizer keeps updating in place; deep-copy to freeze this epoch.
        best_weights = copy.deepcopy(model.state_dict())

    print(f"Epoch: {epoch+1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}")
print("Best epoch: ", best_epoch)


Epoch: 1/1000, Training Loss: 64.33207702636719, Validation Loss: 62.55073165893555
Epoch: 2/1000, Training Loss: 62.329071044921875, Validation Loss: 61.70651626586914
Epoch: 3/1000, Training Loss: 61.4993896484375, Validation Loss: 60.452850341796875
Epoch: 4/1000, Training Loss: 60.362857818603516, Validation Loss: 59.59870529174805
Epoch: 5/1000, Training Loss: 59.40127944946289, Validation Loss: 58.10063552856445
Epoch: 6/1000, Training Loss: 57.87353515625, Validation Loss: 56.708160400390625
Epoch: 7/1000, Training Loss: 56.58474349975586, Validation Loss: 55.08082580566406
Epoch: 8/1000, Training Loss: 55.06804656982422, Validation Loss: 53.58285903930664
Epoch: 9/1000, Training Loss: 53.52397918701172, Validation Loss: 51.767669677734375
Epoch: 10/1000, Training Loss: 51.78346633911133, Validation Loss: 50.64125061035156
Epoch: 11/1000, Training Loss: 50.67353057861328, Validation Loss: 48.47507858276367
Epoch: 12/1000, Training Loss: 48.49757385253906, Validation Loss: 46.883

In [None]:
# Turn the per-epoch histories into tensors so they can be handed to
# Matplotlib as NumPy arrays.
losses_train = torch.tensor(losses_train)
losses_val = torch.tensor(losses_val)
accuracies_train = torch.tensor(accuracies_train)
accuracies_val = torch.tensor(accuracies_val)

# One figure, two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training and validation loss per epoch.
ax_loss.plot(losses_train.numpy(), label='Training Loss')
ax_loss.plot(losses_val.numpy(), label='Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Training and Validation Loss')
ax_loss.legend()

# Right panel: training and validation accuracy per epoch.
ax_acc.plot(accuracies_train.numpy(), label='Training Accuracy')
ax_acc.plot(accuracies_val.numpy(), label='Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_title('Training and Validation Accuracy')
ax_acc.legend()

fig.tight_layout()
plt.show()


### Метрики на тестових даних

*Метрики*:
- Accuracy
- Precision
- Recall 
- F1

In [98]:
from sklearn.metrics import (precision_score, recall_score, f1_score)

In [99]:
# Predict on the held-out test split. Inference runs in eval mode and without
# autograd so no computation graph is built for the forward pass.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
# Threshold the predicted probabilities at 0.5 to get hard 0/1 class labels.
test_outputs = (test_outputs >= 0.5).squeeze().long()


In [100]:
# Score the thresholded test predictions against the true labels with each
# metric in turn (accuracy, precision, recall, F1).
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, test_outputs)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [96]:
# Print every test metric on its own line.
for metric_label, metric_value in (
    ("Accuracy:", test_accuracy),
    ("Precision:", test_precision),
    ("Recall:", test_recall),
    ("f1:", test_f1),
):
    print(metric_label, metric_value)

Accuracy: 0.9218838127467569
Precision: 0.9065520945220193
Recall: 0.9424902289223898
f1: 0.9241719134957569
