In [None]:
import numpy as np
import pandas as pd
import os


MODEL KLASYCZNY ML


In [None]:
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Sparse feature matrices (TF-IDF, judging by the file names) for both splits.
X_train, X_test = (
    load_npz(matrix_path)
    for matrix_path in (
        'data/processed/X_train_tfidf.npz',
        'data/processed/X_test_tfidf.npz',
    )
)

# Label vectors that go with the two matrices above.
y_train, y_test = (
    np.load(label_path)
    for label_path in (
        'data/processed/y_train.npy',
        'data/processed/y_test.npy',
    )
)

# Displayed as the cell output: (train shape, test shape).
(X_train.shape, X_test.shape)


In [None]:
# Baseline classifier: logistic regression on the sparse features.
# max_iter is raised to 1000 and n_jobs=-1 requests all available cores.
logreg_params = {"max_iter": 1000, "n_jobs": -1}
ml_model = LogisticRegression(**logreg_params)

# fit() returns the estimator, so the fitted model is shown as the cell output.
ml_model.fit(X_train, y_train)


In [None]:
# Score the logistic-regression baseline on the held-out split.
y_pred_ml = ml_model.predict(X_test)

ml_accuracy = accuracy_score(y_test, y_pred_ml)
ml_report = classification_report(y_test, y_pred_ml)

print("Accuracy:", ml_accuracy)
print(ml_report)


In [None]:
# Make sure the output directory exists, then serialise the fitted estimator.
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

joblib.dump(ml_model, 'models/logistic_regression.pkl')


SIEĆ NEURONOWA OD ZERA

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


In [None]:
def _dense_float32(sparse_matrix):
    """Densify a sparse matrix and wrap it in a float32 tensor."""
    return torch.tensor(sparse_matrix.toarray(), dtype=torch.float32)


def _float32_labels(label_array):
    """Wrap a label array in a float32 tensor (BCELoss expects float targets)."""
    return torch.tensor(label_array, dtype=torch.float32)


# Features: the whole sparse matrix is materialised densely in memory here.
X_train_dense = _dense_float32(X_train)
X_test_dense = _dense_float32(X_test)

# Targets.
y_train_t = _float32_labels(y_train)
y_test_t = _float32_labels(y_test)

# Pair features with targets; only the training split gets a shuffled loader.
train_ds = TensorDataset(X_train_dense, y_train_t)
test_ds = TensorDataset(X_test_dense, y_test_t)

train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)


In [None]:
class SimpleNN(nn.Module):
    """Feed-forward binary classifier: input_dim -> 256 -> 128 -> 1.

    Hidden layers use ReLU; the single output unit goes through a sigmoid,
    so ``forward`` returns values in (0, 1) with shape ``(batch, 1)``.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 256, 128]

        # Hidden stack: Linear + ReLU for each consecutive pair of widths.
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())

        # Output head: one unit squashed to a probability.
        stack.append(nn.Linear(widths[-1], 1))
        stack.append(nn.Sigmoid())

        # Kept under the attribute name `net` so state_dict keys stay `net.<idx>.*`.
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run the batch ``x`` through the layer stack."""
        return self.net(x)


In [None]:
# Train the feed-forward classifier with Adam and binary cross-entropy.
#
# Seed PyTorch first: both the weight initialisation below and the shuffling
# done by the training DataLoader draw from the global RNG, so without a seed
# every run of this cell produces a different model and different metrics.
torch.manual_seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_nn = SimpleNN(X_train.shape[1]).to(device)

# The network already ends in a Sigmoid, so it outputs probabilities and
# plain BCELoss (not the logits variant) is the matching criterion.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_nn.parameters(), lr=0.001)

epochs = 5

for epoch in range(epochs):
    model_nn.train()
    total_loss = 0

    for Xb, yb in train_loader:
        # Targets get a trailing dimension so they match the (batch, 1) predictions.
        Xb, yb = Xb.to(device), yb.to(device).unsqueeze(1)

        optimizer.zero_grad()
        preds = model_nn(Xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Note: this is the sum of the per-batch losses over the epoch, not a mean.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")


In [None]:
# Score the network on the full test split in a single forward pass.
model_nn.eval()
with torch.no_grad():
    test_probs = model_nn(X_test_dense.to(device)).cpu().numpy()

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
preds = (test_probs > 0.5).astype(int)

nn_accuracy = accuracy_score(y_test, preds)
nn_report = classification_report(y_test, preds)

print("Accuracy:", nn_accuracy)
print(nn_report)


In [None]:
# Persist only the learned weights (state_dict); reloading requires
# re-creating SimpleNN with the same input_dim first.
# Create the target directory here so this cell does not depend on another
# cell having created it beforehand.
os.makedirs('models', exist_ok=True)
torch.save(model_nn.state_dict(), 'models/nn_from_scratch.pt')


TRANSFORMER (FINE-TUNING)

In [None]:
!pip install -q transformers datasets accelerate


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


In [None]:
# Load the text data used for fine-tuning.
train_df = pd.read_csv('data/processed/train_transformer.csv')
test_df  = pd.read_csv('data/processed/test_transformer.csv')

# Work on a fixed-seed subsample (at most 3000 train / 500 test rows).
# The size is capped at the frame length because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
train_df = train_df.sample(min(3000, len(train_df)), random_state=42)
test_df  = test_df.sample(min(500, len(test_df)), random_state=42)

train_df.head()


In [None]:
# Checkpoint identifier; reused further down to load the classification model.
model_name = "distilbert-base-uncased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the ``clean_text`` column of a batch.

    Args:
        batch: mapping with a ``clean_text`` entry holding a list of texts
            (the shape produced by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        128 tokens per example.
    """
    # Empty texts come back from a CSV round-trip as missing values (None once
    # inside a datasets.Dataset), which the tokenizer rejects. Map those to an
    # empty string and coerce everything else to str.
    texts = ["" if text is None else str(text) for text in batch['clean_text']]
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128
    )

def _to_model_inputs(frame):
    """Turn a DataFrame into a tokenized datasets.Dataset that yields torch tensors."""
    encoded = Dataset.from_pandas(frame).map(tokenize, batched=True)
    # The target column is renamed from "label" to "labels".
    encoded = encoded.rename_column("label", "labels")
    # Only these three columns are exposed, as torch tensors.
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded


# Note: these names are rebound here; from this point on they hold
# datasets.Dataset objects.
train_ds = _to_model_inputs(train_df)
test_ds = _to_model_inputs(test_df)


In [None]:
# Pretrained checkpoint with a sequence-classification head sized for two classes.
transformer_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [None]:
# Fine-tuning configuration, collected in one mapping so the knobs are easy to scan.
training_config = {
    # where checkpoints and logs go
    "output_dir": "models/transformer",
    "logging_dir": "results/logs",
    "report_to": "none",
    # evaluate and checkpoint once per epoch
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    # optimisation
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
}

training_args = TrainingArguments(**training_config)


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: the (predictions, label_ids) pair the Trainer passes in;
            predictions are the per-class logits.

    Returns:
        dict with a single ``accuracy`` entry.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted)}


# Without compute_metrics the Trainer reports only the evaluation loss; adding
# accuracy makes the result comparable with the two models evaluated earlier.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` - confirm against the installed version.
trainer = Trainer(
    model=transformer_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
# Evaluate on the eval_dataset given to the Trainer; the returned metrics
# are shown as the cell output.
trainer.evaluate()


In [None]:
# Store the fine-tuned model and its tokenizer side by side in one directory.
transformer_dir = "models/transformer"

trainer.save_model(transformer_dir)
tokenizer.save_pretrained(transformer_dir)
