In [31]:
# import bibliotek
import pandas as pd
import numpy as np

In [32]:
# odczyt danych
train = pd.read_csv("../data/raw/emotion_train.csv")
valid = pd.read_csv("../data/raw/emotion_validation.csv")

X_train = np.load("../data/processed/X_train.npy")
X_valid = np.load("../data/processed/X_valid.npy")

y_train = pd.read_csv("../data/processed/y_train.csv")['label']
y_valid = pd.read_csv("../data/processed/y_valid.csv")['label']

In [33]:
# klasyczny model ML (Logistic Regression)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

baseline_clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

baseline_clf.fit(X_train, y_train)

y_pred_lr = baseline_clf.predict(X_valid)

print("Dokładność:", accuracy_score(y_valid, y_pred_lr))
print(classification_report(y_valid, y_pred_lr))

Dokładność: 0.6176470588235294
              precision    recall  f1-score   support

           0       0.59      0.88      0.71       160
           1       0.78      0.41      0.54        97
           2       0.67      0.07      0.13        28
           3       0.58      0.54      0.56        89

    accuracy                           0.62       374
   macro avg       0.66      0.48      0.48       374
weighted avg       0.65      0.62      0.59       374



In [34]:
# sieć neuronowa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

num_features = X_train.shape[1]
num_classes = 4

nn_model = Sequential([
    Dense(256, activation='relu', input_shape=(num_features,)),
    Dropout(0.5),

    Dense(128, activation='relu'),
    
    Dense(num_classes, activation='softmax')
])

nn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
# analiza działania sieci
history = nn_model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
    batch_size=32,
    verbose=1
)

Epoch 1/10
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4473 - loss: 1.2269 - val_accuracy: 0.5642 - val_loss: 1.0895
Epoch 2/10
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7587 - loss: 0.6677 - val_accuracy: 0.6310 - val_loss: 0.9531
Epoch 3/10
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9091 - loss: 0.2715 - val_accuracy: 0.6390 - val_loss: 1.0824
Epoch 4/10
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9579 - loss: 0.1337 - val_accuracy: 0.6257 - val_loss: 1.1605
Epoch 5/10
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9671 - loss: 0.0953 - val_accuracy: 0.6283 - val_loss: 1.2409
Epoch 6/10
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9662 - loss: 0.0941 - val_accuracy: 0.6257 - val_loss: 1.2794
Epoch 7/10
[1m102/102[0m 

In [36]:
# ewaluacja
y_pred_nn = nn_model.predict(X_valid).argmax(axis=1)

print(classification_report(y_valid, y_pred_nn))

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       160
           1       0.58      0.58      0.58        97
           2       0.28      0.29      0.28        28
           3       0.56      0.54      0.55        89

    accuracy                           0.61       374
   macro avg       0.54      0.53      0.54       374
weighted avg       0.61      0.61      0.61       374



In [37]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# import danych
from datasets import Dataset

train_df = pd.read_csv("../data/processed/train_clean.csv")
valid_df = pd.read_csv("../data/processed/valid_clean.csv")

train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)

# model transformerowy (distilBERT)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch['clean_text'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

train_ds = train_ds.map(tokenize, batched=True)
valid_ds = valid_ds.map(tokenize, batched=True)

train_ds = train_ds.remove_columns(['clean_text'])
valid_ds = valid_ds.remove_columns(['clean_text'])

train_ds.set_format("torch")
valid_ds.set_format("torch")

transformer_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4
)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# fine-tuning
training_args = TrainingArguments(
    output_dir="../results",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="../results/logs",
    save_total_limit=1
)

trainer = Trainer(
    model=transformer_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds
)

trainer.train()

Step,Training Loss
500,0.3462


TrainOutput(global_step=612, training_loss=0.3159285775976243, metrics={'train_runtime': 889.8437, 'train_samples_per_second': 10.981, 'train_steps_per_second': 0.688, 'total_flos': 323596279415808.0, 'train_loss': 0.3159285775976243, 'epoch': 3.0})

In [40]:
# ewaulacja
trainer.evaluate()



{'eval_loss': 0.7747447490692139,
 'eval_runtime': 8.1612,
 'eval_samples_per_second': 45.827,
 'eval_steps_per_second': 2.941,
 'epoch': 3.0}

In [41]:
# porównanie wyników
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = y_valid 

preds_output = trainer.predict(valid_ds)
y_pred_tr = preds_output.predictions.argmax(axis=1)


def get_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return acc, precision, recall, f1

metrics = {}

metrics['Logistic Regression'] = get_metrics(y_true, y_pred_lr)
metrics['Neural Network'] = get_metrics(y_true, y_pred_nn)
metrics['Transformer'] = get_metrics(y_true, y_pred_tr)

df_results = pd.DataFrame(metrics, index=['Dokładność', 'Precyzja', 'Pełność', 'F-miara']).T
print(df_results)

                     Dokładność  Precyzja   Pełność   F-miara
Logistic Regression    0.617647  0.656058  0.476094  0.484510
Neural Network         0.614973  0.535315  0.534965  0.535059
Transformer            0.759358  0.698598  0.700092  0.697481


In [43]:
# zapis modeli
import joblib
joblib.dump(clf, "../models/logistic_regression.joblib")
nn_model.save("../models/nn_model.h5")
transformer_model.save_pretrained("../models/transformer_model")
tokenizer.save_pretrained("../models/transformer_model")



('../models/transformer_model\\tokenizer_config.json',
 '../models/transformer_model\\special_tokens_map.json',
 '../models/transformer_model\\vocab.txt',
 '../models/transformer_model\\added_tokens.json',
 '../models/transformer_model\\tokenizer.json')