# Домашнее задание. Часть 2 (HW)

Задача

1. Обучите несколько моделей рекуррентных нейронных сетей, например LSTM, GRU, Bidirectional-LSTM.
   
2. Посчитайте значение метрики, которую вы предложили в части №1 и сравните результаты для разных RNN, эвристик и классического ML.

## Обучение моделей RNN

### LTSM модель

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import pandas as pd

# метки классов
labels = [0, 1, 2, 3, 4]

# Считываем данные из датафрейма полученного в первом задании
df = pd.read_json(r'data\data.json')

# Переименовываем целевые классы
df['labels'] = df['sentiment'].map({'extremely negative': 0, 'negative': 1, 'neutral':2, 'positive':3, "extremely positive":4})

# Разделение на обучающую и тестовуую выборки
X_train, X_test, y_train, y_test = train_test_split(df['lemmatization'], df["labels"], test_size=0.2, random_state=42)

# Параметры
max_words = 10000  # Максимальное количество слов в словаре

# Токенизация
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# дополненние последовательностей
max_length = max(len(x) for x in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# one hot кодирование
y_train = to_categorical(y_train, num_classes=5)
y_test = to_categorical(y_test, num_classes=5)


In [3]:
import numpy as np

embedding_dim = 100

# Загрузка GloVe
embedding_index = {}
with open(r"data\glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coefs

# Создание матрицы эмбеддингов
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Встраивание предобученных эмбеддингов в модель
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))

# Оптимизатор
optimizer = Adam(learning_rate=0.0001)

# Компиляция модели
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()



In [5]:
# Обучение
history = model.fit(X_train_pad, y_train,
                    epochs=10,   # Количество эпох
                    batch_size=32,  # Размер батча
                    validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")



Epoch 1/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 20ms/step - accuracy: 0.2916 - loss: 1.5381 - val_accuracy: 0.4138 - val_loss: 1.3271
Epoch 2/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20ms/step - accuracy: 0.4366 - loss: 1.2932 - val_accuracy: 0.4819 - val_loss: 1.2074
Epoch 3/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.5096 - loss: 1.1552 - val_accuracy: 0.5309 - val_loss: 1.0963
Epoch 4/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.5584 - loss: 1.0558 - val_accuracy: 0.5672 - val_loss: 1.0361
Epoch 5/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.6074 - loss: 0.9659 - val_accuracy: 0.5997 - val_loss: 0.9705
Epoch 6/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 21ms/step - accuracy: 0.6507 - loss: 0.8930 - val_accuracy: 0.6318 - val_loss: 0.9227
Epoc

In [6]:
# Итоговая оценка модели после обучения
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6821 - loss: 0.8410
Test Loss: 0.8308866620063782
Test Accuracy: 0.6869533658027649


In [7]:
# Прогноз
pred = model.predict(X_test_pad)

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [8]:
import numpy as np

# Преобразование one-hot в классы
predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Матрица ошибок
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Отчёт по классификации для модели LTSM 
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[ 682  371    8   28    8]
 [ 157 1397  172  291   26]
 [   3  214 1061  213   21]
 [  15  256  180 1487  324]
 [   3   32    5  250 1028]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.62      0.70      1097
           1       0.62      0.68      0.65      2043
           2       0.74      0.70      0.72      1512
           3       0.66      0.66      0.66      2262
           4       0.73      0.78      0.75      1318

    accuracy                           0.69      8232
   macro avg       0.71      0.69      0.70      8232
weighted avg       0.69      0.69      0.69      8232



### GRU

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Встраивание предобученных эмбеддингов в модель
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True))
model.add(GRU(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='sigmoid'))


optimizer = Adam(learning_rate=0.0001)

# Компиляция модели
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()



In [11]:
# Обучение
history = model.fit(X_train_pad, y_train,
                    epochs=10,   # Количество эпох
                    batch_size=32,  # Размер батча
                    validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 21ms/step - accuracy: 0.2545 - loss: 1.6073 - val_accuracy: 0.3059 - val_loss: 1.5254
Epoch 2/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.3240 - loss: 1.4902 - val_accuracy: 0.4510 - val_loss: 1.2639
Epoch 3/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 21ms/step - accuracy: 0.4655 - loss: 1.2246 - val_accuracy: 0.5260 - val_loss: 1.1213
Epoch 4/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - accuracy: 0.5450 - loss: 1.0889 - val_accuracy: 0.5832 - val_loss: 1.0199
Epoch 5/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 23ms/step - accuracy: 0.6018 - loss: 0.9777 - val_accuracy: 0.6142 - val_loss: 0.9513
Epoch 6/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 21ms/step - accuracy: 0.6445 - loss: 0.8958 - val_accuracy: 0.6464 - val_loss: 0.8943
Epoc

In [12]:
# Итоговая оценка модели после обучения
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6984 - loss: 0.8046
Test Loss: 0.7969692945480347
Test Accuracy: 0.7021380066871643


In [13]:
# Прогноз
pred = model.predict(X_test_pad)

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [14]:
# Конвертация one-hot в классы
import numpy as np

predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Матрица ошибок
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Отчёт классификации
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[ 806  252   10   22    7]
 [ 276 1346  162  248   11]
 [   8  195 1151  142   16]
 [  24  276  227 1527  208]
 [   3   33   10  322  950]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.73      0.73      1097
           1       0.64      0.66      0.65      2043
           2       0.74      0.76      0.75      1512
           3       0.68      0.68      0.68      2262
           4       0.80      0.72      0.76      1318

    accuracy                           0.70      8232
   macro avg       0.71      0.71      0.71      8232
weighted avg       0.70      0.70      0.70      8232



### Bidirectional RNN

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [17]:
# Встраивание предобученных эмбеддингов в модель
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True))
model.add(Bidirectional(LSTM(units=64, return_sequences=False)))
model.add(Dense(5, activation='softmax'))


optimizer = Adam(learning_rate=0.0001)

# Компиляция модели
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()



In [18]:
# Обучение
history = model.fit(X_train_pad, y_train,
                    epochs=10,   # Количество эпох
                    batch_size=32,  # Размер батча
                    validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 24ms/step - accuracy: 0.3142 - loss: 1.5074 - val_accuracy: 0.4395 - val_loss: 1.2914
Epoch 2/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 27ms/step - accuracy: 0.4660 - loss: 1.2419 - val_accuracy: 0.5131 - val_loss: 1.1473
Epoch 3/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.5323 - loss: 1.1166 - val_accuracy: 0.5488 - val_loss: 1.0686
Epoch 4/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.5880 - loss: 1.0058 - val_accuracy: 0.5781 - val_loss: 1.0175
Epoch 5/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.6252 - loss: 0.9353 - val_accuracy: 0.6143 - val_loss: 0.9520
Epoch 6/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.6604 - loss: 0.8656 - val_accuracy: 0.6322 - val_loss: 0.9152
Epoc

In [19]:
# Итоговая оценка модели после обучения
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6734 - loss: 0.8626
Test Loss: 0.8486164808273315
Test Accuracy: 0.6745626926422119


In [20]:
# Прогноз
pred = model.predict(X_test_pad)

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step


In [21]:
# Конвертация классов one-hot в значения
import numpy as np

predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

# Матрица ошибок
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Отчёт классификации
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[ 721  332   11   29    4]
 [ 197 1366  199  274    7]
 [   5  199 1147  150   11]
 [  16  313  278 1486  169]
 [   6   36    9  434  833]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.66      0.71      1097
           1       0.61      0.67      0.64      2043
           2       0.70      0.76      0.73      1512
           3       0.63      0.66      0.64      2262
           4       0.81      0.63      0.71      1318

    accuracy                           0.67      8232
   macro avg       0.70      0.67      0.68      8232
weighted avg       0.68      0.67      0.68      8232



## Выводы:

По результатам обучения RNN наиболее точной является GRU модель.