# Домашнее задание. Часть 2 (HW)

Задача

1. Обучите несколько моделей рекуррентных нейронных сетей, например LSTM, GRU, Bidirectional-LSTM.
   
2. Посчитайте значение метрики, которую вы предложили в части №1 и сравните результаты для разных RNN, эвристик и классического ML.

## Обучение моделей RNN

### LTSM 

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import pandas as pd

labels = [0, 1, 2, 3, 4]

df = pd.read_json(r'data\data.json')

df['labels'] = df['sentiment'].map({'extremely negative': 0, 'negative': 1, 'neutral':2, 'positive':3, "extremely positive":4})


X_train, X_test, y_train, y_test = train_test_split(df['lemmatization'], df["labels"], test_size=0.2, random_state=42)

# Параметры
max_words = 10000  # Максимальное количество слов в словаре

# Токенизация
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


max_length = max(len(x) for x in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

y_train = to_categorical(y_train, num_classes=5)

y_test = to_categorical(y_test, num_classes=5)


In [2]:
print(y_train)

[[0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [3]:
import numpy as np

embedding_dim = 100

# Загрузка GloVe
embedding_index = {}
with open(r"data\glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coefs

# Создание матрицы эмбеддингов
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Встраивание предобученных эмбеддингов в модель
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))


optimizer = Adam(learning_rate=0.0001)

# Компиляция модели
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()



In [5]:
# Обучение
history = model.fit(X_train_pad, y_train,
                    epochs=10,   # Количество эпох
                    batch_size=32,  # Размер батча
                    validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")



Epoch 1/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 23ms/step - accuracy: 0.2825 - loss: 1.5527 - val_accuracy: 0.3994 - val_loss: 1.3632
Epoch 2/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 24ms/step - accuracy: 0.4230 - loss: 1.3202 - val_accuracy: 0.4718 - val_loss: 1.2224
Epoch 3/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.4935 - loss: 1.1889 - val_accuracy: 0.5268 - val_loss: 1.1163
Epoch 4/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.5513 - loss: 1.0761 - val_accuracy: 0.5605 - val_loss: 1.0407
Epoch 5/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.6010 - loss: 0.9817 - val_accuracy: 0.6006 - val_loss: 0.9816
Epoch 6/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.6354 - loss: 0.9155 - val_accuracy: 0.6150 - val_loss: 0.9429
Epoc

In [None]:
# Итоговая оценка модели после обучения
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6821 - loss: 0.8341
Test Loss: 0.8316106796264648
Test Accuracy: 0.6853741407394409


In [7]:
# Прогноз
pred = model.predict(X_test_pad)

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [8]:
import numpy as np

predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Матрица ошибок
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Отчёт классификации
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[ 715  334   11   30    7]
 [ 193 1316  182  331   21]
 [   5  193 1060  236   18]
 [  20  242  179 1557  264]
 [   2   27    6  289  994]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.65      0.70      1097
           1       0.62      0.64      0.63      2043
           2       0.74      0.70      0.72      1512
           3       0.64      0.69      0.66      2262
           4       0.76      0.75      0.76      1318

    accuracy                           0.69      8232
   macro avg       0.70      0.69      0.70      8232
weighted avg       0.69      0.69      0.69      8232



### GRU

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Встраивание предобученных эмбеддингов в модель
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True))
model.add(GRU(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='sigmoid'))


optimizer = Adam(learning_rate=0.0001)

# Компиляция модели
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()



In [11]:
# Обучение
history = model.fit(X_train_pad, y_train,
                    epochs=10,   # Количество эпох
                    batch_size=32,  # Размер батча
                    validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 22ms/step - accuracy: 0.2544 - loss: 1.5946 - val_accuracy: 0.3263 - val_loss: 1.5139
Epoch 2/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 23ms/step - accuracy: 0.3547 - loss: 1.4518 - val_accuracy: 0.4807 - val_loss: 1.2215
Epoch 3/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 23ms/step - accuracy: 0.4892 - loss: 1.1865 - val_accuracy: 0.5411 - val_loss: 1.0892
Epoch 4/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 24ms/step - accuracy: 0.5610 - loss: 1.0474 - val_accuracy: 0.5847 - val_loss: 1.0084
Epoch 5/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.6158 - loss: 0.9504 - val_accuracy: 0.6181 - val_loss: 0.9408
Epoch 6/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 22ms/step - accuracy: 0.6558 - loss: 0.8742 - val_accuracy: 0.6542 - val_loss: 0.8904
Epoc

In [None]:
# Итоговая оценка модели после обучения
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

In [12]:
# Прогноз
pred = model.predict(X_test_pad)

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [13]:
# Конвертация классов в значения
import numpy as np

predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# Матрица ошибок
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Отчёт классификации
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[ 743  315    8   25    6]
 [ 174 1440  165  256    8]
 [   3  175 1147  172   15]
 [  15  264  196 1615  172]
 [   4   27    7  363  917]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      1097
           1       0.65      0.70      0.68      2043
           2       0.75      0.76      0.76      1512
           3       0.66      0.71      0.69      2262
           4       0.82      0.70      0.75      1318

    accuracy                           0.71      8232
   macro avg       0.74      0.71      0.72      8232
weighted avg       0.72      0.71      0.71      8232



### Bidirectional RNN

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [19]:
# Встраивание предобученных эмбеддингов в модель
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=True))
model.add(Bidirectional(LSTM(units=64, return_sequences=False)))
model.add(Dense(5, activation='softmax'))


optimizer = Adam(learning_rate=0.0001)

# Компиляция модели
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Вывод структуры модели
model.summary()

In [20]:
# Обучение
history = model.fit(X_train_pad, y_train,
                    epochs=10,   # Количество эпох
                    batch_size=32,  # Размер батча
                    validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 27ms/step - accuracy: 0.3123 - loss: 1.5104 - val_accuracy: 0.4444 - val_loss: 1.2868
Epoch 2/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 26ms/step - accuracy: 0.4616 - loss: 1.2437 - val_accuracy: 0.5157 - val_loss: 1.1528
Epoch 3/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.5390 - loss: 1.1012 - val_accuracy: 0.5571 - val_loss: 1.0620
Epoch 4/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 26ms/step - accuracy: 0.5858 - loss: 1.0083 - val_accuracy: 0.5719 - val_loss: 1.0314
Epoch 5/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 26ms/step - accuracy: 0.6312 - loss: 0.9252 - val_accuracy: 0.6161 - val_loss: 0.9559
Epoch 6/10
[1m1029/1029[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.6722 - loss: 0.8516 - val_accuracy: 0.6424 - val_loss: 0.9113
Epoc

In [21]:
# Итоговая оценка модели после обучения
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6800 - loss: 0.8493
Test Loss: 0.8386980891227722
Test Accuracy: 0.6830660700798035


In [22]:
# Прогноз
pred = model.predict(X_test_pad)

[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


In [23]:
# Конвертация классов one-hot в значения
import numpy as np

predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

In [24]:
from sklearn.metrics import classification_report, confusion_matrix

# Матрица ошибок
conf_matrix = confusion_matrix(true_classes, predicted_classes)
print("Confusion Matrix:")
print(conf_matrix)

# Отчёт классификации
class_report = classification_report(true_classes, predicted_classes)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[ 742  299    9   40    7]
 [ 199 1287  129  406   22]
 [   3  215  979  300   15]
 [  18  218  118 1583  325]
 [   3   23    5  255 1032]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72      1097
           1       0.63      0.63      0.63      2043
           2       0.79      0.65      0.71      1512
           3       0.61      0.70      0.65      2262
           4       0.74      0.78      0.76      1318

    accuracy                           0.68      8232
   macro avg       0.71      0.69      0.69      8232
weighted avg       0.69      0.68      0.68      8232



## Выводы: