In [41]:
import sys
from pathlib import Path

_BASE_DIR = Path().resolve().parent.parent
print("(!) Make sure this dir is project directory: ", _BASE_DIR)
sys.path.append(str(_BASE_DIR))

# from utils.settings import get_in_out_dirs

# === Настройка директорий под Колаб ===
def get_in_out_dirs(base_name: str):
    base_dir = Path.cwd()
    input_dir = base_dir / "data" / "input" / base_name
    output_dir = base_dir / "data" / "output" / base_name
    input_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)
    return input_dir, output_dir

_LAB_NAME = "lab10"

INPUT_DIR, OUTPUT_DIR = get_in_out_dirs(base_name=_LAB_NAME)

DATA_DIR = INPUT_DIR / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_DIR = DATA_DIR / "train"
TRAIN_DIR.mkdir(parents=True, exist_ok=True)

TEST_DIR = DATA_DIR / "test"
TEST_DIR.mkdir(parents=True, exist_ok=True)

(!) Make sure this dir is project directory:  /


In [42]:
!ls data

input  output


# Лаб. 10

## Задание 1.  (из  тем  10.1–10.2):

Обучите рекуррентную нейронную сеть распознаванию тональности отзывов на тестовых данных открытого набора данных imdb.


In [43]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [44]:
# -----------------------------
# Загружаем данные
# -----------------------------
NUM_WORDS = 10000  # используем топ-N слов
MAX_LEN = 256      # максимальная длина отзыва

(train_x, train_y), (test_x, test_y) = imdb.load_data(num_words=NUM_WORDS)

# Дополняем последовательности до одинаковой длины
train_x = pad_sequences(train_x, maxlen=MAX_LEN, padding='post')
test_x = pad_sequences(test_x, maxlen=MAX_LEN, padding='post')

print(f"Train shape: {train_x.shape}, Test shape: {test_x.shape}")

Train shape: (25000, 256), Test shape: (25000, 256)


In [45]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [46]:
# -----------------------------
# Строим модель LSTM
# -----------------------------
embedding_dim = 128
lstm_units = 128

model = Sequential([
    Embedding(input_dim=NUM_WORDS, output_dim=embedding_dim),
    # LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2),
    LSTM(lstm_units), # без dropout, так быстрее на GPU
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

In [47]:
MODEL_DIR = OUTPUT_DIR / "model"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

OUT_MODEL_PATH = MODEL_DIR / "imdb_lstm_model.h5"

In [48]:
# import tensorflow as tf
# tf.config.run_functions_eagerly(True)

In [57]:
import os
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# -----------------------------
# Обучение или загрузка
# -----------------------------

force_train = True

if not force_train and os.path.exists(OUT_MODEL_PATH):
    print("Модель найдена, загружаем вместо тренировки...")
    model = load_model(OUT_MODEL_PATH)

else:
    print("Модель не найдена, тренируем...")

    # защита от перетренировки
    callbacks = [
      EarlyStopping(
          monitor="val_loss",      # следим за валидацией
          patience=3,              # стоп если N эпохи подряд ухудшение
          restore_best_weights=True
      ),
      ModelCheckpoint(
          filepath=OUT_MODEL_PATH,
          monitor="val_loss",
          save_best_only=True,
          verbose=2
      )
    ]

    history = model.fit(
        train_x, train_y,
        epochs=15,
        batch_size=128,
        validation_split=0.2,
        callbacks=callbacks
    )

    print(f"Модель сохранена в {OUT_MODEL_PATH}")

Модель не найдена, тренируем...
Epoch 1/15
[1m155/157[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.6386 - loss: 0.5705
Epoch 1: val_loss improved from inf to 0.53913, saving model to /content/data/output/lab10/model/imdb_lstm_model.h5




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.6390 - loss: 0.5709 - val_accuracy: 0.8110 - val_loss: 0.5391
Epoch 2/15
[1m156/157[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.7930 - loss: 0.4983
Epoch 2: val_loss did not improve from 0.53913
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.7919 - loss: 0.4993 - val_accuracy: 0.5398 - val_loss: 0.6695
Epoch 3/15
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6298 - loss: 0.5958
Epoch 3: val_loss improved from 0.53913 to 0.46245, saving model to /content/data/output/lab10/model/imdb_lstm_model.h5




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.6303 - loss: 0.5954 - val_accuracy: 0.8180 - val_loss: 0.4625
Epoch 4/15
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8632 - loss: 0.3550
Epoch 4: val_loss improved from 0.46245 to 0.36295, saving model to /content/data/output/lab10/model/imdb_lstm_model.h5




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.8634 - loss: 0.3547 - val_accuracy: 0.8510 - val_loss: 0.3630
Epoch 5/15
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9164 - loss: 0.2420
Epoch 5: val_loss improved from 0.36295 to 0.33933, saving model to /content/data/output/lab10/model/imdb_lstm_model.h5




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9164 - loss: 0.2418 - val_accuracy: 0.8730 - val_loss: 0.3393
Epoch 6/15
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9135 - loss: 0.2906
Epoch 6: val_loss did not improve from 0.33933
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9136 - loss: 0.2903 - val_accuracy: 0.8706 - val_loss: 0.3555
Epoch 7/15
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9611 - loss: 0.1413
Epoch 7: val_loss did not improve from 0.33933
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.9611 - loss: 0.1413 - val_accuracy: 0.8702 - val_loss: 0.3763
Epoch 8/15
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9738 - loss: 0.1067
Epoch 8: va

In [58]:
# -----------------------------
# Оценка модели на тесте
# -----------------------------
loss, acc = model.evaluate(test_x, test_y)
print(f"Test loss: {loss:.4f}, Test accuracy: {acc:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8647 - loss: 0.3525
Test loss: 0.3527, Test accuracy: 0.8634


## Задание 2.  (из  тем  10.1–10.2):

С использованием предварительно обученной нейронной сети определите тональность своего отзыва.

Прилагаю файл reviews retrieving - скрипт, позволяющий восстанавливать тексты отзывов с Intenet Movie DataBase.

В качестве входных данных нужно предоставить текст своего отзыва (можно написать прямо в окне "Ответ в виде текста").

В качестве результатов, пожалуйста, сдайте обученную рекуррентную нейронную сеть и результат распознавания ею Вашего отзыва.

In [59]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb


class TextPreprocessor:
    def __init__(self):
        """
        num_words: ограничиваем словарь топ-словами (как при обучении IMDB)
        max_len: максимальная длина последовательности
        """
        self.num_words = NUM_WORDS
        self.max_len = MAX_LEN
        self.word_to_id = imdb.get_word_index()
        self.word_to_id = {k: (v + 3) for k, v in self.word_to_id.items() if v < self.num_words}
        self.word_to_id["<PAD>"] = 0
        self.word_to_id["<START>"] = 1
        self.word_to_id["<UNK>"] = 2
        self.word_to_id["<UNUSED>"] = 3

    def _clean_text(self, text):
        """Удаляем пунктуацию, переводим в нижний регистр"""
        text = text.lower()
        text = re.sub(r"[^\w\s]", "", text)
        return text

    def _text_to_sequence(self, text):
        """Конвертируем текст в список индексов"""
        cleaned = self._clean_text(text)
        words = cleaned.split()
        sequence = [1]  # <START>
        sequence += [self.word_to_id.get(word, 2) for word in words] # 2 = <UNK>
        return sequence

    def _pad_sequence(self, sequence):
        """Дополняем последовательность до max_len"""
        return pad_sequences([sequence], maxlen=self.max_len, padding='post')

    def preprocess(self, text):
        """Полная конвертация текста в готовую последовательность для модели"""
        seq = self._text_to_sequence(text)
        print("seq: ",seq)
        print("count of <UNK>: ", seq.count(2))
        padded = self._pad_sequence(seq)
        return padded


In [65]:
class ReviewTonalityPredictor:
  def __init__(self, model):
    self.model = model
    self.text_preprocessor = TextPreprocessor()

  def predict(self, review: str):
    seq = self.text_preprocessor.preprocess(review)
    pred = self.model.predict(seq)[0][0]
    sep_str = "\n" + "="*50
    print(f"{sep_str} Отзыв: {review[:50]}... {sep_str}")
    print(f"Тональность вашего отзыва: {'Положительная' if pred>=0.5 else 'Отрицательная'} ({pred:.4f})\n")

In [66]:
# -----------------------------
# Распознать мой отзыв
# -----------------------------

good_review = "This movie was amazing! The plot was engaging and the characters were believable."

predictor = ReviewTonalityPredictor(model=model)

reviews = [
    "Good but idk",

    "OK its ok",

    "Bad its bad",

    # Английский хороший
    "This movie was amazing! The plot was engaging and the characters were believable.",

    # Английский плохой
    "I hated this film. The story was boring, and the acting was terrible.",

    # Английский смешанный
    "The cinematography was beautiful, but the story was a bit predictable.",

    # Русский
    "Фильм понравился, сюжет интересный, актеры отлично справились с ролями.",

    # Длинный английский хороший
    """Quite entertaining
Worth watching twice, first time for the plot, second time to absorb some of the sharp dialogue.

Judging by some of the 1/10 reviews on here I'd say you probably have to be over the age of 16 and an IQ above 80 to appreciate some of the irony, or the reviewrs just like releasing some of their angst online. It's interesting to note that the most liked review, a 1/10, was given by someone with 87% 1/10 reviews! Form your own conclusions about that!

Brianna Roy has a bit of a Reese Witherspoon thing going on about her.""",

    # Длинный английский плохой
    """Loved the excellent original with Chris Pratt, so was looking forward to this season. Total waste of time. Why does anyone want to watch the greedy traitor's story? The character and the actor portraying him is a total weeny! Especially after watching Chris Pratt creatively and effectively terminate the bad guys in the first season. Taylor Kitsch does not have the star power or acting skills to be the lead. And what's with the annoying long hair, scruffy beard and pucker lips as his only expression? He looks ridiculous and unbelievable as a Navy seal.

There is no real plot that grabs you. 3 episodes in and we're snoring. Don't waste your time on this one.
  """,
]

predictor = ReviewTonalityPredictor(model=model)

for r in reviews:
    predictor.predict(r)

seq:  [1, 52, 21, 2]
count of <UNK>:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step

Тональность вашего отзыва: Положительная (0.7862)

seq:  [1, 608, 94, 608]
count of <UNK>:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Тональность вашего отзыва: Отрицательная (0.0318)

seq:  [1, 78, 94, 78]
count of <UNK>:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step

Тональность вашего отзыва: Отрицательная (0.0415)

seq:  [1, 14, 20, 16, 480, 4, 114, 16, 1728, 5, 4, 105, 71, 867]
count of <UNK>:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step

Тональность вашего отзыва: Положительная (0.8287)

seq:  [1, 13, 1800, 14, 22, 4, 65, 16, 357, 5, 4, 116, 16, 394]
count of <UNK>:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

Тональность вашего отзыва: Отрицательная (0.0284)

seq:  [1, 4, 627, 16, 307, 21, 4, 65, 16, 6, 227, 727]
count of <UNK>:  0
[1m1/1[0m [3