In [1]:
#датасет взят с
#https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
from sklearn.preprocessing import LabelBinarizer
from bs4 import BeautifulSoup
import re
import torch.nn as nn
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import FreqDist, word_tokenize
import nltk
nltk.download('punkt')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

top_words = 5000
max_veview_length = 100

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sergey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
%matplotlib widget
def loss_plot(fig, ax, train_loss, test_loss, loss_name):
    train_line = ax.plot(train_loss, color = 'black')
    test_line = ax.plot(test_loss, color = 'red')
    ax.set_xlabel('Эпоха')
    ax.set_ylabel(loss_name)
    ax.legend(('Тренировочная выборка', 'Тестовая выборка'))
    fig.canvas.draw()

In [3]:
imdb_data=pd.read_csv('IMDB Dataset.csv')

In [4]:
#взял немного кода с https://www.kaggle.com/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews
#для предобработки

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)

#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

In [5]:
#подсчитаем кол-во слов за исключением стоп-слов

tokens = nltk.tokenize.word_tokenize(imdb_data['review'].str.lower().str.cat(sep = ' '))
stopwords = nltk.corpus.stopwords.words('english')
words_count = nltk.FreqDist(w for w in tokens if w not in stopwords) 
words_to_index = dict(zip([x[0] for x in words_count.most_common(top_words)], range(5000)))

In [6]:
#переведём отзывы в списки индексов
def to_list_indices(s):
    return [words_to_index[x.lower()] for x in nltk.tokenize.word_tokenize(s) if x.lower() in words_to_index]

imdb_data['indices'] = imdb_data['review'].apply(to_list_indices)

In [33]:
#пофильтруем по длине отзывов
data = imdb_data[imdb_data['indices'].apply(len) < max_veview_length]

In [34]:
#переводим метки в числа
lb=LabelBinarizer()
sentiment_data=lb.fit_transform(data['sentiment'])

In [35]:
#нарезаем выборки
train_x, test_x = data['indices'][:int(len(data)*0.8)], data['indices'][int(len(data)*0.8):]
train_y, test_y = sentiment_data[:int(len(data)*0.8)], sentiment_data[int(len(data)*0.8):]

In [36]:
#дополняем короткие отзывы, переводим в тензоры
pad_token = top_words
train_x = nn.utils.rnn.pad_sequence([torch.Tensor(x).to(device).long() for x in train_x], padding_value = pad_token, batch_first = True)
test_x = nn.utils.rnn.pad_sequence([torch.Tensor(x).to(device).long() for x in test_x], padding_value = pad_token, batch_first = True)
test_y = torch.Tensor(test_y).to(device)
train_y = torch.Tensor(train_y).to(device)

In [37]:
#готовим загрузчик батчей
class Dataset(torch.utils.data.Dataset):
        def __init__(self, x, y):
            self.y = y
            self.x = x

        def __len__(self):
            return len(self.x)

        def __getitem__(self, index):
            X = self.x[index]
            y = self.y[index]
            return X, y
    
train_set = Dataset(train_x, train_y)
test_set = Dataset(test_x, test_y)
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 32)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 16)

In [44]:
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=pad_token)
        self.lstm = nn.LSTM(embed_dim, 100, 1, batch_first = True)
        self.fc = nn.Linear(100, 1)
        self.drop = nn.Dropout(0.2)
        self.sig = nn.Sigmoid()


    def forward(self, text):
        x = self.embedding(text)
        x = self.drop(x)
        _, (x, _) = self.lstm(x)
        #только последний выход lstm
        x = x.view(-1, 100)
        x = self.drop(x)
        x = self.fc(x)
        x = self.sig(x)
        return x
    
model = TextSentiment(top_words, 32).to(device)

In [45]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
cross_entropy = nn.BCELoss()

In [46]:
%matplotlib widget
fig = plt.figure(figsize=(8,10))
ax = fig.subplots(2,1)
fig.tight_layout(pad=3.0)

ax1, ax2 = ax
train_loss = []
test_loss = []
train_acc = []
test_acc = []
epochs = 45
for i in range(epochs):
    train_loss_epoch = 0
    train_acc_epoch = 0
    test_loss_epoch = 0
    test_acc_epoch = 0
    for x, y in train_loader:
        y_hat = model(x)
        loss = cross_entropy(y_hat, y)
        train_loss_epoch += loss.item()
        train_acc_epoch += torch.mean((y == (y_hat > 0.5).float()).float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss.append(train_loss_epoch / len(train_loader))
    train_acc.append(train_acc_epoch / len(train_loader))
    with torch.no_grad():
        for x, y in test_loader:
            y_hat = model(x)
            loss = cross_entropy(y_hat, y)
            test_loss_epoch += loss.item()
            test_acc_epoch += torch.mean((y == (y_hat > 0.5).float()).float())
    test_loss.append(test_loss_epoch / len(test_loader))
    test_acc.append(test_acc_epoch / len(test_loader))
    fig.suptitle('Архитектура с одной LSTM', fontsize=16)
    loss_plot(fig,ax1,train_loss, test_loss, "Кросс-энтропия")
    print(f'epoch{i} train_loss:{train_loss[-1]} test_loss:{test_loss[-1]}')
    print(f'train_acc:{train_acc[-1]} test_acc:{test_acc[-1]}')
    loss_plot(fig,ax2,train_acc, test_acc, "Точность")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

epoch0 train_loss:0.6934694310862979 test_loss:0.6932963947038273
train_acc:0.49819809198379517 test_acc:0.4999887943267822
epoch1 train_loss:0.6931414217243106 test_loss:0.6933853341427161
train_acc:0.49819809198379517 test_acc:0.4961897134780884
epoch2 train_loss:0.6929936456930387 test_loss:0.6930097763910716
train_acc:0.49819809198379517 test_acc:0.5023534297943115
epoch3 train_loss:0.6797938072459125 test_loss:0.6529885433909498
train_acc:0.518546462059021 test_acc:0.6494531035423279
epoch4 train_loss:0.6463825758590009 test_loss:0.637794469272618
train_acc:0.5092588663101196 test_acc:0.6600546836853027
epoch5 train_loss:0.6373666166772931 test_loss:0.6279243233598474
train_acc:0.5731198787689209 test_acc:0.6748027801513672
epoch6 train_loss:0.6270363118959752 test_loss:0.6197482044602329
train_acc:0.5502104759216309 test_acc:0.6797224879264832
epoch7 train_loss:0.6174012815757787 test_loss:0.6200338483273566
train_acc:0.6130114197731018 test_acc:0.6711269617080688
epoch8 train_lo

In [57]:
class TextConvSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=pad_token)
        self.conv = nn.Conv1d(32, 32, 3, padding = 3 // 2)
        self.pool = nn.MaxPool1d(2)
        self.lstm = nn.LSTM(32, 100, 1, batch_first = True)
        self.fc = nn.Linear(100, 1)
        self.sig = nn.Sigmoid()


    def forward(self, text):
        x = self.embedding(text)
        x = torch.transpose(x, 1, 2)
        x = self.conv(x)
        x = self.pool(x)
        x = torch.transpose(x, 1, 2)
        _, (x, _) = self.lstm(x)
        #только последний выход lstm
        x = x.view(-1, 100)
        x = self.fc(x)
        x = self.sig(x)
        return x
    
model_conv = TextConvSentiment(top_words, 32).to(device)
optimizer = torch.optim.Adam(model_conv.parameters(), lr = 0.0001)
cross_entropy = nn.BCELoss()

In [58]:
#у меня тут быстро произошёл оверфитинг
%matplotlib widget
fig = plt.figure(figsize=(8,10))
ax = fig.subplots(2,1)
fig.tight_layout(pad=3.0)

ax1, ax2 = ax
train_loss = []
test_loss = []
train_acc = []
test_acc = []
epochs = 45
for i in range(epochs):
    train_loss_epoch = 0
    train_acc_epoch = 0
    test_loss_epoch = 0
    test_acc_epoch = 0
    for x, y in train_loader:
        y_hat = model_conv(x)
        loss = cross_entropy(y_hat, y)
        train_loss_epoch += loss.item()
        train_acc_epoch += torch.mean((y == (y_hat > 0.5).float()).float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss.append(train_loss_epoch / len(train_loader))
    train_acc.append(train_acc_epoch / len(train_loader))
    with torch.no_grad():
        for x, y in test_loader:
            y_hat = model_conv(x)
            loss = cross_entropy(y_hat, y)
            test_loss_epoch += loss.item()
            test_acc_epoch += torch.mean((y == (y_hat > 0.5).float()).float())
    test_loss.append(test_loss_epoch / len(test_loader))
    test_acc.append(test_acc_epoch / len(test_loader))
    fig.suptitle('Архитектура с одной LSTM и свёрткой', fontsize=16)
    loss_plot(fig,ax1,train_loss, test_loss, "Кросс-энтропия")
    print(f'epoch{i} train_loss:{train_loss[-1]} test_loss:{test_loss[-1]}')
    print(f'train_acc:{train_acc[-1]} test_acc:{test_acc[-1]}')
    loss_plot(fig,ax2,train_acc, test_acc, "Точность")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

epoch0 train_loss:0.6932148238023123 test_loss:0.6931363497024927
train_acc:0.5023482441902161 test_acc:0.5032275319099426
epoch1 train_loss:0.6930073624724274 test_loss:0.6928830571941562
train_acc:0.5064160227775574 test_acc:0.5061413049697876
epoch2 train_loss:0.6722722182740698 test_loss:0.6203679828654914
train_acc:0.5810214877128601 test_acc:0.6640891432762146
epoch3 train_loss:0.5820476729553062 test_loss:0.5557728660551262
train_acc:0.6965287923812866 test_acc:0.7128608822822571
epoch4 train_loss:0.5243713517254327 test_loss:0.510332511195214
train_acc:0.743125855922699 test_acc:0.7461897134780884
epoch5 train_loss:0.4814610298905339 test_loss:0.4818242602137141
train_acc:0.7706857323646545 test_acc:0.7683678865432739
epoch6 train_loss:0.45092977467047307 test_loss:0.46538287543611373
train_acc:0.7879133224487305 test_acc:0.7791487574577332
epoch7 train_loss:0.42820677911484994 test_loss:0.45432201569730585
train_acc:0.8010252118110657 test_acc:0.7893469333648682
epoch8 train_l