In [1]:
import pandas as pd
from nltk import word_tokenize
from string import punctuation
from tqdm import tqdm

import numpy as np
from string import punctuation
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torchmetrics import F1
from torchmetrics.functional import f1, recall
import ipdb

In [89]:
negative = pd.read_csv('negative.csv', sep=';', usecols=[3,4], names=['text', 'class'])
positive = pd.read_csv('positive.csv', sep=';', usecols=[3,4], names=['text', 'class'])

In [90]:
dataset = pd.concat([negative, positive])

In [91]:
def simple_preprocess(text):
    return [token.lower() for token in word_tokenize(text) if token not in punctuation]

In [92]:
dataset['tokenized'] = dataset['text'].apply(lambda t: simple_preprocess(t))

In [93]:
dataset['class'] = dataset['class'].apply(lambda c: 0 if c == -1 else 1)

In [94]:
voc = Counter()
for words in dataset['tokenized']:
    voc.update(words)
print('всего уникальных слов:', len(voc))

всего уникальных слов: 323810


In [95]:
filtered = set()
for word in voc:
     if voc[word] > 5:
        filtered.add(word)
print('уникальных слов, вcтретившихся больше 5 раз:', len(filtered))

уникальных слов, вcтретившихся больше 5 раз: 31743


In [96]:
word2id = {'PAD': 0}
for word in filtered:
    word2id[word] = len(word2id)

In [97]:
id2word = {i:word for word, i in word2id.items()}

In [98]:
class TwitterDataset(Dataset):
    def __init__(self, dataset, symbol2id):
        self.dataset = dataset['tokenized'].values
        self.word2id = word2id
        self.length = dataset.shape[0]
        self.target = dataset['class'].values
        # self.device = DEVICE

    def __len__(self):
        return self.length

    def __getitem__(self, index): 
        words = self.dataset[index]
        ids = torch.LongTensor([self.word2id[word] for word in words if word in self.word2id])
        y = [self.target[index]]
        return ids, y

    def collate_fn(self, batch):
        ids, y = list(zip(*batch))
        padded_ids = pad_sequence(ids, batch_first=True) # .to(self.device)
        y = torch.Tensor(y) # .to(self.device)
        return padded_ids, y

In [99]:
train, test = train_test_split(dataset, test_size=0.2)

In [100]:
train_dataset = TwitterDataset(train, word2id)
train_sampler = RandomSampler(train_dataset)
train_iterator = DataLoader(train_dataset, collate_fn = train_dataset.collate_fn, sampler=train_sampler, batch_size=1024)

In [101]:
test_dataset = TwitterDataset(test, word2id)
test_sampler = SequentialSampler(test_dataset)
test_iterator = DataLoader(test_dataset, collate_fn = test_dataset.collate_fn, sampler=test_sampler, batch_size=1024)

In [102]:
batch = next(iter(train_iterator))

In [103]:
batch[0].shape

torch.Size([1024, 30])

In [104]:
test_batch = next(iter(test_iterator))
test_batch[0].shape

torch.Size([1024, 29])

In [105]:
class CNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=2, padding='same')
        self.trigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=80, kernel_size=3, padding='same')
        # self.pooling = nn.MaxPool1d(kernel_size=2, stride=2)
        # self.relu = nn.ReLU()
        self.lastconv = nn.Conv1d(in_channels=180, out_channels=150, kernel_size=2, padding='same')
        self.hidden = nn.Linear(in_features=150, out_features=1)
        self.dropout = nn.Dropout(p=0.5)
        self.out = nn.Sigmoid()

    def forward(self, word):
        embedded = self.embedding(word)
        embedded = embedded.transpose(1,2)
        feature_map_bigrams = self.dropout(self.bigrams(embedded))
        feature_map_trigrams = self.dropout(self.trigrams(embedded))
        concat = torch.cat((feature_map_bigrams, feature_map_trigrams), 1)
        feature_map_last = self.dropout(self.lastconv(concat))
        pooling = feature_map_last.max(2)[0]
        logits = self.hidden(pooling) 
        logits = self.out(logits)
        return logits

In [114]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0 # для подсчета среднего лосса на всех батчах

    model.train()  # ставим модель в обучение, явно указываем, что сейчас надо будет хранить градиенты у всех весов

    for i, (texts, ys) in enumerate(iterator): #итерируемся по батчам
        optimizer.zero_grad()  #обнуляем градиенты
        preds = model(texts)  #прогоняем данные через модель
        loss = criterion(preds, ys) #считаем значение функции потерь  
        loss.backward() #считаем градиенты  
        optimizer.step() #обновляем веса 
        epoch_loss += loss.item() #сохраняем значение функции потерь
        if not (i + 1) % int(len(iterator)/5):
            print(f'Train loss: {epoch_loss/i}')      
    return  epoch_loss / len(iterator) # возвращаем среднее значение лосса по всей выборке

In [115]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_metric = 0
    model.eval() 
    with torch.no_grad():
        for i, (texts, ys) in enumerate(iterator):   
            preds = model(texts)  # делаем предсказания на тесте
            loss = criterion(preds, ys)   # считаем значения функции ошибки для статистики  
            epoch_loss += loss.item()
            batch_metric = f1(preds.round().long(), ys.long(), ignore_index=0)
            epoch_metric += batch_metric

            if not (i + 1) % int(len(iterator)/5):
                print(f'Test loss: {epoch_loss/i}, Test f1: {epoch_metric/i}')
        
    return epoch_metric / len(iterator), epoch_loss / len(iterator) # возвращаем среднее значение по всей выборке

In [116]:
model = CNN(len(word2id), 200)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()  

# # веса модели и значения лосса храним там же, где и все остальные тензоры
# model = model.to(DEVICE)
# criterion = criterion.to(DEVICE)

In [117]:
losses = []
losses_eval = []
f1s = []
f1s_eval = []

for i in range(3):
    print(f'epoch {i}')
    print('Training...')
    epoch_loss = train(model, train_iterator, optimizer, criterion)
    losses.append(epoch_loss)
    print('\nEvaluating on train...')
    f1_on_train,_ = evaluate(model, train_iterator, criterion)
    f1s.append(f1_on_train)
    print('\nEvaluating on test...')
    f1_on_test, epoch_loss_on_test = evaluate(model, test_iterator, criterion)
    losses_eval.append(epoch_loss_on_test)
    f1s_eval.append(f1_on_test)


starting Epoch 0
Training...
Train loss: 0.7029531720806571
Train loss: 0.6649209183195363
Train loss: 0.6458420776403867
Train loss: 0.6321522674972205
Train loss: 0.6236960425459105

Evaluating on train...
Test loss: 0.5929365403511945, Test f1: 0.7306426167488098
Test loss: 0.5843801593434983, Test f1: 0.7180718779563904
Test loss: 0.5817254902078555, Test f1: 0.7139438390731812
Test loss: 0.5802110330663997, Test f1: 0.7125498652458191
Test loss: 0.5794639731275624, Test f1: 0.7102539539337158

Evaluating on test...


NameError: name 'val_iterator' is not defined