# Домашнее задачние №2 (CNN)

In [138]:
import pandas as pd
from nltk import word_tokenize
from string import punctuation
from tqdm import tqdm

import numpy as np
from string import punctuation
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torchmetrics import F1
from torchmetrics.functional import f1, recall
import ipdb
from gensim.models import fasttext 
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [139]:
pip install gensim==3.8.0



## Dataset

In [140]:
negative = pd.read_csv('negative.csv', sep=';', usecols=[3,4], names=['text', 'class'])
positive = pd.read_csv('positive.csv', sep=';', usecols=[3,4], names=['text', 'class'])

In [141]:
dataset = pd.concat([negative, positive])

In [142]:
def simple_preprocess(text):
    return [token.lower() for token in word_tokenize(text) if token not in punctuation]

In [143]:
dataset['tokenized'] = dataset['text'].apply(lambda t: simple_preprocess(t))

In [144]:
dataset['class'] = dataset['class'].apply(lambda c: 0 if c == -1 else 1)

In [145]:
voc = Counter()
for words in dataset['tokenized']:
    voc.update(words)
print('всего уникальных слов:', len(voc))

всего уникальных слов: 323845


In [146]:
filtered = set()
for word in voc:
     if voc[word] > 5:
        filtered.add(word)
print('уникальных слов, вcтретившихся больше 5 раз:', len(filtered))

уникальных слов, вcтретившихся больше 5 раз: 31739


In [147]:
word2id = {'PAD': 0}
for word in filtered:
    word2id[word] = len(word2id)

In [148]:
id2word = {i:word for word, i in word2id.items()}

In [149]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [150]:
class TwitterDataset(Dataset):
    def __init__(self, dataset, word2id, DEVICE):
        self.dataset = dataset['tokenized'].values
        self.word2id = word2id
        self.length = dataset.shape[0]
        self.target = dataset['class'].values
        self.device = DEVICE

    def __len__(self):
        return self.length

    def __getitem__(self, index): 
        words = self.dataset[index]
        ids = torch.LongTensor([self.word2id[word] for word in words if word in self.word2id])
        y = [self.target[index]]
        return ids, y

    def collate_fn(self, batch):
        ids, y = list(zip(*batch))
        padded_ids = pad_sequence(ids, batch_first=True).to(self.device)
        y = torch.Tensor(y).to(self.device)
        return padded_ids, y

In [151]:
train, test = train_test_split(dataset, test_size=0.2)

In [152]:
train_dataset = TwitterDataset(train, word2id, DEVICE)
train_sampler = RandomSampler(train_dataset)
train_iterator = DataLoader(train_dataset, collate_fn = train_dataset.collate_fn, sampler=train_sampler, batch_size=1024)

In [153]:
test_dataset = TwitterDataset(test, word2id, DEVICE)
test_sampler = SequentialSampler(test_dataset)
test_iterator = DataLoader(test_dataset, collate_fn = test_dataset.collate_fn, sampler=test_sampler, batch_size=1024)

## 1. CNN for words

In [154]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=2, padding='same')
        self.trigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=80, kernel_size=3, padding='same')
        self.lastconv = nn.Conv1d(in_channels=180, out_channels=150, kernel_size=2, padding='same')
        self.hidden = nn.Linear(in_features=150, out_features=1)
        self.out = nn.Sigmoid()

    def forward(self, word):
        embedded = self.embedding(word)
        embedded = embedded.transpose(1,2)
        feature_map_bigrams = self.bigrams(embedded)
        feature_map_trigrams = self.trigrams(embedded)
        concat = torch.cat((feature_map_bigrams, feature_map_trigrams), 1)
        feature_map_last = self.lastconv(concat)
        pooling = feature_map_last.max(2)[0]
        logits = self.hidden(pooling) 
        logits = self.out(logits)
        return logits

In [155]:
def training(model, iterator, optimizer, criterion):
    epoch_loss = 0 

    model.train()

    for i, (texts, ys) in enumerate(iterator):
        optimizer.zero_grad()
        preds = model(texts)
        loss = criterion(preds, ys)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    return  epoch_loss / len(iterator)

In [156]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_metric = 0
    model.eval() 
    with torch.no_grad():
        for i, (texts, ys) in enumerate(iterator):   
            preds = model(texts)
            loss = criterion(preds, ys)
            epoch_loss += loss.item()
            batch_metric = f1(preds.round().long(), ys.long(), ignore_index=0)
            epoch_metric += batch_metric.item()
        
    return epoch_metric / len(iterator), epoch_loss / len(iterator)

In [157]:
model = CNN(len(word2id), 200)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()  

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [158]:
losses = []
losses_eval = []
f1s = []
f1s_eval = []

for i in range(5):
    print(f'\nepoch {i}')
    epoch_loss = training(model, train_iterator, optimizer, criterion)
    print('epoch loss on train:', epoch_loss)
    losses.append(epoch_loss)
    print('\ntest evaluation:')
    f1_on_test, epoch_loss_on_test = evaluate(model, test_iterator, criterion)
    print('epoch loss on test:', epoch_loss_on_test)
    print('F1 on test:', f1_on_test)
    losses_eval.append(epoch_loss_on_test)
    f1s_eval.append(f1_on_test)


epoch 0
epoch loss on train: 0.5776815804537763

test evaluation:
epoch loss on test: 0.5407159686088562
F1 on test: 0.7172049668100146

epoch 1
epoch loss on train: 0.5041556649663476

test evaluation:
epoch loss on test: 0.5074664976861741
F1 on test: 0.7464439895417955

epoch 2
epoch loss on train: 0.45719039557355173

test evaluation:
epoch loss on test: 0.4889123499393463
F1 on test: 0.7568130996492174

epoch 3
epoch loss on train: 0.4178060271431891

test evaluation:
epoch loss on test: 0.48286848862965903
F1 on test: 0.7594486660427517

epoch 4
epoch loss on train: 0.38079599734772457

test evaluation:
epoch loss on test: 0.48223473164770336
F1 on test: 0.7655813998646206


Улучшение: добавлен Dropout.

In [183]:
class CNN_DO(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=2, padding='same')
        self.trigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=200, kernel_size=3, padding='same')
        self.lastconv = nn.Conv1d(in_channels=300, out_channels=200, kernel_size=2, padding='same')
        self.hidden = nn.Linear(in_features=200, out_features=1)
        self.dropout = nn.Dropout(p=0.5)
        self.out = nn.Sigmoid()

    def forward(self, word):
        embedded = self.embedding(word)
        embedded = embedded.transpose(1,2)
        feature_map_bigrams = self.dropout(self.bigrams(embedded))
        feature_map_trigrams = self.dropout(self.trigrams(embedded))
        concat = torch.cat((feature_map_bigrams, feature_map_trigrams), 1)
        feature_map_last = self.dropout(self.lastconv(concat))
        pooling = feature_map_last.max(2)[0]
        logits = self.hidden(pooling) 
        logits = self.out(logits)
        return logits

In [184]:
model = CNN_DO(len(word2id), 200)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()  

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [185]:
losses = []
losses_eval = []
f1s = []
f1s_eval = []

for i in range(5):
    print(f'\nepoch {i}')
    epoch_loss = training(model, train_iterator, optimizer, criterion)
    print('epoch loss on train:', epoch_loss)
    losses.append(epoch_loss)
    print('\ntest evaluation:')
    f1_on_test, epoch_loss_on_test = evaluate(model, test_iterator, criterion)
    print('epoch loss on test:', epoch_loss_on_test)
    print('F1 on test:', f1_on_test)
    losses_eval.append(epoch_loss_on_test)
    f1s_eval.append(f1_on_test)


epoch 0


ValueError: ignored

### 1.1. CNN for words with pretrained embeddings

In [162]:
!wget http://vectors.nlpl.eu/repository/20/214.zip

--2021-12-10 20:28:30--  http://vectors.nlpl.eu/repository/20/214.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1920218982 (1.8G) [application/zip]
Saving to: ‘214.zip.1’



In [180]:
!unzip 214.zip

Archive:  214.zip
replace meta.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [164]:
ft = fasttext.FastTextKeyedVectors.load('model.model')

In [165]:
weights = np.zeros((len(word2id), 300))
for word, i in word2id.items():
    if word == 'PAD':
        continue   
    try:
        weights[i] = ft[word]    
    except KeyError:
        weights[i] = np.random.normal(0,0.1,300)

In [166]:
class CNN_FT(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.from_pretrained(torch.tensor(weights), freeze=True)
        self.bigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=2, padding='same')
        self.trigrams = nn.Conv1d(in_channels=embedding_dim, out_channels=80, kernel_size=3, padding='same')
        self.lastconv = nn.Conv1d(in_channels=180, out_channels=150, kernel_size=2, padding='same')
        self.hidden = nn.Linear(in_features=150, out_features=1)
        self.dropout = nn.Dropout(p=0.5)
        self.out = nn.Sigmoid()

    def forward(self, word):
        embedded = self.embedding(word)
        embedded = embedded.transpose(1,2)
        feature_map_bigrams = self.dropout(self.bigrams(embedded))
        feature_map_trigrams = self.dropout(self.trigrams(embedded))
        concat = torch.cat((feature_map_bigrams, feature_map_trigrams), 1)
        feature_map_last = self.dropout(self.lastconv(concat))
        pooling = feature_map_last.max(2)[0]
        logits = self.hidden(pooling) 
        logits = self.out(logits)
        return logits

In [167]:
model = CNN_FT(len(word2id))
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()  

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [168]:
losses = []
losses_eval = []
f1s = []
f1s_eval = []

for i in range(5):
    print(f'\nepoch {i}')
    epoch_loss = training(model, train_iterator, optimizer, criterion)
    print('epoch loss on train:', epoch_loss)
    losses.append(epoch_loss)
    print('\ntest evaluation:')
    f1_on_test, epoch_loss_on_test = evaluate(model, test_iterator, criterion)
    print('epoch loss on test:', epoch_loss_on_test)
    print('F1 on test:', f1_on_test)
    losses_eval.append(epoch_loss_on_test)
    f1s_eval.append(f1_on_test)


epoch 0
epoch loss on train: 0.6227839518798871

test evaluation:
epoch loss on test: 0.5767073671023051
F1 on test: 0.6986798935466343

epoch 1
epoch loss on train: 0.5517790782987402

test evaluation:
epoch loss on test: 0.5376903878317939
F1 on test: 0.7279510166909959

epoch 2
epoch loss on train: 0.5117080710577161

test evaluation:
epoch loss on test: 0.514421033859253
F1 on test: 0.7459974884986877

epoch 3
epoch loss on train: 0.4800851465610976

test evaluation:
epoch loss on test: 0.4952606174680922
F1 on test: 0.7491783420244853

epoch 4
epoch loss on train: 0.4540397950102774

test evaluation:
epoch loss on test: 0.48092230757077536
F1 on test: 0.7657908492618137


## 2. CNN for words and symbols

In [169]:
vocab_symbols = Counter()

for words in dataset['tokenized']:
    for word in words:
        vocab_symbols.update(list(word))
print('всего уникальных символов:', len(vocab_symbols))

всего уникальных символов: 367


In [170]:
filtered_vocab_symbols = set()

for symbol in vocab_symbols:
    if vocab_symbols[symbol] > 5:
        filtered_vocab_symbols.add(symbol)
print('уникальных символов, втретившихся больше 5 раз:', len(filtered_vocab_symbols))

уникальных символов, втретившихся больше 5 раз: 151


In [171]:
symbol2id = {'PAD':0}

for symbol in filtered_vocab_symbols:
    symbol2id[symbol] = len(symbol2id)

id2symbol = {i:symbol for symbol, i in symbol2id.items()}

In [172]:
class TwitterDataset_WS(Dataset):
    def __init__(self, dataset, word2id, symbol2id, DEVICE):
        self.dataset = dataset['tokenized'].values
        self.word2id = word2id
        self.symbol2id = symbol2id
        self.length = dataset.shape[0]
        self.target = dataset['class'].values
        self.device = DEVICE

    def __len__(self):
        return self.length

    def __getitem__(self, index): 
        words = self.dataset[index]
        ids_w = torch.LongTensor([self.word2id[word] for word in words if word in self.word2id])
        symbols = [symb for word in self.dataset[index] for symb in word]
        ids_s = torch.LongTensor([self.symbol2id[symbol] for symbol in symbols if symbol in self.symbol2id])
        y = [self.target[index]]
        return ids_w, ids_s, y

    def collate_fn(self, batch):
        ids_w, ids_s, y = list(zip(*batch))
        padded_ids_w = pad_sequence(ids_w, batch_first=True).to(self.device)
        padded_ids_s = pad_sequence(ids_s, batch_first=True).to(self.device)
        y = torch.Tensor(y).to(self.device)
        return padded_ids_w, padded_ids_s, y

In [173]:
train_dataset = TwitterDataset_WS(train, word2id, symbol2id, DEVICE)
train_sampler = RandomSampler(train_dataset)
train_iterator = DataLoader(train_dataset, collate_fn = train_dataset.collate_fn, sampler=train_sampler, batch_size=1024)

In [174]:
test_dataset = TwitterDataset_WS(test, word2id, symbol2id, DEVICE)
test_sampler = SequentialSampler(test_dataset)
test_iterator = DataLoader(test_dataset, collate_fn = test_dataset.collate_fn, sampler=test_sampler, batch_size=1024)

In [175]:
for i, (padded_ids_w, padded_ids_s, ys) in enumerate(train_iterator):
    print(padded_ids_w)
    break

tensor([[14525,  1458,  5809,  ...,     0,     0,     0],
        [ 2256,  5809,  7242,  ...,     0,     0,     0],
        [ 1895,  4911, 28460,  ...,     0,     0,     0],
        ...,
        [ 1818, 16127, 10889,  ...,     0,     0,     0],
        [19384, 31385, 31606,  ...,     0,     0,     0],
        [12790, 22873, 28227,  ...,     0,     0,     0]], device='cuda:0')


In [208]:
def training2(model, iterator, optimizer, criterion):
    epoch_loss = 0 

    model.train()

    for i, (words, symbols, ys) in enumerate(iterator):
        optimizer.zero_grad()
        preds = model(words, symbols)
        loss = criterion(preds, ys)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return  epoch_loss / len(iterator)

In [209]:
def evaluate2(model, iterator, criterion):
    epoch_loss = 0
    epoch_metric = 0
    model.eval() 
    with torch.no_grad():
        for i, (words, symbols, ys) in enumerate(iterator):   
            preds = model(words, symbols)
            loss = criterion(preds, ys)
            epoch_loss += loss.item()
            batch_metric = f1(preds.round().long(), ys.long(), ignore_index=0)
            epoch_metric += batch_metric.item()
        
    return epoch_metric / len(iterator), epoch_loss / len(iterator)

In [211]:
class CNN_WS_DO(nn.Module):
    def __init__(self, vocab_size_words, embedding_dim_words,
                 vocab_size_symbols, embedding_dim_symbols):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size_words, embedding_dim_words)
        self.embedding.from_pretrained(torch.tensor(weights), freeze=True)
        self.hidden = nn.Linear(in_features=300, out_features=100)

        self.embedding_symbols = nn.Embedding(vocab_size_symbols, embedding_dim_symbols)
        self.bigrams = nn.Conv1d(in_channels=embedding_dim_symbols, out_channels=80, kernel_size=2, padding='same')
        self.trigrams = nn.Conv1d(in_channels=embedding_dim_symbols, out_channels=100, kernel_size=3, padding='same')
        self.second_hidden = nn.Linear(in_features=280, out_features=1)
        self.dropout = nn.Dropout(p=0.1)
        self.out = nn.Sigmoid()

    def forward(self, word, symbols):
        embedded_words = self.embedding(word)
        embedded_words = embedded_words.max(1)[0]
        X = self.hidden(embedded_words)

        embedded_symbols = self.embedding_symbols(symbols)
        embedded_symbols = embedded_symbols.transpose(1,2)
        feature_map_bigrams = self.dropout(self.bigrams(embedded_symbols))
        feature_map_trigrams = self.dropout(self.trigrams(embedded_symbols))
        pooling1 = feature_map_bigrams.max(2)[0] 
        pooling2 = feature_map_trigrams.max(2)[0]
        concat = torch.cat((pooling1, pooling2), 1)

        concat = torch.cat((concat, X), 1)

        logits = self.second_hidden(concat) 
        logits = self.out(logits)
        return logits

In [212]:
model = CNN_WS_DO(len(word2id), 300, len(symbol2id), 15)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()  

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

losses = []
losses_eval = []
f1s = []
f1s_eval = []

for i in range(5):
    print(f'\nepoch {i}')
    epoch_loss = training2(model, train_iterator, optimizer, criterion)
    print('epoch loss on train:', epoch_loss)
    losses.append(epoch_loss)
    print('\ntest evaluation:')
    f1_on_test, epoch_loss_on_test = evaluate2(model, test_iterator, criterion)
    print('epoch loss on test:', epoch_loss_on_test)
    print('F1 on test:', f1_on_test)
    losses_eval.append(epoch_loss_on_test)
    f1s_eval.append(f1_on_test)


epoch 0
epoch loss on train: 0.6523520427473476

test evaluation:
epoch loss on test: 0.6160269962416755
F1 on test: 0.7055394755469429

epoch 1
epoch loss on train: 0.5728454201408987

test evaluation:
epoch loss on test: 0.565004743470086
F1 on test: 0.6557340463002522

epoch 2
epoch loss on train: 0.5282727764563614

test evaluation:
epoch loss on test: 0.5431005941496955
F1 on test: 0.6726143015755548

epoch 3
epoch loss on train: 0.4957212819142288

test evaluation:
epoch loss on test: 0.5131849845250448
F1 on test: 0.7221371690432231

epoch 4
epoch loss on train: 0.46915717372733556

test evaluation:
epoch loss on test: 0.4983571635352241
F1 on test: 0.7401250110732185


Улучшение: убран Dropout, изменены параметры in/out_chanels и размер символьного эмбеддинга.

In [207]:
class CNN_WS(nn.Module):
    def __init__(self, vocab_size_words, embedding_dim_words,
                 vocab_size_symbols, embedding_dim_symbols):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size_words, embedding_dim_words)
        self.embedding.from_pretrained(torch.tensor(weights), freeze=True)
        self.hidden = nn.Linear(in_features=300, out_features=50)

        self.embedding_symbols = nn.Embedding(vocab_size_symbols, embedding_dim_symbols)
        self.bigrams = nn.Conv1d(in_channels=embedding_dim_symbols, out_channels=50, kernel_size=2, padding='same')
        self.trigrams = nn.Conv1d(in_channels=embedding_dim_symbols, out_channels=50, kernel_size=3, padding='same')
        self.second_hidden = nn.Linear(in_features=150, out_features=1)
        self.out = nn.Sigmoid()

    def forward(self, word, symbols):
        embedded_words = self.embedding(word)
        embedded_words = embedded_words.max(1)[0]
        X = self.hidden(embedded_words)

        embedded_symbols = self.embedding_symbols(symbols)
        embedded_symbols = embedded_symbols.transpose(1,2)
        feature_map_bigrams = self.bigrams(embedded_symbols)
        feature_map_trigrams = self.trigrams(embedded_symbols)
        pooling1 = feature_map_bigrams.max(2)[0] 
        pooling2 = feature_map_trigrams.max(2)[0]
        concat = torch.cat((pooling1, pooling2), 1)

        concat = torch.cat((concat, X), 1)

        logits = self.second_hidden(concat) 
        logits = self.out(logits)
        return logits

In [210]:
model = CNN_WS(len(word2id), 300, len(symbol2id), 15)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()  

model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

losses = []
losses_eval = []
f1s = []
f1s_eval = []

for i in range(5):
    print(f'\nepoch {i}')
    epoch_loss = training2(model, train_iterator, optimizer, criterion)
    print('epoch loss on train:', epoch_loss)
    losses.append(epoch_loss)
    print('\ntest evaluation:')
    f1_on_test, epoch_loss_on_test = evaluate2(model, test_iterator, criterion)
    print('epoch loss on test:', epoch_loss_on_test)
    print('F1 on test:', f1_on_test)
    losses_eval.append(epoch_loss_on_test)
    f1s_eval.append(f1_on_test)


epoch 0
epoch loss on train: 0.6546003433425774

test evaluation:
epoch loss on test: 0.6117788314819336
F1 on test: 0.646149484316508

epoch 1
epoch loss on train: 0.5774652757001727

test evaluation:
epoch loss on test: 0.5631250182787577
F1 on test: 0.6986657116148207

epoch 2
epoch loss on train: 0.5345779094803199

test evaluation:
epoch loss on test: 0.5468138708008661
F1 on test: 0.7337905367215475

epoch 3
epoch loss on train: 0.5021812693121728

test evaluation:
epoch loss on test: 0.5368823170661926
F1 on test: 0.7483912971284654

epoch 4
epoch loss on train: 0.47435664745529044

test evaluation:
epoch loss on test: 0.5165335496266683
F1 on test: 0.7569814072714911
