## Assignment 2.2: Text classification via CNN (50 points)

In this assignment you should perform sentiment analysis of the IMDB reviews based on CNN architecture. Read carefully [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) by Yoon Kim.

In [1]:
!pip install torch==1.6.0
!pip install torchtext==0.7
!pip install numpy
!pip install pandas



In [2]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import datasets
from torchtext.data import Field, LabelField
from torchtext.data import Iterator

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Preparing Data

In [3]:
TEXT = Field(sequential=True, lower=True, batch_first=True)
LABEL = LabelField(batch_first=True)



In [4]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split()



In [5]:
# %%time
TEXT.build_vocab(trn)

In [6]:
LABEL.build_vocab(trn)

### Creating the Iterator


Define an iterator here

In [7]:
train_iter, val_iter, test_iter = Iterator.splits((trn, vld, tst), 
                                                  batch_sizes=(64,64,64), 
                                                  sort_key=lambda x: len(x.text), 
                                                  device='cuda', 
                                                  sort=True,
                                                  sort_within_batch=True,
                                                  repeat=False
                                                 )



### Define CNN-based text classification model (20 points)

Т.к. длинна предложений разная, то есть идея: разворачивать двумерные вектора эмбеддингов в одномерные, но при этом увеличивать размер ядра, паддинг и страйд в D (размерность эмбеддинга слова) раз.

In [8]:
class CNN(nn.Module):
    def __init__(self, V, D, sent_length, kernel_sizes, num_classes, dropout=0.5, filters=100):
        super(CNN, self).__init__()

        self.embed = nn.Embedding(V+1, D, padding_idx=1)

        self.convs = nn.ModuleList()

        for kernel_size in kernel_sizes:
          self.convs.append(nn.Conv1d(in_channels=1, out_channels=filters, kernel_size=kernel_size*D, padding=kernel_size//2*D, stride=D))

        self.ac = nn.ReLU()
        self.pool = F.max_pool1d
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(filters*len(kernel_sizes), num_classes)
        self.sm = nn.Sigmoid()
        
    def forward(self, x):
        x = self.embed(x)
        # print(x.shape)
        x = x.view(x.shape[0], 1, -1)
        # print(x.shape)
        x = [self.ac(conv(x)) for conv in self.convs]

        x = torch.cat([self.pool(x_, x_.shape[2]).squeeze(2) for x_ in x], dim=1)

        x = self.linear(self.dropout(x))
        logit = self.sm(x)
        # print(logit.shape)

        return logit.squeeze(1)

In [9]:
kernel_sizes = [3,4,5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300
n = 2470

model = CNN(vocab_size, dim, n, kernel_sizes, num_classes=1, dropout=dropout)

In [10]:
model.cuda()

CNN(
  (embed): Embedding(201524, 300, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(1, 100, kernel_size=(900,), stride=(300,), padding=(300,))
    (1): Conv1d(1, 100, kernel_size=(1200,), stride=(300,), padding=(600,))
    (2): Conv1d(1, 100, kernel_size=(1500,), stride=(300,), padding=(600,))
  )
  (ac): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=300, out_features=1, bias=True)
  (sm): Sigmoid()
)

### The training loop (10 points)

Define the optimization function and the loss functions.

In [11]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

Think carefully about the stopping criteria. 

In [12]:
epochs = 20

In [13]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter:         
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])
        x = batch.text
        y = batch.label.float()

        opt.zero_grad()
        preds = model(x)
        # print(preds)
        # print('=======================================')
        # print(y)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])

        x = batch.text
        y = batch.label.float()
        
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))



Epoch: 1, Training Loss: 0.010273323764119829, Validation Loss: 0.009711514965693157
Epoch: 2, Training Loss: 0.009726892719949995, Validation Loss: 0.009583226561546325
Epoch: 3, Training Loss: 0.009472737111364092, Validation Loss: 0.009250420832633973
Epoch: 4, Training Loss: 0.00925515822342464, Validation Loss: 0.00915322651863098
Epoch: 5, Training Loss: 0.009072249221801758, Validation Loss: 0.009301971423625946
Epoch: 6, Training Loss: 0.008915570860249656, Validation Loss: 0.008996675491333008
Epoch: 7, Training Loss: 0.008769469048295703, Validation Loss: 0.009068146514892577
Epoch: 8, Training Loss: 0.008640025976725987, Validation Loss: 0.008946968531608581
Epoch: 9, Training Loss: 0.008521641763619013, Validation Loss: 0.009055007334550221
Epoch: 10, Training Loss: 0.008446630115168436, Validation Loss: 0.008988609270254772
Epoch: 11, Training Loss: 0.008376587048598698, Validation Loss: 0.00922734846274058
Epoch: 12, Training Loss: 0.008282399928569793, Validation Loss: 0

### Calculate performance of the trained model (10 points)

In [14]:
def get_metrics(pred, gt):
    print(f'Accuracy: {accuracy_score(gt, pred):.2f}')
    print(f'Precision: {precision_score(gt, pred):.2f}')
    print(f'Recall: {recall_score(gt, pred):.2f}')
    print(f'F1: {f1_score(gt, pred):.2f}')

In [15]:
pred = []
gt = []

for batch in test_iter:
    # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
    # x[:, :batch.text.shape[1]] = batch.text
    # print(x)
    # print(batch.text.shape[1])

    x = batch.text
    y = batch.label.float()
    
    pred += model(x).round().tolist()
    gt += y.tolist()

get_metrics(gt, pred)



Accuracy: 0.84
Precision: 0.93
Recall: 0.79
F1: 0.85


Write down the calculated performance

### Accuracy: 0.83
### Precision: 0.94
### Recall: 0.77
### F1: 0.84

### Experiments (10 points)

Experiment with the model and achieve better results. Implement and describe your experiments in details, mention what was helpful.

### 1. Добавил больше ядер
`kernel_sizes = [3,4,5,6,7]`
#### Accuracy: 0.86
#### Precision: 0.89
#### Recall: 0.83
#### F1: 0.86
### 2. Больше ядер + увеличил размер скрытого слоя + число фильтров в свертках
`kernel_sizes = [3,4,5,6,7]`

`dim = 500`

`filters=150`
#### Accuracy: 0.86
#### Precision: 0.89
#### Recall: 0.85
#### F1: 0.87
### 3. Добавил механизм внимания (Self-Attention)
Работает нестабильно, метрики силно скачут
#### Accuracy: 0.50
#### Precision: 1.00
#### Recall: 0.50
#### F1: 0.67

In [16]:
kernel_sizes = [3,4,5,6,7]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300
n = 2470

model = CNN(vocab_size, dim, n, kernel_sizes, num_classes=1, dropout=dropout, filters=100)

In [17]:
model.cuda()

CNN(
  (embed): Embedding(201524, 300, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(1, 100, kernel_size=(900,), stride=(300,), padding=(300,))
    (1): Conv1d(1, 100, kernel_size=(1200,), stride=(300,), padding=(600,))
    (2): Conv1d(1, 100, kernel_size=(1500,), stride=(300,), padding=(600,))
    (3): Conv1d(1, 100, kernel_size=(1800,), stride=(300,), padding=(900,))
    (4): Conv1d(1, 100, kernel_size=(2100,), stride=(300,), padding=(900,))
  )
  (ac): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=500, out_features=1, bias=True)
  (sm): Sigmoid()
)

In [18]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [19]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter:         
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])
        x = batch.text
        y = batch.label.float()

        opt.zero_grad()
        preds = model(x)
        # print(preds)
        # print('=======================================')
        # print(y)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])

        x = batch.text
        y = batch.label.float()
        
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))



Epoch: 1, Training Loss: 0.010222412603242057, Validation Loss: 0.010115425745646158
Epoch: 2, Training Loss: 0.00965090147767748, Validation Loss: 0.009374126116434733
Epoch: 3, Training Loss: 0.009379771035058158, Validation Loss: 0.009380531819661458
Epoch: 4, Training Loss: 0.00915208581515721, Validation Loss: 0.009241183737913768
Epoch: 5, Training Loss: 0.008982938744340625, Validation Loss: 0.009085426104068756
Epoch: 6, Training Loss: 0.008819983398914337, Validation Loss: 0.009032229089736938
Epoch: 7, Training Loss: 0.008695067395482745, Validation Loss: 0.009321585726737976
Epoch: 8, Training Loss: 0.008578485330513546, Validation Loss: 0.009006008557478587
Epoch: 9, Training Loss: 0.00848745161635535, Validation Loss: 0.009030736911296845
Epoch: 10, Training Loss: 0.008422586875302451, Validation Loss: 0.009055590105056762
Epoch: 11, Training Loss: 0.008351166718346731, Validation Loss: 0.009323565800984701
Epoch: 12, Training Loss: 0.00828600993837629, Validation Loss: 0.

In [20]:
pred = []
gt = []

for batch in test_iter:
    # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
    # x[:, :batch.text.shape[1]] = batch.text
    # print(x)
    # print(batch.text.shape[1])

    x = batch.text
    y = batch.label.float()
    
    pred += model(x).round().tolist()
    gt += y.tolist()

get_metrics(gt, pred)



Accuracy: 0.86
Precision: 0.89
Recall: 0.83
F1: 0.86


In [25]:
kernel_sizes = [3,4,5,6,7]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 500
n = 2470

model = CNN(vocab_size, dim, n, kernel_sizes, num_classes=1, dropout=dropout, filters=150)

In [26]:
model.cuda()

CNN(
  (embed): Embedding(201524, 500, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(1, 150, kernel_size=(1500,), stride=(500,), padding=(500,))
    (1): Conv1d(1, 150, kernel_size=(2000,), stride=(500,), padding=(1000,))
    (2): Conv1d(1, 150, kernel_size=(2500,), stride=(500,), padding=(1000,))
    (3): Conv1d(1, 150, kernel_size=(3000,), stride=(500,), padding=(1500,))
    (4): Conv1d(1, 150, kernel_size=(3500,), stride=(500,), padding=(1500,))
  )
  (ac): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=750, out_features=1, bias=True)
  (sm): Sigmoid()
)

In [27]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [28]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter:         
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])
        x = batch.text
        y = batch.label.float()

        opt.zero_grad()
        preds = model(x)
        # print(preds)
        # print('=======================================')
        # print(y)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])

        x = batch.text
        y = batch.label.float()
        
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))



Epoch: 1, Training Loss: 0.01019511307988848, Validation Loss: 0.009714871831734976
Epoch: 2, Training Loss: 0.009716641112736294, Validation Loss: 0.009441871412595114
Epoch: 3, Training Loss: 0.00948992839881352, Validation Loss: 0.009371318940321604
Epoch: 4, Training Loss: 0.009354449023519243, Validation Loss: 0.009398946475982667
Epoch: 5, Training Loss: 0.009223553809097835, Validation Loss: 0.009362363692124684
Epoch: 6, Training Loss: 0.009088335469790867, Validation Loss: 0.009152149403095245
Epoch: 7, Training Loss: 0.009054400174958366, Validation Loss: 0.009875938284397125
Epoch: 8, Training Loss: 0.008933635129247393, Validation Loss: 0.009517063240210216
Epoch: 9, Training Loss: 0.008871829991681235, Validation Loss: 0.009251532562573751
Epoch: 10, Training Loss: 0.00879664854322161, Validation Loss: 0.009853382921218873
Epoch: 11, Training Loss: 0.008780704346724918, Validation Loss: 0.009905289403597514
Epoch: 12, Training Loss: 0.008697654153619493, Validation Loss: 0

In [29]:
pred = []
gt = []

for batch in test_iter:
    # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
    # x[:, :batch.text.shape[1]] = batch.text
    # print(x)
    # print(batch.text.shape[1])

    x = batch.text
    y = batch.label.float()
    
    pred += model(x).round().tolist()
    gt += y.tolist()

get_metrics(gt, pred)



Accuracy: 0.86
Precision: 0.89
Recall: 0.85
F1: 0.87


In [46]:
class CNN_Attn(nn.Module):
    def __init__(self, V, D, sent_length, kernel_sizes, num_classes, dropout=0.5, filters=100):
        super(CNN_Attn, self).__init__()

        self.embed = nn.Embedding(V+1, D, padding_idx=1)

        self.convs = nn.ModuleList()

        for kernel_size in kernel_sizes:
          self.convs.append(nn.Conv1d(in_channels=1, out_channels=filters, kernel_size=kernel_size*D, padding=kernel_size//2*D, stride=D))

        self.ac = nn.ReLU()
        self.pool = F.max_pool1d        

        self.attn = nn.MultiheadAttention(D, 1)

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(filters*len(kernel_sizes), num_classes)
        self.sm = nn.Sigmoid()
        
    def forward(self, x):
        x = self.embed(x)
        # print(x.shape)
        x = x.view(x.shape[0], 1, -1)
        # print(x.shape)
        x = [self.ac(conv(x)) for conv in self.convs]

        x = torch.cat([self.pool(x_, x_.shape[2]).squeeze(2) for x_ in x], dim=1)

        x = torch.reshape(x, (1, x.shape[0], -1,))
        # print(x.shape)        

        x, _ = self.attn(x, x, x)
        # print(x.shape)

        x = self.linear(self.dropout(x))
        logit = self.sm(x)
        # print(logit.shape)
        return torch.reshape(logit, (-1,))

In [47]:
kernel_sizes = [3,4,5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300
n = 2470

model = CNN_Attn(vocab_size, dim, n, kernel_sizes, num_classes=1, dropout=dropout)

In [48]:
model.cuda()

CNN_Attn(
  (embed): Embedding(201524, 300, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(1, 100, kernel_size=(900,), stride=(300,), padding=(300,))
    (1): Conv1d(1, 100, kernel_size=(1200,), stride=(300,), padding=(600,))
    (2): Conv1d(1, 100, kernel_size=(1500,), stride=(300,), padding=(600,))
  )
  (ac): ReLU()
  (attn): MultiheadAttention(
    (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=300, out_features=1, bias=True)
  (sm): Sigmoid()
)

In [49]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

In [50]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter:         
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])
        x = batch.text
        y = batch.label.float()

        opt.zero_grad()
        preds = model(x)
        # print(preds)
        # print('=======================================')
        # print(y)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:
        
        # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
        # x[:, :batch.text.shape[1]] = batch.text
        # print(x)
        # print(batch.text.shape[1])

        x = batch.text
        y = batch.label.float()
        
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))



Epoch: 1, Training Loss: 0.0108502379826137, Validation Loss: 0.010905509614944457
Epoch: 2, Training Loss: 0.01085207280090877, Validation Loss: 0.010905515670776368
Epoch: 3, Training Loss: 0.010852704375130789, Validation Loss: 0.010905515670776368
Epoch: 4, Training Loss: 0.010852704402378626, Validation Loss: 0.010905515670776368
Epoch: 5, Training Loss: 0.010852704402378626, Validation Loss: 0.010905515670776368
Epoch: 6, Training Loss: 0.010852704804284232, Validation Loss: 0.010905515670776368
Epoch: 7, Training Loss: 0.010852704419408526, Validation Loss: 0.010905515670776368
Epoch: 8, Training Loss: 0.010852704289981297, Validation Loss: 0.010905515670776368
Epoch: 9, Training Loss: 0.010852704920087542, Validation Loss: 0.010905515670776368
Epoch: 10, Training Loss: 0.010852704467092241, Validation Loss: 0.010905515670776368
Epoch: 11, Training Loss: 0.01085270448071616, Validation Loss: 0.010905515670776368
Epoch: 12, Training Loss: 0.010852704426220485, Validation Loss: 0.

In [51]:
pred = []
gt = []

for batch in test_iter:
    # x = torch.ones(batch.text.shape[0], n, dtype=torch.int64).cuda()
    # x[:, :batch.text.shape[1]] = batch.text
    # print(x)
    # print(batch.text.shape[1])

    x = batch.text
    y = batch.label.float()
    
    pred += model(x).round().tolist()
    gt += y.tolist()

get_metrics(gt, pred)



Accuracy: 0.50
Precision: 1.00
Recall: 0.50
F1: 0.67
