In [None]:
#!pip install torchtext==0.11.2

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue May 24 16:44:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch
import torchtext
torch.__version__, torchtext.__version__

('1.10.2+cu102', '0.11.2')

In [None]:
import pandas as pd
from collections import Counter
import nltk
nltk.download('punkt')
import time
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

import torch.optim as optim
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
from torchtext.legacy import data

from google.colab import drive 
# Mount Google Drive (Colab-only) so the project data stored there is readable.
drive.mount('/content/gdrive')
# Directory holding the dataset. The path is relative, so it only resolves when
# the working directory is the one containing the `gdrive` mount point
# (presumably /content on Colab -- TODO confirm).
PATH = "gdrive/My Drive/project_data/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/gdrive


In [None]:
from torchtext.vocab import vocab
from torch.utils.data import DataLoader
from itertools import combinations
from torchtext.vocab import GloVe

In [None]:
# Load the pretrained GloVe "6B" word vectors through torchtext. The first run
# downloads the archive into .vector_cache (see the cell output). No `dim` is
# passed, so the library default is used; the batches printed later in this
# notebook are 300 wide.
glove = GloVe(name='6B')

.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                           
100%|█████████▉| 399999/400000 [00:34<00:00, 11684.25it/s]


In [None]:
# Single source of truth for every random number generator seeded below.
SEED = 1515

# Python's built-in `random` module is not imported in this notebook, so it is
# not seeded here; add `import random; random.seed(SEED)` if it is ever used.
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Use SEED rather than a repeated literal so the CPU and CUDA generators
    # cannot drift apart when the seed is changed.
    torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression: displays the selected device as the cell output.
device

device(type='cuda')

In [None]:
# Load the raw training data from the mounted Drive folder. Later cells read
# the 'comment_text' and 'target' columns from this frame.
df = pd.read_csv(PATH+'train.csv')

In [None]:
# Binarise the continuous 'target' score: strictly greater than 0.5 -> 1, else 0.
# NOTE(review): a score of exactly 0.5 is labelled non-toxic -- confirm that the
# strict inequality is intended.
df['toxic'] = np.where(df['target'] > 0.5, 1, 0)
# Keep only the text and the binary label.
# NOTE(review): this rebinds `data`, which was previously imported via
# `from torchtext.legacy import data`; the torchtext module is no longer
# reachable under that name after this cell runs.
data = df[['comment_text','toxic']]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. Passing random_state pins the split
# to SEED, so re-running this cell (not just a full Restart & Run All) always
# yields the same train/validation partition.
train, val = train_test_split(data, test_size=0.2, random_state=SEED)

In [None]:
def custom_tokenize(text):
    """Lower-case `text` and split it into word tokens with NLTK.

    Args:
        text: The raw comment. Anything that is not a non-blank string
            (None, NaN from pandas, '', whitespace only) is treated as empty.

    Returns:
        list[str]: The tokens, or an empty list when there is nothing to
        tokenize. The return type is always a list, so callers never have to
        distinguish a string from a token sequence.
    """
    # Missing CSV cells arrive as float NaN, which is truthy and has no
    # .lower(); guard on the type instead of relying on truthiness alone.
    if not isinstance(text, str) or not text.strip():
        return []
    return nltk.word_tokenize(text.lower())

In [None]:
# Convert each split from a DataFrame into a plain list of records, one per row
# and without the index, so a DataLoader can index it and the collate function
# can unpack each item as (comment_text, toxic).
# NOTE(review): `train` and `val` are rebound from DataFrames to lists, so this
# cell cannot be re-run on its own (a list has no `.to_records`); re-run the
# split cell first.
train = list(train.to_records(index=False))
val = list(val.to_records(index=False))

In [None]:
def collate_into_cbow(batch):
    """Collate (text, label) pairs into averaged GloVe vectors and a label tensor.

    Each comment is tokenized with `custom_tokenize`, its tokens are looked up
    in the notebook-level `glove` vectors, and the token vectors are averaged
    into a single continuous bag-of-words row.

    Args:
        batch: Iterable of (text, label) pairs as produced by the DataLoader.

    Returns:
        tuple: (features, labels), both moved to the notebook-level `device`.
        `features` has one row per example and `glove.dim` columns; `labels`
        has one entry per example.
    """
    label_vec = []
    rows = []
    for txt, l in batch:
        label_vec.append(l)
        tokenized = custom_tokenize(txt)
        if tokenized:
            vecs = glove.get_vecs_by_tokens(tokenized)
            rows.append(vecs.mean(dim=0))
        else:
            # A comment with no tokens (empty or whitespace-only text) has
            # nothing to average and would make the lookup fail, so it is
            # represented by an all-zero vector of the embedding width.
            rows.append(torch.zeros(glove.dim))

    # Stack once at the end instead of concatenating inside the loop, which
    # re-copied the growing tensor for every example.
    cbow_vec = torch.stack(rows)
    labels = torch.tensor(label_vec)
    return cbow_vec.to(device), labels.to(device)

In [None]:
# Training batches: 64 examples each, collated into averaged GloVe vectors.
train_cbow = DataLoader(train, batch_size=64, shuffle=False,
                        collate_fn=collate_into_cbow)

# Validation batches must be drawn from the held-out `val` split. Building this
# loader from `train` would make every "validation" metric a training metric.
val_cbow = DataLoader(val, batch_size=64, shuffle=False,
                      collate_fn=collate_into_cbow)

# Sanity check: print the feature/label shapes of the first three batches.
for idx, (lt, tt) in enumerate(train_cbow):
    print(idx, lt.shape, tt.shape)
    if idx == 2: break

0 torch.Size([64, 300]) torch.Size([64])
1 torch.Size([64, 300]) torch.Size([64])
2 torch.Size([64, 300]) torch.Size([64])


In [None]:
from torch import nn
import torch.nn.functional as F

# task 6
class cBoWClassifier(nn.Module):
    """Two-layer feed-forward classifier over a bag-of-words style vector.

    The input is projected to 50 hidden units, passed through tanh, and then
    mapped to `num_labels` outputs that are returned as log-probabilities.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the layers.

        Args:
            num_labels: Number of output classes.
            vocab_size: Width of each input vector (the number of input
                features fed to the first linear layer).
        """
        super().__init__()
        hidden_units = 50
        self.linear = nn.Linear(in_features=vocab_size, out_features=hidden_units)
        self.hidden = nn.Linear(in_features=hidden_units, out_features=num_labels)
        self.nonlinearity = nn.Tanh()

    def forward(self, bow_vec):
        """Return per-class log-probabilities for a batch of input vectors.

        The softmax is taken over dim 1, so `bow_vec` is expected to be a
        2-D batch with one example per row.
        """
        activated = self.nonlinearity(self.linear(bow_vec))
        logits = self.hidden(activated)
        return F.log_softmax(logits, dim=1)

In [None]:
# NOTE(review): BATCH_SIZE is not referenced anywhere else in the visible
# notebook; the DataLoaders hard-code batch_size=64 themselves.
BATCH_SIZE = 64
# Binary task: label 0 vs label 1 (the 'toxic' column).
num_labels = 2
# Width of each input vector handed to the classifier. Despite the name this
# is a feature width, not a vocabulary count; it matches the 300-wide batches
# printed by the DataLoader sanity check.
vocab_size = 300
# Build the classifier and move its parameters to the selected device.
model = cBoWClassifier(num_labels, vocab_size).to(device)

In [None]:
def get_accuracy(dataloader, net=None):
    """Evaluate a classifier on `dataloader` and return accuracy and precision.

    Label 1 is treated as the positive class. Precision is the share of
    examples predicted as 1 whose true label is also 1.

    Args:
        dataloader: Iterable of (features, labels) batches.
        net: Model to evaluate. Defaults to the notebook-level `model`, which
            keeps existing `get_accuracy(loader)` calls working.

    Returns:
        tuple[float, float]: (accuracy, precision) as plain Python floats, each
        in [0, 1]. Accuracy is 0.0 when the loader yields no examples and
        precision is 0.0 when the model predicts no positives, instead of
        dividing by zero.
    """
    net = model if net is None else net
    net.eval()

    correct_pred = 0
    total_samples = 0
    true_pos = 0
    predicted_pos = 0
    with torch.no_grad():
        for text, label in dataloader:
            predictions = torch.argmax(net(text), dim=1)
            total_samples += text.size(0)
            correct_pred += torch.eq(predictions, label).sum().item()

            # Precision needs the positives that were predicted correctly, not
            # the overall number of correct predictions.
            positive_mask = predictions == 1
            predicted_pos += positive_mask.sum().item()
            true_pos += (positive_mask & (label == 1)).sum().item()

    accuracy = correct_pred / total_samples if total_samples else 0.0
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    return (accuracy, precision)

In [None]:
import time

# Negative log-likelihood loss, applied to the log-probabilities the model emits.
loss_function = torch.nn.NLLLoss()


def train_an_epoch(dataloader):
    """Run one optimisation pass over every batch in `dataloader`.

    Uses the notebook-level `model`, `optimizer` and `loss_function`. For each
    batch the gradients are cleared, the loss is computed and back-propagated,
    and the optimizer takes one step. The current batch loss is printed every
    5000 iterations (iteration 0 is skipped).

    Args:
        dataloader: Iterable of (features, labels) batches.
    """
    model.train()  # switch the module to training mode
    report_every = 5000

    for idx, batch in enumerate(dataloader):
        features, targets = batch

        model.zero_grad()
        loss = loss_function(model(features), targets)
        loss.backward()
        optimizer.step()

        is_report_step = idx > 0 and idx % report_every == 0
        if is_report_step:
            print(f'At iteration {idx} the loss is {loss:.3f}.')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

EPOCHS = 3  # number of full passes over the training loader
# NOTE(review): lr=3 is unusually large for plain SGD, and the logged batch
# loss swings widely between reports -- confirm this value is intentional.
optimizer = torch.optim.SGD(model.parameters(), lr=3)

# Per-epoch validation metrics, collected for plotting below.
accuracies = []
precisions = []
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(train_cbow)
    accuracy, precision = get_accuracy(val_cbow)
    accuracies.append(accuracy)
    precisions.append(precision)
    time_taken = time.time() - epoch_start_time
    print()
    print(f'After epoch {epoch} the validation accuracy is {accuracy:.3f}.')
    print(f'After epoch {epoch} the validation precision is {precision:.3f}.')
    # The elapsed time was measured but never shown; report it with the metrics.
    print(f'Epoch {epoch} took {time_taken:.1f} seconds.')
    print()

# Label the figure so it stands alone when the notebook is skimmed; the
# trailing semicolon suppresses the matplotlib repr in the cell output.
fig, ax = plt.subplots()
ax.plot(range(1, EPOCHS + 1), accuracies, marker='o')
ax.set(xlabel='Epoch', ylabel='Validation accuracy',
       title='cBoW classifier: validation accuracy per epoch')
ax.set_xticks(range(1, EPOCHS + 1));

At iteration 5000 the loss is 0.928.
At iteration 10000 the loss is 0.670.
At iteration 15000 the loss is 4.736.
At iteration 20000 the loss is 1.292.

After epoch 1 the validation accuracy is 0.908.
After epoch 1 the validation precision is 12.154.

At iteration 5000 the loss is 0.737.
At iteration 10000 the loss is 0.132.
At iteration 15000 the loss is 4.434.
At iteration 20000 the loss is 1.366.

After epoch 2 the validation accuracy is 0.941.
After epoch 2 the validation precision is 12820.066.

At iteration 5000 the loss is 2.419.
At iteration 10000 the loss is 0.284.
At iteration 15000 the loss is 4.343.
At iteration 20000 the loss is 1.180.
