In [1]:
device='cuda'

In [2]:
import os
import numpy as np
import pandas as pd
import math
from google.colab import userdata
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
from transformers import BertTokenizer

## Download Dataset and Preprocess it

In [3]:
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [4]:
!kaggle competitions download -c nlp-getting-started

Downloading nlp-getting-started.zip to /content
100% 593k/593k [00:00<00:00, 757kB/s]
100% 593k/593k [00:00<00:00, 757kB/s]


In [5]:
!mkdir '/content/Dataset'

In [6]:
!unzip "/content/nlp-getting-started.zip" -d "/content/Dataset"

Archive:  /content/nlp-getting-started.zip
  inflating: /content/Dataset/sample_submission.csv  
  inflating: /content/Dataset/test.csv  
  inflating: /content/Dataset/train.csv  


In [7]:
train_df = pd.read_csv('/content/Dataset/train.csv')

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
class TwitterDisasterDataset(Dataset):
    def __init__(self, file_path, min_freq=4, max_length=64):
        train_df = pd.read_csv(file_path)

        corpus = train_df.iloc[:, 3].tolist()
        self.X = torch.tensor(tokenizer(corpus, truncation=True, padding=True, max_length=max_length)['input_ids'])
        self.y = torch.tensor(train_df.iloc[:, 4])

    def __len__(self):
        return len(self.y)


    def __getitem__(self, index):
        return (self.X[index], self.y[index])


In [12]:
full_dataset = TwitterDisasterDataset('/content/Dataset/train.csv')

In [13]:
vocab_size = len(tokenizer.vocab)

In [50]:
train_size = int(0.95 * len(full_dataset))
val_size = len(full_dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)

## build model

In [51]:
def create_positional_encoding(max_length, d_model):
    assert d_model % 2 == 0, "Dimension model must be even"

    pos = torch.arange(0, max_length).unsqueeze(1) # (max_length, 1)
    pos_expanded = pos.repeat(1, d_model // 2) # (max_length, d_model // 2)

    power = torch.arange(0, d_model, 2).float() / d_model
    div_term = torch.pow(10000, power).unsqueeze(0) # (1, d_model // 2)
    div_term_expanded = div_term.repeat(max_length, 1)  # (max_length, d_model // 2)

    pe = torch.zeros(max_length, d_model) # (max_length, d_model)
    pe[:, 0::2] = torch.sin(pos_expanded / div_term_expanded) # (max_length, d_model // 2)
    pe[:, 1::2] = torch.cos(pos_expanded / div_term_expanded) # (max_length, d_model // 2)

    return pe

class Embedding(nn.Module):
  def __init__(self, vocab_size, max_length, d_model):
    super(Embedding, self).__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
    self.pos_encoding = create_positional_encoding(max_length, d_model).to(device) # (seq_length, d_model)

  def forward(self, x):
    """ Apply embedding and positional encoding to the input

    Input:
      x: (N, seq_length)
    Output:
      x: (N, seq_length, d_model)
    """
    # apply embedding
    x = self.embedding(x)
    # add positional encoding
    x += self.pos_encoding.unsqueeze(0)
    return x


class SelfAttention(nn.Module):
  def __init__(self, d_model, num_heads, dropout):
    super(SelfAttention, self).__init__()
    self.mha = nn.MultiheadAttention(embed_dim=d_model, num_heads=num_heads, dropout=dropout, batch_first=True)
    self.layer_norm = nn.LayerNorm(d_model)


  def forward(self, x):
    attn_output, _ = self.mha(query=x, key=x, value=x)
    x = self.layer_norm(x + attn_output)
    return x



class FeedForward(nn.Module):
  def __init__(self, d_model, dropout):
    super(FeedForward, self).__init__()
    self.seq = nn.ModuleList([
        nn.Linear(d_model, 2 * d_model),
        nn.ReLU(),
        nn.Linear(2 * d_model, d_model),
        nn.Dropout(dropout)
    ])

    self.layernorm = nn.LayerNorm(d_model)

  def forward(self, x):
    original_x = x
    for layer in self.seq:
      x = layer(x)
    out = x + original_x
    return self.layernorm(out)

class Encoder_Layer(nn.Module):
  def __init__(self, d_model, num_heads, dropout):
    super(Encoder_Layer, self).__init__()
    self.self_attention = SelfAttention(d_model, num_heads, dropout)
    self.ff = FeedForward(d_model, dropout)

  def forward(self, x):
    x = self.self_attention(x)
    x = self.ff(x)
    return x


class Encoder(nn.Module):
  def __init__(self, vocab_size, max_length, d_model, num_heads, num_layers, dropout):
    super(Encoder, self).__init__()
    self.max_length = max_length
    self.d_model = d_model
    self.embedding = Embedding(vocab_size, max_length, d_model)
    self.encoder_layers = nn.ModuleList([
        Encoder_Layer(d_model, num_heads, dropout) for _ in range(num_layers)
    ])
    self.classifciation1 = nn.Linear(max_length*d_model, 1000)
    self.classifciation2 = nn.Linear(1000, 1)

  def forward(self, x):
    x = self.embedding(x)
    for enc_layer in self.encoder_layers:
      x = enc_layer(x)

    x = x.view(x.shape[0], -1)
    x = self.classifciation1(x)
    x = self.classifciation2(x)

    return torch.sigmoid(x)

In [60]:
model = Encoder(vocab_size=30522, max_length=64, d_model=8, num_heads=2, num_layers=2, dropout=0.5)
model = model.to(device)

In [86]:
class LSTMClassifier(nn.Module):
  def __init__(self, seq_length, vocab_size, embedding_dim, hidden_dim):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    self.dropout = nn.Dropout(0.8)
    self.dense1 = nn.Linear(seq_length*hidden_dim, 1)


  def forward(self, x):
    '''
    Inputs:
      x: (N, seq_length)
    '''
    x = self.embedding(x) # (N, seq_length, embedding_dim)
    x, _ = self.lstm(x) # (N, seq_length, hidden_dim)
    x = x.reshape(x.shape[0], -1) # (N, seq_length*hidden_dim)
    x = self.dropout(x)
    x = torch.sigmoid(self.dense1(x)) # (N, 1)
    return x

In [172]:
model = LSTMClassifier(seq_length=64, vocab_size=30522, embedding_dim=2, hidden_dim=2)
model = model.to(device)

## Train

In [186]:
loss_fn = nn.BCELoss()
opt = optim.Adam(model.parameters(), lr=0.001)

In [187]:
def get_val_loss():
  size = len(val_dataloader)
  model.eval()
  total_val_loss = 0
  with torch.no_grad():
    for X, y in val_dataloader:
      X = X.to(device)
      y = y.reshape(y.shape[0], -1).float()
      y = y.to(device)
      pred = model(X)
      loss = loss_fn(pred, y)
      total_val_loss = loss
      total_val_loss += (loss / size)

  return total_val_loss

In [188]:
def save_checkpoint(state, filename="checkpoint.pth.tar"):
  torch.save(state, filename)

In [189]:
!mkdir '/content/Checkpoints'

mkdir: cannot create directory ‘/content/Checkpoints’: File exists


In [190]:
def train_one_epoch(i):
  for batch_idx, (X, y) in enumerate(train_dataloader):
    model.train()
    opt.zero_grad()
    X = X.to(device)
    y = y.reshape(y.shape[0], -1).float()
    y = y.to(device)

    pred = model(X)
    loss = loss_fn(pred, y)
    loss.backward()
    opt.step()

    if batch_idx % 100 == 0 or batch_idx == len(train_dataloader)-1:
      val_loss = get_val_loss()
      print(f"[Epoch {i} - Batch {batch_idx}] Train Loss: {loss} Val Loss: {val_loss}")

      # save checkpoints
      save_checkpoint({
                'epoch': i + 1,
                'state_dict': model.state_dict(),
                'optimizer' : opt.state_dict(),
            }, filename=f"/content/Checkpoints/checkpoint_epoch_{i}_batch_{batch_idx}.pth.tar")


In [191]:
for i in range(100):
  train_one_epoch(i+1)

[Epoch 1 - Batch 0] Train Loss: 0.5612155199050903 Val Loss: 0.7310085296630859
[Epoch 1 - Batch 100] Train Loss: 0.604354977607727 Val Loss: 0.7401944994926453
[Epoch 1 - Batch 200] Train Loss: 0.6899132132530212 Val Loss: 0.6753932237625122
[Epoch 1 - Batch 225] Train Loss: 0.6357384920120239 Val Loss: 0.6687198877334595
[Epoch 2 - Batch 0] Train Loss: 0.5832513570785522 Val Loss: 0.5927220582962036
[Epoch 2 - Batch 100] Train Loss: 0.6256217956542969 Val Loss: 0.7224656939506531
[Epoch 2 - Batch 200] Train Loss: 0.6243176460266113 Val Loss: 0.6535767316818237
[Epoch 2 - Batch 225] Train Loss: 0.6388856768608093 Val Loss: 0.6532761454582214
[Epoch 3 - Batch 0] Train Loss: 0.6109658479690552 Val Loss: 0.7886731028556824
[Epoch 3 - Batch 100] Train Loss: 0.6320313215255737 Val Loss: 0.5904964208602905
[Epoch 3 - Batch 200] Train Loss: 0.7110719680786133 Val Loss: 0.730602502822876
[Epoch 3 - Batch 225] Train Loss: 0.6836665272712708 Val Loss: 0.6141193509101868
[Epoch 4 - Batch 0] Trai

## Eval

In [193]:
checkpoint = torch.load('/content/Checkpoints/checkpoint_epoch_70_batch_0.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
opt.load_state_dict(checkpoint['optimizer'])

In [194]:
test_df = pd.read_csv('/content/Dataset/test.csv')

In [195]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [196]:
test_corpus = test_df.iloc[:, 3]
padded_text = torch.tensor(tokenizer(test_corpus.tolist(), truncation=True, padding=True, max_length=64)['input_ids'])
padded_text=padded_text.to(device)
test_pred = model(padded_text)
target = test_pred.reshape(3263).tolist()
id = np.arange(3263)


In [197]:
real_p = [1 if p > 0.5 else 0 for p in target]
result_df = pd.DataFrame({'id': test_df.iloc[:,0].tolist(), 'target': real_p})
result_df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,1


In [198]:
result_df.to_csv('result.csv', index=False)