In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

  import pandas.util.testing as tm


In [3]:
base_path = "/content/drive/My Drive/IEEE-BigMM"

train_csv = base_path + "/mydata/sarcasm_train.csv"
val_csv = base_path + "/mydata/sarcasm_val.csv"

train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)

train_df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,Sarcasm
0,2725,1.0551e+18,rakhi sawant full interview tanushree dutta de...,0.0
1,1261,1.05137e+18,ption no2,0.0
2,7072,1.04992e+18,,0.0
3,2448,1.05502e+18,ption no2,1.0
4,6188,1.05223e+18,ption no2,0.0


In [4]:
print("Train shape: ", train_df.shape)
print("Val shape: ", val_df.shape)

Train shape:  (6382, 4)
Val shape:  (1596, 4)


In [5]:
import torch
from torchtext import data

In [6]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [7]:
fields = [(None, None), (None, None), ('text', TEXT), ('target', LABEL)]

In [8]:
train_data, val_data = data.TabularDataset.splits(
      path = base_path+'/mydata',
      train = 'sarcasm_train.csv',
      validation = 'sarcasm_val.csv',
      format = 'csv',
      fields = fields,
      skip_header = True
)

In [9]:
print("# of training data: ", len(train_data))
print("# of val data: ", len(val_data))

# of training data:  6382
# of val data:  1596


In [10]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [11]:
print("# unique tokens in TEXT: ", len(TEXT.vocab))
print("# unique tokens in LABEL: ", len(LABEL.vocab))

# unique tokens in TEXT:  14476
# unique tokens in LABEL:  2


In [12]:
print(TEXT.vocab.freqs.most_common(10))

[('metoo', 5315), ('woman', 1019), ('no2', 915), ('ption', 886), ('movement', 827), ('sexual', 627), ('amp', 496), ('metooindia', 450), ('harassment', 404), (' ', 363)]


In [13]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'metoo', 'woman', 'no2', 'ption', 'movement', 'sexual', 'amp', 'metooindia']


In [14]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, val_iterator = data.BucketIterator.splits(
        (train_data, val_data),
        batch_size = BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        device = device
)

In [15]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()

    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, text):
    embedded = self.embedding(text)
    output, hidden = self.rnn(embedded)

    assert torch.equal(output[-1,:,:], hidden.squeeze(0))

    return self.fc(hidden.squeeze(0))

In [16]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [17]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [18]:
criterion = nn.BCEWithLogitsLoss()

In [19]:
model = model.to(device)
criterion = criterion.to(device)

In [20]:
def binary_accuracy(preds, y):
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == y).float()
  acc = correct.sum() / len(correct)
  return acc

In [21]:
from sklearn.metrics import roc_auc_score

def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  epoch_roc = 0

  model.train()

  for batch in iterator:
    optimizer.zero_grad()

    predictions = model(batch.text).squeeze(1)
    
    loss = criterion(predictions, batch.target)
    loss.backward()
    
    acc = binary_accuracy(predictions, batch.target)
    # roc = roc_auc_score(batch.target, torch.sigmoid(predictions).detach().numpy())

    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()
    # epoch_roc += roc
  
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
def validate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  epoch_roc = 0
   
  model.eval()

  with torch.no_grad():
    for batch in iterator:

      predictions = model(batch.text).squeeze(1)

      loss = criterion(predictions, batch.target)
      acc = binary_accuracy(predictions, batch.target)
      # roc = roc_auc_score(batch.target, torch.sigmoid(predictions).detach().numpy())

      epoch_loss += loss.item()
      epoch_acc += acc.item()
      # epoch_roc += roc
    
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
N_EPOCHS = 10

best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
  start_time = time.time()

  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  val_loss, val_acc = validate(model, val_iterator, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  
  if(val_loss < best_val_loss):
    best_val_loss = val_loss
    torch.save(model.state_dict(), base_path + '/models/rnn.pt')

  if((epoch+1) % 2 == 0):
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%')

Epoch: 02 | Time: 0m 1s
	Train Loss: 0.107 | Train Acc: 97.88%
	Val Loss: 0.290 | Val Acc: 96.31%
Epoch: 04 | Time: 0m 1s
	Train Loss: 0.107 | Train Acc: 97.82%
	Val Loss: 0.289 | Val Acc: 96.37%
Epoch: 06 | Time: 0m 1s
	Train Loss: 0.107 | Train Acc: 97.83%
	Val Loss: 0.288 | Val Acc: 96.37%
Epoch: 08 | Time: 0m 1s
	Train Loss: 0.107 | Train Acc: 97.83%
	Val Loss: 0.287 | Val Acc: 96.50%
Epoch: 10 | Time: 0m 1s
	Train Loss: 0.106 | Train Acc: 97.88%
	Val Loss: 0.286 | Val Acc: 96.50%
