In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
first_run = True
if first_run:
  !pip install torchtext
  !pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transforme

In [3]:
import pandas as pd
import re
import torch
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
def clean_tweet(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

In [5]:
twitter_data_train_full = pd.read_csv('/content/drive/MyDrive/DL/Big_Data/Final_Project/data/twitter_train.csv', index_col = 0)
twitter_data_train_full.dropna(axis = 0, inplace = True)
twitter_data_train_full = twitter_data_train_full.sample(frac=1).reset_index(drop=True)
twitter_data_train_full['SentimentText'] = twitter_data_train_full['SentimentText'].apply(clean_tweet).astype(str)
twitter_data_train_full

Unnamed: 0,Sentiment,SentimentText
0,1,congratulations to you and the whole family Th...
1,0,inaperfectworld I would be Scouse and living i...
2,0,CBC ca slide passed over as quot talked about ...
3,0,My chick wont shut up Will only stop chirping ...
4,0,ohmygosh Thats scary did the police have to ta...
...,...,...
99984,1,you should try
99985,1,Does this happen every monday
99986,1,Thank you I ve discovered a talent
99987,0,uh oh bad news


In [6]:
twitter_data_train = twitter_data_train_full.iloc[:, :]

In [7]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
print(len(tokenizer.vocab))

30522


In [9]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.sep_token
unk_token = tokenizer.unk_token
print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [SEP] [UNK]


In [10]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 102 100


In [11]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


In [12]:
def tokenize_and_cut(sentence):
  tokens = tokenizer.tokenize(sentence)
  tokens = tokens[: max_input_length - 2]
  return tokens

In [13]:
twitter_data_train['SentimentText'] = twitter_data_train['SentimentText'].apply(tokenize_and_cut)
twitter_data_train

Unnamed: 0,Sentiment,SentimentText
0,1,"[congratulations, to, you, and, the, whole, fa..."
1,0,"[ina, ##per, ##fect, ##world, i, would, be, sc..."
2,0,"[cbc, ca, slide, passed, over, as, quo, ##t, t..."
3,0,"[my, chick, won, ##t, shut, up, will, only, st..."
4,0,"[oh, ##my, ##gos, ##h, that, ##s, scary, did, ..."
...,...,...
99984,1,"[you, should, try]"
99985,1,"[does, this, happen, every, monday]"
99986,1,"[thank, you, i, ve, discovered, a, talent]"
99987,0,"[uh, oh, bad, news]"


In [14]:
from sklearn.model_selection import train_test_split
twitter_text_train, twitter_text_val, target_train, target_val = train_test_split(twitter_data_train['SentimentText'], twitter_data_train['Sentiment'],train_size = 0.9 , shuffle = True, random_state = random.seed(SEED)) 

In [15]:
def process_data(X, y):
  data = []
  for i in range(len(X)):
    review_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(token) for token in X.iloc[i]], dtype = torch.long)
    target_tensor = torch.tensor([y.iloc[i]], dtype = torch.float)
    data.append((review_tensor, target_tensor))
  return data

In [16]:
train_data = process_data(twitter_text_train, target_train)
val_data = process_data(twitter_text_val, target_val)

In [17]:
BATCH_SIZE = 128
PAD_IDX = pad_token_idx
BOS_IDX = init_token_idx
EOS_IDX = eos_token_idx

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
  review_batch, target_batch = [], []
  for (review_item, target_item) in data_batch:
    review_batch.append(torch.cat([torch.tensor([BOS_IDX]), review_item, torch.tensor([EOS_IDX])], dim = 0))
    target_batch.append(target_item.item())
  review_batch = pad_sequence(review_batch, padding_value = PAD_IDX)
  return review_batch, torch.tensor(target_batch)

In [18]:
train_iter = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True, collate_fn = generate_batch)
val_iter = DataLoader(val_data, batch_size = BATCH_SIZE, shuffle = False, collate_fn = generate_batch)

In [19]:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
import torch.nn as nn

class BERT_LSTM_Sentiment(nn.Module):
  def __init__(self, bert, hidden_dim1, hidden_dim2, output_dim, n_layers, bidirectional = True, dropout = None):

    super(BERT_LSTM_Sentiment, self).__init__()
    self.bert = bert
    embedding_dim = bert.config.to_dict()['hidden_size']

    self.lstm = nn.LSTM(embedding_dim,
                      hidden_dim1,
                      num_layers = n_layers,
                      bidirectional = bidirectional,
                      batch_first = True,
                      dropout = 0 if n_layers < 2 else dropout)
    
    self.fc1 = nn.Linear(hidden_dim1 * 2, hidden_dim2)
    self.fc2 = nn.Linear(hidden_dim2, output_dim)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    with torch.no_grad():
      embedded = self.bert(text)[0]
    
    ls_output, (hidden, cell) = self.lstm(embedded)
    cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1)
    rel = self.relu(cat)
    out = self.fc1(rel)
    out = self.dropout(out)
    out = self.fc2(out)
    output = self.sigmoid(out)

    return output

In [21]:
model = BERT_LSTM_Sentiment(bert,
                           hidden_dim1 = 256,
                           hidden_dim2 = 128,
                           output_dim = 1,
                           n_layers = 4,
                           bidirectional = True,
                           dropout = 0.1)

In [22]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [23]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()

In [24]:
model = model.to(device)
criterion = criterion.to(device)

In [25]:
def accuracy(preds, y):
  rounded_preds = torch.round(preds)
  true_preds = (rounded_preds == y).float()
  acc = true_preds.sum()/len(true_preds)
  return acc

In [26]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0.0
  epoch_acc = 0.0
  model.train()

  for text, labels in iterator:
    optimizer.zero_grad()
    text, labels = text.to(device), labels.to(device)

    predictions = model(text.T).squeeze(1)
    loss = criterion(predictions, labels)

    acc = accuracy(predictions, labels)
    loss.backward()

    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator) , epoch_acc / len(iterator)

In [27]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0.0
  epoch_acc = 0.0
  model.eval()
  predictions_batches = torch.tensor([], dtype = torch.float, device = 'cuda')
  label_batches = torch.tensor([], dtype = torch.float, device = 'cuda')
  with torch.no_grad():
    for text, labels in iterator:
      text, labels = text.to(device), labels.to(device)
      predictions = model(text.T).squeeze(1)
      loss = criterion(predictions, labels)
      
      predictions_batches = torch.cat([predictions_batches, predictions], dim = 0)
      label_batches = torch.cat([label_batches, labels], dim = 0)

      acc = accuracy(predictions, labels)

      epoch_loss += loss.item()
      epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator), predictions_batches.to('cpu'), label_batches.to('cpu')

In [28]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
train_model = False

if train_model:
  N_EPOCHS = 50

  best_val_acc = 0.0

  for epoch in range(N_EPOCHS):
    
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if val_acc > best_val_acc:
      best_val_acc = val_acc
      torch.save(model.state_dict(), '/content/drive/MyDrive/DL/Big_Data/Final_Project/models/bert_lstm_model_v2.pt')


    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}')
    print(f'\t Test Loss: {val_loss:.3f} |  Test Acc: {val_acc:.3f}')

In [30]:
path = '/content/drive/MyDrive/DL/Big_Data/Final_Project/models'

In [35]:
model.load_state_dict(torch.load(path + '/bert_lstm_model.pt', map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

<All keys matched successfully>

In [36]:
val_loss, val_acc, predictions, labels = evaluate(model, val_iter, criterion)
print(f'Test Loss = {val_loss:.3f}, Test Accuracy = {val_acc:.3f}')

Test Loss = 348.202, Test Accuracy = 0.428


In [37]:
results = {
    'preds' : predictions.numpy(),
    'labels' : labels.numpy()
}

In [38]:
results_bert_lstm = pd.DataFrame(results)
results_bert_lstm.to_csv('results_bert_lstm_v1.csv')
results_bert_lstm

Unnamed: 0,preds,labels
0,3.698591e-09,1.0
1,3.692931e-09,0.0
2,3.690995e-09,1.0
3,3.729442e-09,1.0
4,4.052427e-09,1.0
...,...,...
9994,3.622789e-09,1.0
9995,3.613052e-09,1.0
9996,3.643425e-09,1.0
9997,3.708651e-09,1.0
