In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re
import torch
import torch.nn as nn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import numpy
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm import tqdm
import locale
locale.getpreferredencoding = lambda : "UTF-8"

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
def clean_tweet(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

In [4]:
twitter_data_train = pd.read_csv('/content/drive/MyDrive/DL/Big_Data/Final_Project/data/twitter_train.csv', index_col = 0)
twitter_data_train.dropna(axis = 0, inplace = True)
twitter_data_train = twitter_data_train.sample(frac=1).reset_index(drop=True)
twitter_data_train['SentimentText'] = twitter_data_train['SentimentText'].apply(clean_tweet).astype(str)
twitter_data_train

Unnamed: 0,Sentiment,SentimentText
0,0,Couldn t get the page to open
1,1,yes you can
2,1,KevinJonas He will never be in the corner He i...
3,1,Oh I saw it I d been watching for it since I p...
4,0,our bad we get a lot of content from winksound...
...,...,...
99984,0,Me too
99985,0,ha Gabbyb said you can t get into chat cause t...
99986,1,Beavers i love you papa
99987,1,Hope the tea toast and marmite does the trick ...


In [5]:
from collections import Counter

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

import io

In [6]:
def get_frequent(tokens, threshold = 10):
  frequency = {token :  0 for token in set(tokens)}
  for token in tokens:
    frequency[token] += 1
  
  frequent_tokens = []
  for token in tokens:
    if frequency[token] >= threshold:
      frequent_tokens += [token]
  frequent_tokens += ['<unk>', '<pad>', '<bos>', '<eos>']
  return frequent_tokens, len(set(frequent_tokens))

In [7]:
text = []

for tweet in twitter_data_train['SentimentText']:
  for word in tweet.split(' '):
    text.append(word.lower())

In [8]:
tokens, num_tokens = get_frequent(text)

print(f'There as {len(tokens)} total tokens and there are {num_tokens} unique tokens')

There as 1210759 total tokens and there are 6568 unique tokens


In [9]:
def mapping(tokens):
  word_to_id = {}
  id_to_word = {}

  for i, token in enumerate(set(tokens)):
    word_to_id[token] = i
    id_to_word[i] = token

  return word_to_id, id_to_word

In [10]:
WORD_TO_ID, ID_TO_WORD = mapping(tokens)

In [11]:
def process_data(raw_data):
  data = []
  for (raw_text, sentiment) in zip(raw_data['SentimentText'], raw_data['Sentiment']):
    text_tensor = torch.tensor([WORD_TO_ID[word.lower()] if word.lower() in WORD_TO_ID else WORD_TO_ID['<unk>']for word in raw_text.split(' ')], dtype = torch.long)
    target_tensor = torch.tensor([sentiment], dtype = torch.float)
    data.append((text_tensor, target_tensor))
  return data

In [12]:
train_split_df = twitter_data_train.iloc[ : int(len(twitter_data_train) * 0.9), :]
val_split_df = twitter_data_train.iloc[int(len(twitter_data_train) * 0.9) :, :]
train_data = process_data(train_split_df)
val_data = process_data(val_split_df)

In [69]:
text, label = train_data[7]

for token_idx in text:
  print(ID_TO_WORD[int(token_idx)], end = ' ')

benson hey amber did you hear i got your book now i got it last night and couldn t put it down but i had to go to sleep 

In [70]:
BATCH_SIZE = 32
PAD_IDX = WORD_TO_ID['<pad>']
BOS_IDX = WORD_TO_ID['<bos>']
EOS_IDX = WORD_TO_ID['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
  text_batch, target_batch = [], []
  for (text_item, target_item) in  data_batch:
    text_batch.append(torch.cat([torch.tensor([BOS_IDX]), text_item, torch.tensor([EOS_IDX])], dim = 0))
    target_batch.append(target_item)
  text_batch = pad_sequence(text_batch, padding_value = PAD_IDX)
  return text_batch, torch.tensor([target_batch])

In [71]:
train_iter = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True, collate_fn = generate_batch)
val_iter = DataLoader(val_data, batch_size = BATCH_SIZE, shuffle = False, collate_fn = generate_batch)

In [72]:
print(len(train_iter))
print(len(val_iter))

2813
313


In [73]:
token_text, target = next(iter(train_iter))
print(token_text.shape, target.shape)

torch.Size([29, 32]) torch.Size([1, 32])


In [74]:
class SentimentClassifier(nn.Module):

  def __init__(self):
    super(SentimentClassifier, self).__init__()

    EMB_DIM = 512
    HIDDEN_SIZE_1 = 256
    HIDDEN_SIZE_2 = 128
    OUTPUT_DIM = 1
    LAYERS = 4
    DROPOUT = 0.1

    self.emb = nn.Embedding(len(WORD_TO_ID), EMB_DIM)
    #self.encoder = nn.GRU(EMB_DIM, HIDDEN_SIZE_1, LAYERS // 2, bidirectional = True) #GRU
    self.encoder = nn.LSTM(EMB_DIM, HIDDEN_SIZE_1, LAYERS, bidirectional = True) #LSTM

    self.fc1 = nn.Linear(HIDDEN_SIZE_1 * 2, HIDDEN_SIZE_2)
    self.fc2 = nn.Linear(HIDDEN_SIZE_2, OUTPUT_DIM)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(DROPOUT)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    embedding = self.emb(x)
    enc_output, (enc_hidden, enc_cell_state) = self.encoder(embedding)
    cat = torch.cat((enc_hidden[-2, :, :], enc_hidden[-1, :, :]), dim = 1)
    rel = self.relu(cat)
    out = self.fc1(rel)
    out = self.dropout(out)
    out = self.fc2(out)
    out = self.sigmoid(out)
    return out

In [75]:
test_model = SentimentClassifier()

text, target = next(iter(train_iter))
print(test_model(text).T.shape)
print(target.shape)

torch.Size([1, 32])
torch.Size([1, 32])


In [76]:
model = SentimentClassifier()
model.to(device)

SentimentClassifier(
  (emb): Embedding(6568, 512)
  (encoder): LSTM(512, 256, num_layers=4, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (sigmoid): Sigmoid()
)

In [77]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()

In [78]:
def accuracy(preds, y):
  rounded_preds = torch.round(preds)
  true_preds = (rounded_preds == y).float()
  acc = true_preds.sum()/len(true_preds)
  return acc

In [79]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0.0
  epoch_acc = 0.0
  model.train()

  for text, labels in iterator:
    optimizer.zero_grad()
    text, labels = text.to(device), labels.to(device)

    predictions = model(text).T
    loss = criterion(predictions, labels)

    acc = accuracy(predictions, labels)
    loss.backward()

    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator) , epoch_acc / len(iterator)

In [80]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0.0
  epoch_acc = 0.0
  
  model.eval()
  predictions_list = torch.tensor([], dtype = torch.float, device = 'cuda')
  labels_list = torch.tensor([], dtype = torch.float, device = 'cuda')

  with torch.no_grad():
    for text, labels in iterator:
      text, labels = text.to(device), labels.to(device)
      predictions = model(text).T
      loss = criterion(predictions, labels)
      predictions_list = torch.cat([predictions_list, predictions.T], dim = 0)
      labels_list = torch.cat([labels_list, labels.T], dim = 0)

      acc = accuracy(predictions, labels)

      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator), predictions_list.to('cpu'), labels_list.to('cpu')

In [81]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [82]:
train_model = True

if train_model:
  N_EPOCHS = 200

  best_val_acc = 0.0

  for epoch in range(N_EPOCHS):
    
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc, _, _ = evaluate(model, val_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if val_acc > best_val_acc:
      best_val_acc = val_acc
      torch.save(model.state_dict(), '/content/drive/MyDrive/DL/Big_Data/Final_Project/models/W2V_LSTM_model_256.pt')


    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Test Loss: {val_loss:.3f} |  Test Acc: {val_acc:.2f}%')

Epoch: 01 | Epoch Time: 0m 50s
	Train Loss: 62.593 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 02 | Epoch Time: 0m 50s
	Train Loss: 62.595 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 03 | Epoch Time: 0m 49s
	Train Loss: 62.594 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 04 | Epoch Time: 0m 49s
	Train Loss: 62.595 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 05 | Epoch Time: 0m 50s
	Train Loss: 62.595 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 06 | Epoch Time: 0m 49s
	Train Loss: 62.595 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 07 | Epoch Time: 0m 49s
	Train Loss: 62.594 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 08 | Epoch Time: 0m 49s
	Train Loss: 62.593 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
Epoch: 09 | Epoch Time: 0m 50s
	Train Loss: 62.593 | Train Acc: 18.06%
	 Test Loss: 62.534 |  Test Acc: 18.05%
E

KeyboardInterrupt: ignored

In [83]:
path = '/content/drive/MyDrive/DL/Big_Data/Final_Project/models'

In [84]:
model.load_state_dict(torch.load(path + '/W2V_LSTM_model_256.pt', map_location = torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

<All keys matched successfully>

In [85]:
val_loss, val_acc, predictions, labels = evaluate(model, val_iter, criterion)
print(f'Test Loss = {val_loss:.3f}, Test Accuracy = {val_acc:.3f}')

Test Loss = 62.534, Test Accuracy = 18.051


In [86]:
results = {
    'preds' : predictions.numpy().squeeze(),
    'true' : labels.numpy().squeeze()
}

In [87]:
w2v_results_lstm_512 = pd.DataFrame(results)
w2v_results_lstm_512

Unnamed: 0,preds,true
0,1.0,0.0
1,1.0,1.0
2,1.0,1.0
3,1.0,0.0
4,1.0,1.0
...,...,...
9994,1.0,0.0
9995,1.0,0.0
9996,1.0,1.0
9997,1.0,1.0


In [88]:
w2v_results_lstm_512.to_csv('w2v_results_lstm_512.csv')