In [1]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
%cd /content/drive/My\ Drive/HW4

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1Q8LLNhf-oeUoj4rqN2e_tcbzZFBsvUdE/HW4


In [2]:
# Setup for GLoVE embeddings (only need to run once)
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip

# GLoVE Stuff

In [11]:
import numpy as np
embeddings_dict={}
with open("glove.6B.50d.txt", 'r') as f:
  for line in f:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      embeddings_dict[word] = vector

In [3]:
def get_embedding_features(raw_data, is_for_generation=False):
  """
  raw_data: List of tuples (words, label) corresponding to tweets and their corresponding hashtags
  is_for_generation: If using features for generation,
  return the sequence of word vectors, otherwise average over that dimension
  """
  featurized_data = []
  if not is_for_generation:
    vec = np.zeros((1,50))
  else:
    vec = []
  for row in raw_data:
    count = 0
    for word in row[0]:
      if not is_for_generation:
        if word in embeddings_dict:
          vec += embeddings_dict[word]
          count = count+1
      else:
        if word in embeddings_dict:
          vec.append(embeddings_dict[word])
    
    if not is_for_generation and count > 0:
      featurized_data.append([vec / count, row[1]])
    elif count > 0:
      featurized_data.append(np.stack(vec, axis=0))
  return featurized_data

# Hashtag Classification

In [None]:
import os
import json
import re

hashtag_to_label = {
    'superbowl': 0,
    'sb49': 1,
    'patriots': 2,
    'nfl': 3,
    'gopatriots': 4,
    'gohawks': 5
}

datadir = "data"
raw_data = []
for fname in os.listdir(datadir):
  if not fname.endswith('.txt'):
    continue
  
  pound_idx = fname.index('#')
  hashtag = fname[pound_idx+1:-4]
  label_for_file = hashtag_to_label[hashtag]

  with open(os.path.join(datadir, fname)) as json_file:
    for line in json_file.readlines():
      data_pt = json.loads(line)
      
      tweet = data_pt['tweet']['text']
      words = tweet.split()
      words = [x for x in words if x[0] != '#' and not x.startswith('http')] # remove hashtags and urls from tweet
      words = [re.sub(r'[^\w\s]', '', x) for x in words] # Remove punctuation
      
      raw_data.append([words, label_for_file])

In [None]:
classification_feats = get_embedding_features(raw_data, is_for_generation=False)
classification_labels = np.array([x[1] for x in classification_feats])
classification_feats = np.concatenate([x[0] for x in classification_feats], axis=0)
print(classification_feats.shape, classification_labels.shape)
# import numpy as np
# classification_feats = np.load("features.npy")
# classification_labels = np.load("labels.npy")

(2495446, 50) (2495446,)


In [None]:
np.save('features.npy', classification_feats)
np.save('labels.npy', classification_labels)

In [None]:
import torch.nn as nn

class MLP(nn.Module):
  def __init__(self, num_hidden_layers, hidden_width, input_width, output_width):
    super().__init__()
    self.input_layer = nn.Linear(input_width, hidden_width)
    self.relu = nn.ReLU()

    hidden_layer_list = []
    for i in range(num_hidden_layers):
      hidden_layer_list.append(nn.Linear(hidden_width, hidden_width))
      hidden_layer_list.append(nn.ReLU())
    self.hidden_layers = nn.Sequential(*hidden_layer_list)

    self.output_layer = nn.Linear(hidden_width, output_width)
  
  def forward(self, x):
    x = self.input_layer(x)
    x = self.relu(x)
    x = self.hidden_layers(x)
    return self.output_layer(x)

In [None]:
from sklearn.model_selection import train_test_split
import torch

model = MLP(num_hidden_layers=6, hidden_width=128, input_width=50, output_width=6) # 6 output classes corresponding to each hashtag
x_train, x_test, y_train, y_test = train_test_split(classification_feats, classification_labels, test_size=0.1)

x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train)
x_test = torch.Tensor(x_test)
y_test = torch.Tensor(y_test)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
batch_size = 128
num_epochs = 1

model.cuda() # moves the model to the GPU

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

torch.Size([2245901, 50]) torch.Size([249545, 50]) torch.Size([2245901]) torch.Size([249545])


In [None]:
from tqdm import tqdm
# Helper function to print validation loss/accuracy
def evaluate(model, x_test, y_test, epoch):
  model.eval()
  with torch.no_grad():
    num_correct = 0.0
    for i in range(0, x_test.shape[0], batch_size):
      if i+batch_size > x_test.shape[0]:
        i_end = x_test.shape[0]
      else:
        i_end = i + batch_size

      model_in = x_test[i:i_end].cuda()
      labels = y_test[i:i_end].cuda()

      preds = model(model_in)
      num_correct += torch.sum(torch.argmax(preds, dim=1) == labels)
  print(f"Epoch: {epoch}, Validation Accuracy: {num_correct / x_test.shape[0]}")
  model.train()

evaluate(model, x_test, y_test, -1)
# Main training loop
for epoch in range(num_epochs):
  avg_loss = 0.0
  count = 0
  for i in tqdm(range(0, x_train.shape[0], batch_size)):
    if i+batch_size > x_train.shape[0]:
      i_end = x_train.shape[0]
    else:
      i_end = i + batch_size
  
    model_in = x_train[i:i_end].cuda() # Gets one batch of training data and GT labels onto the GPU
    labels = y_train[i:i_end].type('torch.cuda.LongTensor')

    # Forward pass through the model
    model_out = model(model_in)

    # Backwards pass to calculate gradients and update parameters
    loss = loss_fn(model_out, labels)
    avg_loss += loss.item()
    count += 1
    # print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  
  print(f"Average Training Loss for Epoch {epoch}: {avg_loss / count}")
  evaluate(model, x_test, y_test, epoch)


Epoch: -1, Validation Accuracy: 0.42275741696357727


100%|██████████| 17547/17547 [01:17<00:00, 226.05it/s]


Average Training Loss for Epoch 0: 2.8627502528611206
Epoch: 0, Validation Accuracy: 0.931715726852417


100%|██████████| 17547/17547 [01:20<00:00, 218.27it/s]


Average Training Loss for Epoch 1: 0.3438004992374492
Epoch: 1, Validation Accuracy: 0.9001462459564209


 60%|█████▉    | 10446/17547 [00:46<00:31, 224.08it/s]


KeyboardInterrupt: ignored

# Tweet Generation

In [5]:
import os
import json
import re

datadir = "data"
fname = "tweets_#gopatriots.txt" # Change this to load data for a different hashtag
raw_data = []
full_tweets = []
with open(os.path.join(datadir, fname)) as json_file:
  for line in json_file.readlines():
    data_pt = json.loads(line)
      
    tweet = data_pt['tweet']['text']
    words = tweet.split()
    words = [x.lower() for x in words if x[0] != '#' and not x.startswith('http')] # remove hashtags and urls from tweet
    words = [re.sub(r'[^\w\s]', '', x) for x in words] # Remove punctuation
    words = [x for x in words if len(x) > 0]
    if len(words) < 2:
      continue

    full_tweets.append(words)
      
    for i in range(len(words)-1):
      raw_data.append((words[:i+1], words[i+1]))

In [6]:
print(len(raw_data))
print(len(full_tweets))

128957
18731


In [7]:
!pip install pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from torchnlp.encoders.text import StaticTokenizerEncoder

# Creates the vocabulary over all the tweets
encoder = StaticTokenizerEncoder(full_tweets, tokenize=lambda x:x, min_occurrences=10)
print(len(encoder.vocab))

# Tokenizes each tweet in the dataset
processed_data = []
for tweet in full_tweets:
  tokenized = encoder.encode(tweet)

  input = tokenized[:-1]
  label = tokenized[-1]

  processed_data.append((input, label))

# Caluclates max sequence length for use in padding during training
sequence_length = max([x[0].shape[0] for x in processed_data])
print(sequence_length)

1554
27


In [37]:
import torch.nn as nn
import torch
class Generator(nn.Module):
  def __init__(self, vocab_size, hidden_dim=128, num_lstm_layers=3):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
    self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, num_layers=num_lstm_layers, batch_first=True)
    self.output_layer = nn.Linear(hidden_dim, vocab_size)

  
  def forward(self, x, in_state=None):
    x = self.embedding(x)
    x, state = self.lstm(x)
    return self.output_layer(x), state

def evaluate(model, epoch, test_data, batch_size):
  model.eval()
  num_correct = 0
  for i in tqdm(range(0, len(test_data), batch_size)):
    if i+batch_size > len(test_data):
      i_end = len(test_data)
    else:
      i_end = i+batch_size

    batch = test[i:i_end]

    padded = [pad_tensor(x[0], length=sequence_length) for x in batch] # Necessary to pad each sequence so they are all the same length
    model_in = torch.stack(padded, dim=0).cuda()
    labels = torch.stack([x[1] for x in batch], dim=0).type('torch.cuda.LongTensor')

    with torch.no_grad():
      preds, _ = model(model_in)

    num_correct += torch.sum(torch.argmax(preds[:,-1,:], dim=-1) == labels)
  print(f"Validation Accuracy for epoch {epoch}: {num_correct / len(test_data)}")
  model.train()

In [41]:
from torchnlp.encoders.text import pad_tensor
from tqdm import tqdm
from sklearn.model_selection import train_test_split

model = Generator(len(encoder.vocab)).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
num_epochs = 50
batch_size = 64

train, test = train_test_split(processed_data, test_size=0.2)

state = (torch.zeros((3, batch_size, 128)).cuda(), torch.zeros((3, batch_size, 128)).cuda())
for epoch in range(num_epochs):
  average_loss = 0.0
  for i in tqdm(range(0, len(train), batch_size)):
    if i+batch_size > len(train):
      continue
    else:
      i_end = i+batch_size
    
    batch = train[i:i_end]

    padded = [pad_tensor(x[0], length=sequence_length) for x in batch] # Necessary to pad each sequence so they are all the same length
    model_in = torch.stack(padded, dim=0).cuda()
    labels = torch.stack([x[1] for x in batch], dim=0).type('torch.cuda.LongTensor')

    preds, state = model(model_in)

    # state = (state[0].detach(), state[1].detach())

    loss = loss_fn(preds[:,-1,:], labels)
    average_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  
  print(f"Average Loss for epoch {epoch}: {average_loss / len(train)}")
  evaluate(model, epoch, test, batch_size)

100%|██████████| 235/235 [00:01<00:00, 153.66it/s]


Average Loss for epoch 0: 0.07833696183392025


100%|██████████| 59/59 [00:00<00:00, 505.45it/s]


Validation Accuracy for epoch 0: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.41it/s]


Average Loss for epoch 1: 0.07437831398897227


100%|██████████| 59/59 [00:00<00:00, 524.71it/s]


Validation Accuracy for epoch 1: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 171.97it/s]


Average Loss for epoch 2: 0.07428703222819591


100%|██████████| 59/59 [00:00<00:00, 516.80it/s]


Validation Accuracy for epoch 2: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 169.99it/s]


Average Loss for epoch 3: 0.074235994947574


100%|██████████| 59/59 [00:00<00:00, 516.14it/s]


Validation Accuracy for epoch 3: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 171.26it/s]


Average Loss for epoch 4: 0.07419151027954904


100%|██████████| 59/59 [00:00<00:00, 505.16it/s]


Validation Accuracy for epoch 4: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.02it/s]


Average Loss for epoch 5: 0.07415125579103327


100%|██████████| 59/59 [00:00<00:00, 512.63it/s]


Validation Accuracy for epoch 5: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 174.08it/s]


Average Loss for epoch 6: 0.07411416738159314


100%|██████████| 59/59 [00:00<00:00, 497.27it/s]


Validation Accuracy for epoch 6: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 163.62it/s]


Average Loss for epoch 7: 0.07408186199826731


100%|██████████| 59/59 [00:00<00:00, 504.95it/s]


Validation Accuracy for epoch 7: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 156.14it/s]


Average Loss for epoch 8: 0.07405134647273151


100%|██████████| 59/59 [00:00<00:00, 504.36it/s]


Validation Accuracy for epoch 8: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 171.12it/s]


Average Loss for epoch 9: 0.07402734852257223


100%|██████████| 59/59 [00:00<00:00, 518.78it/s]


Validation Accuracy for epoch 9: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 168.80it/s]


Average Loss for epoch 10: 0.07400506123844723


100%|██████████| 59/59 [00:00<00:00, 509.63it/s]


Validation Accuracy for epoch 10: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 166.53it/s]


Average Loss for epoch 11: 0.07399377686673157


100%|██████████| 59/59 [00:00<00:00, 509.06it/s]


Validation Accuracy for epoch 11: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.21it/s]


Average Loss for epoch 12: 0.07397384418883683


100%|██████████| 59/59 [00:00<00:00, 509.17it/s]


Validation Accuracy for epoch 12: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.26it/s]


Average Loss for epoch 13: 0.07395691422619288


100%|██████████| 59/59 [00:00<00:00, 498.80it/s]


Validation Accuracy for epoch 13: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.13it/s]


Average Loss for epoch 14: 0.07394137361185855


100%|██████████| 59/59 [00:00<00:00, 504.89it/s]


Validation Accuracy for epoch 14: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 165.67it/s]


Average Loss for epoch 15: 0.07398199145639892


100%|██████████| 59/59 [00:00<00:00, 492.85it/s]


Validation Accuracy for epoch 15: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 161.24it/s]


Average Loss for epoch 16: 0.07392837211402165


100%|██████████| 59/59 [00:00<00:00, 469.86it/s]


Validation Accuracy for epoch 16: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 171.23it/s]


Average Loss for epoch 17: 0.07390731671756433


100%|██████████| 59/59 [00:00<00:00, 500.25it/s]


Validation Accuracy for epoch 17: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.43it/s]


Average Loss for epoch 18: 0.0738880855637442


100%|██████████| 59/59 [00:00<00:00, 483.97it/s]


Validation Accuracy for epoch 18: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 169.27it/s]


Average Loss for epoch 19: 0.07379213327147142


100%|██████████| 59/59 [00:00<00:00, 514.19it/s]


Validation Accuracy for epoch 19: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.03it/s]


Average Loss for epoch 20: 0.07372090613466498


100%|██████████| 59/59 [00:00<00:00, 504.97it/s]


Validation Accuracy for epoch 20: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.96it/s]


Average Loss for epoch 21: 0.0735402690237751


100%|██████████| 59/59 [00:00<00:00, 513.75it/s]


Validation Accuracy for epoch 21: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.94it/s]


Average Loss for epoch 22: 0.07332855359602916


100%|██████████| 59/59 [00:00<00:00, 498.25it/s]


Validation Accuracy for epoch 22: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 167.39it/s]


Average Loss for epoch 23: 0.07321345903182246


100%|██████████| 59/59 [00:00<00:00, 514.12it/s]


Validation Accuracy for epoch 23: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 163.30it/s]


Average Loss for epoch 24: 0.07298334072569253


100%|██████████| 59/59 [00:00<00:00, 504.77it/s]


Validation Accuracy for epoch 24: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.38it/s]


Average Loss for epoch 25: 0.07348856918645046


100%|██████████| 59/59 [00:00<00:00, 517.07it/s]


Validation Accuracy for epoch 25: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.45it/s]


Average Loss for epoch 26: 0.07297960956403424


100%|██████████| 59/59 [00:00<00:00, 506.64it/s]


Validation Accuracy for epoch 26: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.26it/s]


Average Loss for epoch 27: 0.07388626485346984


100%|██████████| 59/59 [00:00<00:00, 514.13it/s]


Validation Accuracy for epoch 27: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 171.87it/s]


Average Loss for epoch 28: 0.07354975617065572


100%|██████████| 59/59 [00:00<00:00, 507.75it/s]


Validation Accuracy for epoch 28: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 173.13it/s]


Average Loss for epoch 29: 0.07323184112972966


100%|██████████| 59/59 [00:00<00:00, 510.84it/s]


Validation Accuracy for epoch 29: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 167.91it/s]


Average Loss for epoch 30: 0.07290890612388383


100%|██████████| 59/59 [00:00<00:00, 496.52it/s]


Validation Accuracy for epoch 30: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 167.88it/s]


Average Loss for epoch 31: 0.07272735998647072


100%|██████████| 59/59 [00:00<00:00, 480.01it/s]


Validation Accuracy for epoch 31: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 158.24it/s]


Average Loss for epoch 32: 0.07212962147327776


100%|██████████| 59/59 [00:00<00:00, 493.05it/s]


Validation Accuracy for epoch 32: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 163.59it/s]


Average Loss for epoch 33: 0.07115629052835802


100%|██████████| 59/59 [00:00<00:00, 500.51it/s]


Validation Accuracy for epoch 33: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.73it/s]


Average Loss for epoch 34: 0.07148239586166183


100%|██████████| 59/59 [00:00<00:00, 497.41it/s]


Validation Accuracy for epoch 34: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 170.88it/s]


Average Loss for epoch 35: 0.07062277826343828


100%|██████████| 59/59 [00:00<00:00, 503.53it/s]


Validation Accuracy for epoch 35: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 173.05it/s]


Average Loss for epoch 36: 0.06970109830740233


100%|██████████| 59/59 [00:00<00:00, 503.35it/s]


Validation Accuracy for epoch 36: 0.30931413173675537


100%|██████████| 235/235 [00:01<00:00, 172.06it/s]


Average Loss for epoch 37: 0.06837732820223183


100%|██████████| 59/59 [00:00<00:00, 502.78it/s]


Validation Accuracy for epoch 37: 0.3186549246311188


100%|██████████| 235/235 [00:01<00:00, 171.70it/s]


Average Loss for epoch 38: 0.06699829238720012


100%|██████████| 59/59 [00:00<00:00, 517.78it/s]


Validation Accuracy for epoch 38: 0.32879638671875


100%|██████████| 235/235 [00:01<00:00, 171.55it/s]


Average Loss for epoch 39: 0.06642461975054886


100%|██████████| 59/59 [00:00<00:00, 473.99it/s]


Validation Accuracy for epoch 39: 0.32959702610969543


100%|██████████| 235/235 [00:01<00:00, 159.52it/s]


Average Loss for epoch 40: 0.0654691589336884


100%|██████████| 59/59 [00:00<00:00, 512.08it/s]


Validation Accuracy for epoch 40: 0.3317320644855499


100%|██████████| 235/235 [00:01<00:00, 164.03it/s]


Average Loss for epoch 41: 0.06453911741341235


100%|██████████| 59/59 [00:00<00:00, 503.59it/s]


Validation Accuracy for epoch 41: 0.33947157859802246


100%|██████████| 235/235 [00:01<00:00, 172.30it/s]


Average Loss for epoch 42: 0.06496990565558645


100%|██████████| 59/59 [00:00<00:00, 493.94it/s]


Validation Accuracy for epoch 42: 0.33733654022216797


100%|██████████| 235/235 [00:01<00:00, 168.76it/s]


Average Loss for epoch 43: 0.06399245911018771


100%|██████████| 59/59 [00:00<00:00, 493.18it/s]


Validation Accuracy for epoch 43: 0.3400053381919861


100%|██████████| 235/235 [00:01<00:00, 171.29it/s]


Average Loss for epoch 44: 0.06362957989093905


100%|██████████| 59/59 [00:00<00:00, 508.47it/s]


Validation Accuracy for epoch 44: 0.3405390977859497


100%|██████████| 235/235 [00:01<00:00, 172.87it/s]


Average Loss for epoch 45: 0.06362903512739843


100%|██████████| 59/59 [00:00<00:00, 505.03it/s]


Validation Accuracy for epoch 45: 0.3333333432674408


100%|██████████| 235/235 [00:01<00:00, 172.83it/s]


Average Loss for epoch 46: 0.06330429761408486


100%|██████████| 59/59 [00:00<00:00, 505.45it/s]


Validation Accuracy for epoch 46: 0.33013078570365906


100%|██████████| 235/235 [00:01<00:00, 173.47it/s]


Average Loss for epoch 47: 0.0627499277013798


100%|██████████| 59/59 [00:00<00:00, 496.50it/s]


Validation Accuracy for epoch 47: 0.3269282281398773


100%|██████████| 235/235 [00:01<00:00, 156.81it/s]


Average Loss for epoch 48: 0.06144263450945372


100%|██████████| 59/59 [00:00<00:00, 499.29it/s]


Validation Accuracy for epoch 48: 0.33039766550064087


100%|██████████| 235/235 [00:01<00:00, 164.82it/s]


Average Loss for epoch 49: 0.06061371379654737


100%|██████████| 59/59 [00:00<00:00, 501.48it/s]

Validation Accuracy for epoch 49: 0.333066463470459





# Generative MLP Baseline

In [4]:
import os
import json
import re

datadir = "data"
fname = "tweets_#gopatriots.txt" # Change this to load data for a different hashtag
raw_data = []
full_tweets = []
with open(os.path.join(datadir, fname)) as json_file:
  for line in json_file.readlines():
    data_pt = json.loads(line)
      
    tweet = data_pt['tweet']['text']
    words = tweet.split()
    words = [x.lower() for x in words if x[0] != '#' and not x.startswith('http')] # remove hashtags and urls from tweet
    words = [re.sub(r'[^\w\s]', '', x) for x in words] # Remove punctuation
    words = [x for x in words if len(x) > 0]
    if len(words) < 2:
      continue

    full_tweets.append(words)
      
    for i in range(len(words)-1):
      raw_data.append((words[:i+1], words[i+1]))
print(len(raw_data))
print(len(full_tweets))

128957
18731


In [5]:
!pip install pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [19]:
def get_glove_embedding(sentence):
  vec = np.zeros((1,50))
  count = 0
  for word in sentence:
    if word in embeddings_dict:
      vec += embeddings_dict[word]
      count = count+1
  return vec / count if count > 0 else None

In [20]:
from torchnlp.encoders.text import StaticTokenizerEncoder

# Creates the vocabulary over all the tweets
encoder = StaticTokenizerEncoder(full_tweets, tokenize=lambda x:x, min_occurrences=10)
print(len(encoder.vocab))

# Tokenizes each tweet in the dataset
processed_data = []
for tweet in full_tweets:
  tokenized = encoder.encode(tweet)

  input = get_glove_embedding(encoder.decode(tokenized[:-1]))
  label = tokenized[-1]

  if input is not None:
    processed_data.append((input, label))
print(len(processed_data))

1554
18731


In [23]:
import torch.nn as nn
from tqdm import tqdm

class GenerativeMLP(nn.Module):
  def __init__(self, num_hidden_layers, hidden_width, input_width, output_width):
    super().__init__()
    self.input_layer = nn.Linear(input_width, hidden_width)
    self.relu = nn.ReLU()

    hidden_layer_list = []
    for i in range(num_hidden_layers):
      hidden_layer_list.append(nn.Linear(hidden_width, hidden_width))
      hidden_layer_list.append(nn.ReLU())
    self.hidden_layers = nn.Sequential(*hidden_layer_list)

    self.output_layer = nn.Linear(hidden_width, output_width)
  
  def forward(self, x):
    x = self.input_layer(x)
    x = self.relu(x)
    x = self.hidden_layers(x)
    return self.output_layer(x)

# Helper function to print validation loss/accuracy
def evaluate(model, x_test, y_test, epoch):
  model.eval()
  with torch.no_grad():
    num_correct = 0.0
    for i in range(0, x_test.shape[0], batch_size):
      if i+batch_size > x_test.shape[0]:
        i_end = x_test.shape[0]
      else:
        i_end = i + batch_size

      model_in = x_test[i:i_end].cuda()
      labels = y_test[i:i_end].cuda()

      preds = model(model_in)
      num_correct += torch.sum(torch.argmax(preds, dim=1) == labels)
  print(f"Epoch: {epoch}, Validation Accuracy: {num_correct / x_test.shape[0]}")
  model.train()

In [31]:
from sklearn.model_selection import train_test_split
import torch

generation_feats = [x[0][0] for x in processed_data]
generation_labels = [x[1] for x in processed_data]

model = GenerativeMLP(3, 128, 50, len(encoder.vocab))
x_train, x_test, y_train, y_test = train_test_split(generation_feats, generation_labels, test_size=0.1)

x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train)
x_test = torch.Tensor(x_test)
y_test = torch.Tensor(y_test)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
batch_size = 128
num_epochs = 50

model.cuda() # moves the model to the GPU

evaluate(model, x_test, y_test, -1)
# Main training loop
for epoch in range(num_epochs):
  avg_loss = 0.0
  count = 0
  for i in tqdm(range(0, x_train.shape[0], batch_size)):
    if i+batch_size > x_train.shape[0]:
      i_end = x_train.shape[0]
    else:
      i_end = i + batch_size
  
    model_in = x_train[i:i_end].cuda() # Gets one batch of training data and GT labels onto the GPU
    labels = y_train[i:i_end].type('torch.cuda.LongTensor')

    # Forward pass through the model
    model_out = model(model_in)

    # Backwards pass to calculate gradients and update parameters
    loss = loss_fn(model_out, labels)
    avg_loss += loss.item()
    count += 1
    # print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  
  print(f"Average Training Loss for Epoch {epoch}: {avg_loss / count}")
  evaluate(model, x_test, y_test, epoch)


Epoch: -1, Validation Accuracy: 0.0


100%|██████████| 132/132 [00:00<00:00, 450.75it/s]


Average Training Loss for Epoch 0: 5.227243300640222
Epoch: 0, Validation Accuracy: 0.3100320100784302


100%|██████████| 132/132 [00:00<00:00, 384.26it/s]


Average Training Loss for Epoch 1: 4.747806390126546
Epoch: 1, Validation Accuracy: 0.3100320100784302


100%|██████████| 132/132 [00:00<00:00, 410.64it/s]


Average Training Loss for Epoch 2: 4.7389132037307276
Epoch: 2, Validation Accuracy: 0.3100320100784302


100%|██████████| 132/132 [00:00<00:00, 416.57it/s]


Average Training Loss for Epoch 3: 4.734461675990712
Epoch: 3, Validation Accuracy: 0.3100320100784302


100%|██████████| 132/132 [00:00<00:00, 386.47it/s]


Average Training Loss for Epoch 4: 4.726311615019133
Epoch: 4, Validation Accuracy: 0.3100320100784302


100%|██████████| 132/132 [00:00<00:00, 402.83it/s]


Average Training Loss for Epoch 5: 4.696763627456896
Epoch: 5, Validation Accuracy: 0.3143009543418884


100%|██████████| 132/132 [00:00<00:00, 397.15it/s]


Average Training Loss for Epoch 6: 4.6586937579241665
Epoch: 6, Validation Accuracy: 0.3143009543418884


100%|██████████| 132/132 [00:00<00:00, 374.55it/s]


Average Training Loss for Epoch 7: 4.62347070014838
Epoch: 7, Validation Accuracy: 0.312700092792511


100%|██████████| 132/132 [00:00<00:00, 450.62it/s]


Average Training Loss for Epoch 8: 4.586231430371602
Epoch: 8, Validation Accuracy: 0.31376734375953674


100%|██████████| 132/132 [00:00<00:00, 466.10it/s]


Average Training Loss for Epoch 9: 4.540412335684805
Epoch: 9, Validation Accuracy: 0.3143009543418884


100%|██████████| 132/132 [00:00<00:00, 458.60it/s]


Average Training Loss for Epoch 10: 4.497893593528054
Epoch: 10, Validation Accuracy: 0.31376734375953674


100%|██████████| 132/132 [00:00<00:00, 437.02it/s]


Average Training Loss for Epoch 11: 4.458480181116046
Epoch: 11, Validation Accuracy: 0.3153681755065918


100%|██████████| 132/132 [00:00<00:00, 452.35it/s]


Average Training Loss for Epoch 12: 4.423214238701445
Epoch: 12, Validation Accuracy: 0.31696903705596924


100%|██████████| 132/132 [00:00<00:00, 459.18it/s]


Average Training Loss for Epoch 13: 4.3832492936741225
Epoch: 13, Validation Accuracy: 0.31910350918769836


100%|██████████| 132/132 [00:00<00:00, 468.05it/s]


Average Training Loss for Epoch 14: 4.3488639268008145
Epoch: 14, Validation Accuracy: 0.3207043707370758


100%|██████████| 132/132 [00:00<00:00, 457.03it/s]


Average Training Loss for Epoch 15: 4.3178858937639175
Epoch: 15, Validation Accuracy: 0.31963711977005005


100%|██████████| 132/132 [00:00<00:00, 457.79it/s]


Average Training Loss for Epoch 16: 4.295739246137215
Epoch: 16, Validation Accuracy: 0.3153681755065918


100%|██████████| 132/132 [00:00<00:00, 450.45it/s]


Average Training Loss for Epoch 17: 4.2811344175627735
Epoch: 17, Validation Accuracy: 0.31643542647361755


100%|██████████| 132/132 [00:00<00:00, 406.07it/s]


Average Training Loss for Epoch 18: 4.262714866435889
Epoch: 18, Validation Accuracy: 0.31590181589126587


100%|██████████| 132/132 [00:00<00:00, 411.52it/s]


Average Training Loss for Epoch 19: 4.235261590191812
Epoch: 19, Validation Accuracy: 0.31590181589126587


100%|██████████| 132/132 [00:00<00:00, 427.79it/s]


Average Training Loss for Epoch 20: 4.211056814049229
Epoch: 20, Validation Accuracy: 0.31590181589126587


100%|██████████| 132/132 [00:00<00:00, 420.76it/s]


Average Training Loss for Epoch 21: 4.186172703901927
Epoch: 21, Validation Accuracy: 0.31590181589126587


100%|██████████| 132/132 [00:00<00:00, 430.65it/s]


Average Training Loss for Epoch 22: 4.16438325065555
Epoch: 22, Validation Accuracy: 0.31696903705596924


100%|██████████| 132/132 [00:00<00:00, 426.61it/s]


Average Training Loss for Epoch 23: 4.14253519520615
Epoch: 23, Validation Accuracy: 0.3185698986053467


100%|██████████| 132/132 [00:00<00:00, 417.31it/s]


Average Training Loss for Epoch 24: 4.120530932238608
Epoch: 24, Validation Accuracy: 0.31910350918769836


100%|██████████| 132/132 [00:00<00:00, 431.71it/s]


Average Training Loss for Epoch 25: 4.105358217701768
Epoch: 25, Validation Accuracy: 0.3185698986053467


100%|██████████| 132/132 [00:00<00:00, 416.42it/s]


Average Training Loss for Epoch 26: 4.086197807933345
Epoch: 26, Validation Accuracy: 0.3175026476383209


100%|██████████| 132/132 [00:00<00:00, 453.01it/s]


Average Training Loss for Epoch 27: 4.070441693970651
Epoch: 27, Validation Accuracy: 0.31696903705596924


100%|██████████| 132/132 [00:00<00:00, 457.11it/s]


Average Training Loss for Epoch 28: 4.051661746068434
Epoch: 28, Validation Accuracy: 0.31910350918769836


100%|██████████| 132/132 [00:00<00:00, 457.90it/s]


Average Training Loss for Epoch 29: 4.032623784108595
Epoch: 29, Validation Accuracy: 0.3217715919017792


100%|██████████| 132/132 [00:00<00:00, 457.18it/s]


Average Training Loss for Epoch 30: 4.0137948086767485
Epoch: 30, Validation Accuracy: 0.3212379813194275


100%|██████████| 132/132 [00:00<00:00, 465.59it/s]


Average Training Loss for Epoch 31: 4.001592621658787
Epoch: 31, Validation Accuracy: 0.3207043707370758


100%|██████████| 132/132 [00:00<00:00, 449.18it/s]


Average Training Loss for Epoch 32: 3.999217004487009
Epoch: 32, Validation Accuracy: 0.3217715919017792


100%|██████████| 132/132 [00:00<00:00, 461.30it/s]


Average Training Loss for Epoch 33: 4.002301653226216
Epoch: 33, Validation Accuracy: 0.3217715919017792


100%|██████████| 132/132 [00:00<00:00, 443.03it/s]


Average Training Loss for Epoch 34: 3.9946650104089216
Epoch: 34, Validation Accuracy: 0.31963711977005005


100%|██████████| 132/132 [00:00<00:00, 458.72it/s]


Average Training Loss for Epoch 35: 3.9794930400270405
Epoch: 35, Validation Accuracy: 0.3212379813194275


100%|██████████| 132/132 [00:00<00:00, 454.55it/s]


Average Training Loss for Epoch 36: 3.956934204607299
Epoch: 36, Validation Accuracy: 0.3212379813194275


100%|██████████| 132/132 [00:00<00:00, 460.55it/s]


Average Training Loss for Epoch 37: 3.919296895012711
Epoch: 37, Validation Accuracy: 0.3207043707370758


100%|██████████| 132/132 [00:00<00:00, 452.89it/s]


Average Training Loss for Epoch 38: 3.8992942174275718
Epoch: 38, Validation Accuracy: 0.3207043707370758


100%|██████████| 132/132 [00:00<00:00, 457.23it/s]


Average Training Loss for Epoch 39: 3.890816816777894
Epoch: 39, Validation Accuracy: 0.31963711977005005


100%|██████████| 132/132 [00:00<00:00, 465.64it/s]


Average Training Loss for Epoch 40: 3.87202477274519
Epoch: 40, Validation Accuracy: 0.3201707601547241


100%|██████████| 132/132 [00:00<00:00, 462.41it/s]


Average Training Loss for Epoch 41: 3.8506807973890593
Epoch: 41, Validation Accuracy: 0.3201707601547241


100%|██████████| 132/132 [00:00<00:00, 460.91it/s]


Average Training Loss for Epoch 42: 3.836533237587322
Epoch: 42, Validation Accuracy: 0.31963711977005005


100%|██████████| 132/132 [00:00<00:00, 466.27it/s]


Average Training Loss for Epoch 43: 3.8195129199461504
Epoch: 43, Validation Accuracy: 0.3201707601547241


100%|██████████| 132/132 [00:00<00:00, 467.78it/s]


Average Training Loss for Epoch 44: 3.802514860124299
Epoch: 44, Validation Accuracy: 0.3217715919017792


100%|██████████| 132/132 [00:00<00:00, 452.92it/s]


Average Training Loss for Epoch 45: 3.789186497529348
Epoch: 45, Validation Accuracy: 0.3217715919017792


100%|██████████| 132/132 [00:00<00:00, 459.89it/s]


Average Training Loss for Epoch 46: 3.768307716557474
Epoch: 46, Validation Accuracy: 0.3217715919017792


100%|██████████| 132/132 [00:00<00:00, 464.74it/s]


Average Training Loss for Epoch 47: 3.7463708249005405
Epoch: 47, Validation Accuracy: 0.32550692558288574


100%|██████████| 132/132 [00:00<00:00, 457.52it/s]


Average Training Loss for Epoch 48: 3.7301120974800805
Epoch: 48, Validation Accuracy: 0.3239060640335083


100%|██████████| 132/132 [00:00<00:00, 459.77it/s]

Average Training Loss for Epoch 49: 3.715768929683801
Epoch: 49, Validation Accuracy: 0.32230523228645325



