In [1]:
import math

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# if this fails, open terminal (on Linux, Mac) or Anaconda Prompt (windows)
# and run:
# conda install -c huggingface transformers
from transformers import AutoModel, AutoTokenizer

In [2]:
tweets_df = pd.read_csv("data/tweets.csv")
tweets_df

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.196330e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.191010e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.190140e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.190120e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.186890e+17,en,,,17620,4655
...,...,...,...,...,...,...,...,...,...,...
52537,ddlovato,Life couldn't be better right now. 😊,,06/01/2015 23:10,5.526030e+17,en,,,32799,23796
52538,ddlovato,First Monday back in action. I'd say 21.6 mile...,,06/01/2015 02:17,5.522880e+17,en,,,21709,12511
52539,ddlovato,"Crime shows, buddy, snuggles = the perfect Sun...",,05/01/2015 03:42,5.519470e+17,en,,,25269,15583
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,,05/01/2015 00:06,5.518920e+17,und,,,15985,10456


In [3]:
unique_people = tweets_df['author'].unique()
print(unique_people)
NUM_CLASSES = len(unique_people)

# assign each person a number
id_to_person = {i: unique_people[i] for i in range(len(unique_people))}
person_to_id = {v:k for k,v in id_to_person.items()}

# create a column of author ids
tweets_df['author_id'] = tweets_df['author'].apply(lambda x: person_to_id[x])

['katyperry' 'justinbieber' 'taylorswift13' 'BarackObama' 'rihanna'
 'YouTube' 'ladygaga' 'TheEllenShow' 'Twitter' 'jtimberlake'
 'KimKardashian' 'britneyspears' 'Cristiano' 'selenagomez' 'cnnbrk'
 'jimmyfallon' 'ArianaGrande' 'shakira' 'instagram' 'ddlovato']


In [4]:
tweets_df

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares,author_id
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.196330e+17,en,,,7900,3472,0
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.191010e+17,en,,,3689,1380,0
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.190140e+17,en,,,10341,2387,0
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.190120e+17,en,,,10774,2458,0
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.186890e+17,en,,,17620,4655,0
...,...,...,...,...,...,...,...,...,...,...,...
52537,ddlovato,Life couldn't be better right now. 😊,,06/01/2015 23:10,5.526030e+17,en,,,32799,23796,19
52538,ddlovato,First Monday back in action. I'd say 21.6 mile...,,06/01/2015 02:17,5.522880e+17,en,,,21709,12511,19
52539,ddlovato,"Crime shows, buddy, snuggles = the perfect Sun...",,05/01/2015 03:42,5.519470e+17,en,,,25269,15583,19
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,,05/01/2015 00:06,5.518920e+17,und,,,15985,10456,19


In [5]:
class TweetDataset(Dataset):
    def __init__(self, df):
        print('making tokenizer...')
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
        print('making bert...')
        self.bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
        
        print('tokenizing inputs...')
        self.inputs = self.tokenizer(list(df['content']), return_tensors="pt", padding=True, truncation=True)
        
        # Run BERT forward pass to get embeddings
        print('running BERT to get embeddings...')
        with torch.no_grad():
            features = self.bertweet(self.inputs.input_ids)
        
        self.embeddings = features.pooler_output
        self.labels = torch.tensor(df['author_id'].values).long()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        x = self.embeddings[idx]
        y = self.labels[idx]
        
        return x, y

In [6]:
# shuffle the df
np.random.seed(7)
df_full = tweets_df.sample(frac=1.0)

# only do 1000/1000 for the sake of time
n_train = 1000
n_test = 1000

df_train = df_full[:n_train]
df_test = df_full[n_train:n_train+n_test]

ds_train = TweetDataset(df_train)
ds_test = TweetDataset(df_test)

making tokenizer...
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
making bert...
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizing inputs...
running BERT to get embeddings...
making tokenizer...
Special tokens have been added in the 

In [7]:
torch.manual_seed(7)

mlp = torch.nn.Sequential(
    torch.nn.Linear(768, 100),
    torch.nn.ReLU(),
    torch.nn.Linear(100, 100),
    torch.nn.ReLU(),
    torch.nn.Linear(100, NUM_CLASSES),
    torch.nn.Softmax(dim=1)
)

In [8]:
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)
loss_f = torch.nn.CrossEntropyLoss()

In [9]:
def get_accuracy(ds):
    num_correct = 0

    batch_start = 0
    batch_end = batch_start + batch_size
    print_counter = print_frequency
    while batch_start < len(ds):
        # Grab the batch from the dataset
        x, y = ds[batch_start:batch_end]

        # Forward pass
        with torch.no_grad():
            y_pred = mlp(x)

        # Gets the number of correct predictions
        num_correct += (torch.argmax(y_pred, dim=1) == y).numpy().sum()

        # Update batch parameters
        batch_start = batch_end
        batch_end = batch_start + batch_size
    
    return num_correct / len(ds)

In [10]:
# Randomly shuffle the indices
idxs = np.arange(len(ds_train))
np.random.shuffle(idxs)

In [11]:
mlp.train()
epochs = 100
batch_size = 64

# print every these number of batches
print_frequency = 5
print_counter = print_frequency

# Train loop
for epoch in range(epochs):
    batch_start = 0
    batch_end = batch_start + batch_size
    while batch_start < len(ds_train):
        # Grab the batch from the dataset
        idxs_batch = idxs[batch_start:batch_end]
        x, y = ds_train[idxs_batch]
               
        # Forward pass
        optimizer.zero_grad()
        y_pred = mlp(x)

        # Compute Loss
        ### TODO: your code here. Hint: how do we use loss_f? See Pytorch
        # documentation for examples.
        loss = loss_f(y_pred, y)
        
        print_counter -= 1
        if print_counter <= 0:
            print_counter = print_frequency
            
            # Figure out which batch we are on
            batch_num = batch_start // batch_size
            print(f'Epoch {epoch}, Batch {batch_num}. Train loss: {loss.item()}')
        
        # Backward pass
        ### TODO: your code here. Hint: we need to do backpropogation and then
        # update parameters using the optimizer. Look up some examples of how to
        # to do this or go to Pytorch documentation.
        loss.backward()
        optimizer.step()

        
        # Update batch parameters
        batch_start = batch_end
        batch_end = batch_start + batch_size
    
    print('\n----')
    
    train_acc = get_accuracy(ds_train)
    print(f'Train Acc on Epoch {epoch}:', train_acc)
    
    test_acc = get_accuracy(ds_test)
    print(f'Test Acc on Epoch {epoch}:', test_acc)
    
    print('----')

069458
Epoch 12, Batch 7. Train loss: 2.9848451614379883
Epoch 12, Batch 12. Train loss: 2.9523069858551025

----
Train Acc on Epoch 12: 0.09
Test Acc on Epoch 12: 0.087
----
Epoch 13, Batch 1. Train loss: 2.996501922607422
Epoch 13, Batch 6. Train loss: 2.978071928024292
Epoch 13, Batch 11. Train loss: 2.987680435180664

----
Train Acc on Epoch 13: 0.11
Test Acc on Epoch 13: 0.107
----
Epoch 14, Batch 0. Train loss: 2.9788246154785156
Epoch 14, Batch 5. Train loss: 2.964968204498291
Epoch 14, Batch 10. Train loss: 2.996135950088501
Epoch 14, Batch 15. Train loss: 2.9613277912139893

----
Train Acc on Epoch 14: 0.118
Test Acc on Epoch 14: 0.112
----
Epoch 15, Batch 4. Train loss: 2.9641940593719482
Epoch 15, Batch 9. Train loss: 2.9627768993377686
Epoch 15, Batch 14. Train loss: 2.967087507247925

----
Train Acc on Epoch 15: 0.121
Test Acc on Epoch 15: 0.113
----
Epoch 16, Batch 3. Train loss: 2.950817584991455
Epoch 16, Batch 8. Train loss: 2.9121253490448
Epoch 16, Batch 13. Train lo