# Project 1 - Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np

### A. Loading data

The following code shows how to load the datasets for this project.  
Among which, we do not release the labels (the "stars" column) for the test set. 
You may evaluate your trained model on the validation set instead.
However, your submitted predictions (``pred.csv``) should be generated on the test set.

Each year we release different data, so old models are not guaranteed to solve the new data.

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment). 
        If you like, you are free to use columns other than "text" for prediction.
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

Then you can extract the data by specifying the desired split and columns

In [3]:
train_df = load_data('train', columns=['text', 'stars','cool','funny','useful'], folder='data')
valid_df = load_data('valid', columns=['text', 'stars','cool','funny','useful'], folder='data')
# the test set labels (the 'stars' column) are not available! So the following code will instead return all columns
test_df = load_data('test', columns=['text', 'stars'], folder='data')

select [text, stars, cool, funny, useful] columns from the train split
Success
select [text, stars, cool, funny, useful] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


## Test RobertA

pip install transformers

In [4]:
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import tqdm
from sklearn.metrics import confusion_matrix, classification_report

2022-03-29 20:23:55.811335: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/lib64
2022-03-29 20:23:55.811373: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

In [6]:
#tokenize dataset
def tokenize(text):
  return tokenizer(text, truncation=True,max_length=512,padding='max_length')

def tokenize_df(df):
  tokens = df['text'].map(tokenize)
  df['input_ids'] = [x['input_ids'] for x in tokens]
  df['attention_mask'] = [x['attention_mask'] for x in tokens]
  return df

In [7]:
train_df = tokenize_df(train_df)
valid_df = tokenize_df(valid_df)
test_df = tokenize_df(test_df)

## Hyperparameters

In [8]:
batch_size = 8
learning_rate = 1e-5
dropout_prob = 0.5

In [9]:
class CustomModel(nn.Module):
  def __init__(self,checkpoint,num_labels): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 

    #Load Model with given checkpoint and extract its body
    self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
    self.dropout = nn.Dropout(dropout_prob) 
    self.classifier = nn.Linear(771,num_labels) # load and initialize weights


  def forward(self, input_ids=None, attention_mask=None, reactions=None, labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

    #Concat sequence_output with reactions (,3)
    output_concat = torch.cat((sequence_output[:,0,:].view(-1,768), reactions),dim=1)

    logits = self.classifier(output_concat) 
    
    # calculate losses
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [10]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint=checkpoint,num_labels=5).to(device)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [11]:
class MyDataset(Dataset):
    def __init__(self, df):
        assert len(df['input_ids']) == len(df['stars'])
        self.input_ids = df['input_ids']
        self.attention_mask = df['attention_mask']
        self.label = df['stars']-1
        self.reactions = df[['cool','funny','useful']].values
    
    def __getitem__(self, idx):
        return np.asarray(self.input_ids[idx]), np.asarray(self.attention_mask[idx]), self.reactions[idx] ,self.label[idx]

    def __len__(self):
        return len(self.label)

In [12]:
train_dataloader = DataLoader(MyDataset(train_df), batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(MyDataset(valid_df), batch_size=batch_size)

In [13]:
from transformers import AdamW,get_scheduler

optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

11250




In [14]:
#next(iter(train_dataloader))

In [15]:
#torch.cuda.empty_cache()

In [16]:
for epoch in range(num_epochs):    
    print('epoch ', epoch)
    model.train()
    train_acc = 0
    train_loss = 0
    train_count = 0
    with tqdm.tqdm(train_dataloader) as t:
        for input_ids, attention_mask, reactions, labels in t:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            reactions = reactions.to(device)
            labels = labels.to(device)

            outputs = model(input_ids,attention_mask,reactions,labels)
            loss = outputs.loss
            loss.backward()

            train_acc += (torch.argmax(outputs.logits,dim=-1) == labels).sum().item()
            train_count += labels.size(0)
            train_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            t.set_postfix({'train_loss': train_loss/train_count, 'train_acc': train_acc/train_count})

    model.eval()
    y_pred = []
    y_true = []
    val_acc = 0
    val_count = 0
    val_loss = 0
    with tqdm.tqdm(valid_dataloader) as t:
        for input_ids, attention_mask, reactions, labels in t:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            reactions = reactions.to(device)
            labels = labels.to(device)
            with torch.no_grad():
                outputs = model(input_ids,attention_mask,reactions,labels)
                loss = outputs.loss
            val_acc += (torch.argmax(outputs.logits,dim=-1) == labels).sum().item()
            val_count += len(labels)
            val_loss += loss.item()
            y_pred += torch.argmax(outputs.logits,dim=-1).tolist()
            y_true += labels.tolist()

            t.set_postfix({'val_loss': val_loss/val_count, 'val_acc': val_acc/val_count})
    
    print(classification_report(y_true, y_pred))
    print("\n\n")
    print(confusion_matrix(y_true, y_pred))
    
    # torch.save(model.state_dict(), 'checkpoints/model_bs{}_lr{}_dop{}_e{}.pth'.format(batch_size, learning_rate, dropout_prob, epoch+3))
    # with open('logs/model_bs{}_lr{}_dop{}_e{}.txt'.format(batch_size, learning_rate, dropout_prob, epoch), 'w') as f:
    #     f.write(str(classification_report(y_true, y_pred)))
    #     f.write("\n\n")
    #     f.write(str(confusion_matrix(y_true, y_pred)))

epoch  0


100%|██████████| 2250/2250 [11:57<00:00,  3.14it/s, train_loss=0.097, train_acc=0.671] 
100%|██████████| 250/250 [00:26<00:00,  9.59it/s, val_loss=0.0841, val_acc=0.716]


              precision    recall  f1-score   support

           0       0.83      0.88      0.85       282
           1       0.53      0.40      0.46       136
           2       0.52      0.66      0.58       212
           3       0.58      0.44      0.50       466
           4       0.80      0.87      0.83       904

    accuracy                           0.72      2000
   macro avg       0.65      0.65      0.64      2000
weighted avg       0.71      0.72      0.71      2000




[[248  24  10   0   0]
 [ 36  54  44   2   0]
 [  8  17 139  42   6]
 [  4   3  67 203 189]
 [  4   3   9 101 787]]
epoch  1


100%|██████████| 2250/2250 [11:40<00:00,  3.21it/s, train_loss=0.0788, train_acc=0.733]
100%|██████████| 250/250 [00:26<00:00,  9.59it/s, val_loss=0.0834, val_acc=0.719]


              precision    recall  f1-score   support

           0       0.85      0.84      0.84       282
           1       0.46      0.39      0.42       136
           2       0.55      0.60      0.57       212
           3       0.59      0.50      0.54       466
           4       0.81      0.87      0.84       904

    accuracy                           0.72      2000
   macro avg       0.65      0.64      0.64      2000
weighted avg       0.71      0.72      0.71      2000




[[237  37   8   0   0]
 [ 33  53  48   2   0]
 [  3  22 128  54   5]
 [  3   1  45 235 182]
 [  3   2   5 110 784]]
epoch  2


100%|██████████| 2250/2250 [11:58<00:00,  3.13it/s, train_loss=0.0662, train_acc=0.782]
100%|██████████| 250/250 [00:26<00:00,  9.57it/s, val_loss=0.0862, val_acc=0.72] 


              precision    recall  f1-score   support

           0       0.87      0.79      0.83       282
           1       0.44      0.46      0.45       136
           2       0.54      0.62      0.58       212
           3       0.60      0.51      0.55       466
           4       0.82      0.87      0.84       904

    accuracy                           0.72      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.72      0.72      0.72      2000




[[224  51   7   0   0]
 [ 25  63  47   1   0]
 [  2  23 131  52   4]
 [  3   2  50 238 173]
 [  4   3   7 106 784]]
epoch  3


100%|██████████| 2250/2250 [12:02<00:00,  3.11it/s, train_loss=0.0549, train_acc=0.823]
100%|██████████| 250/250 [00:26<00:00,  9.60it/s, val_loss=0.0951, val_acc=0.72] 


              precision    recall  f1-score   support

           0       0.82      0.88      0.85       282
           1       0.49      0.38      0.43       136
           2       0.55      0.54      0.54       212
           3       0.58      0.53      0.56       466
           4       0.81      0.86      0.84       904

    accuracy                           0.72      2000
   macro avg       0.65      0.64      0.64      2000
weighted avg       0.71      0.72      0.71      2000




[[249  27   6   0   0]
 [ 42  52  41   1   0]
 [  6  24 114  63   5]
 [  4   1  42 247 172]
 [  4   3   6 113 778]]
epoch  4


100%|██████████| 2250/2250 [11:58<00:00,  3.13it/s, train_loss=0.046, train_acc=0.857] 
100%|██████████| 250/250 [00:26<00:00,  9.56it/s, val_loss=0.103, val_acc=0.721]

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       282
           1       0.47      0.43      0.45       136
           2       0.55      0.51      0.53       212
           3       0.58      0.53      0.56       466
           4       0.82      0.87      0.84       904

    accuracy                           0.72      2000
   macro avg       0.65      0.64      0.65      2000
weighted avg       0.71      0.72      0.72      2000




[[244  33   5   0   0]
 [ 39  59  36   2   0]
 [  3  31 108  66   4]
 [  4   0  41 248 173]
 [  4   3   5 109 783]]





In [18]:
torch.save(model.state_dict(), 'checkpoints/model_bs{}_lr{}_dop{}.pth'.format(batch_size, learning_rate, dropout_prob))

In [None]:
# model = torch.load('model1.pth')

### Model Validation Evaluation

In [19]:
model.eval()
y_pred = []
y_true = []
val_acc = 0
val_count = 0
val_loss = 0
with tqdm.tqdm(valid_dataloader) as t:
    for input_ids, attention_mask, reactions, labels in t:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        reactions = reactions.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids,attention_mask,reactions,labels)
            loss = outputs.loss
        val_acc += (torch.argmax(outputs.logits,dim=-1) == labels).sum().item()
        val_count += len(labels)
        val_loss += loss.item()
        y_pred += torch.argmax(outputs.logits,dim=-1).tolist()
        y_true += labels.tolist()

        t.set_postfix({'val_loss': val_loss/val_count, 'val_acc': val_acc/val_count})

100%|██████████| 250/250 [00:25<00:00,  9.88it/s, val_loss=0.103, val_acc=0.721]


In [None]:
len(y_pred)

2000

In [20]:
with open('logs/model_bs{}_lr{}_dop{}_outputcat.txt'.format(batch_size, learning_rate, dropout_prob), 'w') as f:
    f.write(str(classification_report(y_true, y_pred)))
    f.write("\n\n")
    f.write(str(confusion_matrix(y_true, y_pred)))

### B. Saving predictions to file

Your submitted predictions are supposed to be a .csv file containing two columns, i.e. (``review_id`` and ``stars``). 

Here, as an example, we generate some random predictions as our answer, which are put in a DataFrame and output to a .csv file

After getting your model predictions on the test set, you may follow these steps to generate your ``pred.csv`` file. (By replacing the random predictions with your model predictions)

In [21]:
class MyTestDataset(Dataset):
    def __init__(self, df):
        assert len(df['input_ids']) == len(df['attention_mask'])
        self.input_ids = df['input_ids']
        self.attention_mask = df['attention_mask']
        self.reactions = df[['cool','funny','useful']].values
    
    def __getitem__(self, idx):
        return np.asarray(self.input_ids[idx]), np.asarray(self.attention_mask[idx]), self.reactions[idx]

    def __len__(self):
        return len(self.input_ids)

In [22]:
batch_size = 8
test_dataloader = DataLoader(MyTestDataset(test_df), batch_size=batch_size)

In [23]:
model.eval()
y_pred = []
with tqdm.tqdm(test_dataloader) as t:
    for input_ids, attention_mask, reactions in t:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        reactions = reactions.to(device)
        with torch.no_grad():
            outputs = model(input_ids,attention_mask,reactions)
        y_pred += torch.argmax(outputs.logits,dim=-1).tolist()

100%|██████████| 500/500 [00:51<00:00,  9.75it/s]


In [24]:
test_df['stars'] = [x+1 for x in y_pred]

In [25]:
test_df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,text,useful,user_id,input_ids,attention_mask,stars
0,IKcZpSuELli7DUjU2fKGNg,1,2015-04-07 17:17:39,0,I77zZlSdCFAClxdjHwPcxw,OMG! I'm an avid spray tanner and have been al...,2,tUZtqzqE0bIOcLelcR4opg,"[0, 3765, 534, 328, 38, 437, 41, 20137, 11782,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
1,vbVJzKDhHlhMnKRpES5QzQ,1,2017-06-30 17:42:40,0,ioFNKarf29KGjRZdH0qC8Q,Sets the standard. Authentic. Outstanding. Cou...,1,Gwvrebru-kDM1N51aeJiFg,"[0, 104, 2580, 5, 2526, 4, 41808, 636, 4, 2548...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
2,GdPWJo3z4ySEXpF7Wkn3FA,0,2014-08-02 05:53:47,2,9429anmcYIcaEcMptJCNKQ,Came on 7/23/2014 with a group of 10 - service...,1,at7dS8gtLiEwd_4uHv231A,"[0, 347, 4344, 15, 262, 73, 1922, 73, 16310, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,BYc5IFQPq-PLVXnYjDp6vw,0,2015-02-20 19:31:21,0,PsUCdt7PKjzgBC0c7xXhJA,I love Bobs Subs! Tasty n made to order...yum!...,0,vaQxpV8IXqRmCIAHovP4NA,"[0, 100, 657, 3045, 29, 4052, 29, 328, 255, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
4,Wxxvi3LZbHNIDwJ-ZimtnA,0,2012-06-27 00:44:08,0,GQBlykKyShQcNeu2ivLdSA,This is my hotel of choice on the strip. I re...,0,dy_4NAZ0KR2bDoB9qAOMRg,"[0, 713, 16, 127, 2303, 9, 2031, 15, 5, 9572, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4


In [26]:
test_df.to_csv(f'test_df.csv',index=False)

In [27]:
pred_df = pd.DataFrame(data={
    'review_id': test_df['review_id'],
    'stars': test_df['stars']
})

In [28]:
pred_df.head()

Unnamed: 0,review_id,stars
0,I77zZlSdCFAClxdjHwPcxw,5
1,ioFNKarf29KGjRZdH0qC8Q,5
2,9429anmcYIcaEcMptJCNKQ,1
3,PsUCdt7PKjzgBC0c7xXhJA,5
4,GQBlykKyShQcNeu2ivLdSA,4


In [29]:
pred_df.to_csv(f'pred_1.csv', index=False)