# Project 1 - Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np

### A. Loading data

The following code shows how to load the datasets for this project.  
Among which, we do not release the labels (the "stars" column) for the test set. 
You may evaluate your trained model on the validation set instead.
However, your submitted predictions (``pred.csv``) should be generated on the test set.

Each year we release different data, so old models are not guaranteed to solve the new data.

In [2]:
def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.
        
        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment). 
        If you like, you are free to use columns other than "text" for prediction.
    '''
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        df = df.loc[:,columns]
        print("Success")
        return df
    except:
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

Then you can extract the data by specifying the desired split and columns

In [3]:
train_df = load_data('train', columns=['text', 'stars'], folder='data')
valid_df = load_data('valid', columns=['text', 'stars'], folder='data')
# the test set labels (the 'stars' column) are not available! So the following code will instead return all columns
test_df = load_data('test', columns=['text', 'stars'], folder='data')

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


## Test RobertA

pip install transformers

In [4]:
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import tqdm
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

In [6]:
#tokenize dataset
def tokenize(text):
  return tokenizer(text, truncation=True,max_length=512,padding='max_length')

def tokenize_df(df):
  tokens = df['text'].map(tokenize)
  df['input_ids'] = [x['input_ids'] for x in tokens]
  df['attention_mask'] = [x['attention_mask'] for x in tokens]
  return df

In [7]:
train_df = tokenize_df(train_df)
valid_df = tokenize_df(valid_df)
test_df = tokenize_df(test_df)

## Hyperparameters

In [8]:
batch_size = 16
learning_rate = 5e-5
dropout_prob = 0.5

In [13]:
class CustomModel(nn.Module):
  def __init__(self,checkpoint,num_labels): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 

    #Load Model with given checkpoint and extract its body
    self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
    self.dropout = nn.Dropout(dropout_prob) 
    self.classifier = nn.Linear(768,num_labels) # load and initialize weights

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

    logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses
    
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)


In [14]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint=checkpoint,num_labels=5).to(device)
# model=CustomModel(checkpoint="checkpoints/model_bs16_lr5e-05_dop0.5.pth",num_labels=5).to(device)
model.load_state_dict(torch.load("checkpoints/model_bs16_lr5e-05_dop0.5.pth"))
model.eval()


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

CustomModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [15]:
class MyDataset(Dataset):
    def __init__(self, df):
        assert len(df['input_ids']) == len(df['stars'])
        self.input_ids = df['input_ids']
        self.attention_mask = df['attention_mask']
        self.label = df['stars']-1
    
    def __getitem__(self, idx):
        return np.asarray(self.input_ids[idx]), np.asarray(self.attention_mask[idx]), self.label[idx]

    def __len__(self):
        return len(self.label)

In [16]:
train_dataloader = DataLoader(MyDataset(train_df), batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(MyDataset(valid_df), batch_size=batch_size)

In [17]:
from transformers import AdamW,get_scheduler

optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

5625




In [16]:
#next(iter(train_dataloader))

In [17]:
#torch.cuda.empty_cache()

In [18]:
for epoch in range(num_epochs):    
    print('epoch ', epoch+1)
    model.train()
    train_acc = 0
    train_loss = 0
    train_count = 0
    with tqdm.tqdm(train_dataloader) as t:
        for input_ids, attention_mask, labels in t:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids,attention_mask,labels)
            loss = outputs.loss
            loss.backward()

            train_acc += (torch.argmax(outputs.logits,dim=-1) == labels).sum().item()
            train_count += labels.size(0)
            train_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            t.set_postfix({'train_loss': train_loss/train_count, 'train_acc': train_acc/train_count})

    model.eval()
    y_pred = []
    y_true = []
    val_acc = 0
    val_count = 0
    val_loss = 0
    with tqdm.tqdm(valid_dataloader) as t:
        for input_ids, attention_mask, labels in t:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            with torch.no_grad():
                outputs = model(input_ids,attention_mask,labels)
                loss = outputs.loss
            val_acc += (torch.argmax(outputs.logits,dim=-1) == labels).sum().item()
            val_count += len(labels)
            val_loss += loss.item()
            y_pred += torch.argmax(outputs.logits,dim=-1).tolist()
            y_true += labels.tolist()

            t.set_postfix({'val_loss': val_loss/val_count, 'val_acc': val_acc/val_count})
    
    print(classification_report(y_true, y_pred))
    print("\n\n")
    print(confusion_matrix(y_true, y_pred))
    
    torch.save(model.state_dict(), 'checkpoints/model_2l_bs{}_lr{}_dop{}_e{}.pth'.format(batch_size, learning_rate, dropout_prob, epoch+1))
    with open('logs/model_2l_bs{}_lr{}_dop{}_e{}.txt'.format(batch_size, learning_rate, dropout_prob, epoch), 'w') as f:
        f.write(str(classification_report(y_true, y_pred)))
        f.write("\n\n")
        f.write(str(confusion_matrix(y_true, y_pred)))

epoch  1


100%|██████████| 1125/1125 [05:27<00:00,  3.43it/s, train_loss=0.0511, train_acc=0.66] 
100%|██████████| 125/125 [00:11<00:00, 10.85it/s, val_loss=0.0423, val_acc=0.715]


              precision    recall  f1-score   support

           0       0.78      0.91      0.84       282
           1       0.52      0.26      0.35       136
           2       0.56      0.46      0.50       212
           3       0.55      0.56      0.56       466
           4       0.81      0.87      0.84       904

    accuracy                           0.71      2000
   macro avg       0.65      0.61      0.62      2000
weighted avg       0.70      0.71      0.70      2000




[[256  15   9   0   2]
 [ 52  36  41   6   1]
 [  9  16  97  86   4]
 [  6   1  26 259 174]
 [  4   1   1 116 782]]
epoch  2


100%|██████████| 1125/1125 [05:37<00:00,  3.33it/s, train_loss=0.0393, train_acc=0.735]
100%|██████████| 125/125 [00:11<00:00, 10.90it/s, val_loss=0.0449, val_acc=0.698]


              precision    recall  f1-score   support

           0       0.78      0.90      0.83       282
           1       0.37      0.50      0.42       136
           2       0.61      0.29      0.39       212
           3       0.61      0.37      0.46       466
           4       0.76      0.93      0.84       904

    accuracy                           0.70      2000
   macro avg       0.62      0.60      0.59      2000
weighted avg       0.69      0.70      0.67      2000




[[254  21   3   1   3]
 [ 56  68   7   4   1]
 [  6  83  62  48  13]
 [  5  12  29 174 246]
 [  6   1   1  58 838]]
epoch  3


100%|██████████| 1125/1125 [05:43<00:00,  3.28it/s, train_loss=0.0302, train_acc=0.809]
100%|██████████| 125/125 [00:11<00:00, 10.79it/s, val_loss=0.0461, val_acc=0.719]


              precision    recall  f1-score   support

           0       0.78      0.94      0.85       282
           1       0.47      0.38      0.42       136
           2       0.59      0.49      0.54       212
           3       0.59      0.50      0.54       466
           4       0.80      0.87      0.83       904

    accuracy                           0.72      2000
   macro avg       0.65      0.63      0.64      2000
weighted avg       0.70      0.72      0.71      2000




[[264  12   3   0   3]
 [ 58  51  23   4   0]
 [  7  44 104  49   8]
 [  4   2  41 232 187]
 [  6   0   4 108 786]]
epoch  4


100%|██████████| 1125/1125 [05:43<00:00,  3.28it/s, train_loss=0.0197, train_acc=0.884]
100%|██████████| 125/125 [00:11<00:00, 10.87it/s, val_loss=0.0544, val_acc=0.723]


              precision    recall  f1-score   support

           0       0.84      0.83      0.83       282
           1       0.46      0.53      0.49       136
           2       0.61      0.54      0.57       212
           3       0.59      0.53      0.56       466
           4       0.81      0.86      0.84       904

    accuracy                           0.72      2000
   macro avg       0.66      0.66      0.66      2000
weighted avg       0.72      0.72      0.72      2000




[[233  42   4   2   1]
 [ 36  72  24   3   1]
 [  5  35 115  51   6]
 [  2   3  42 246 173]
 [  3   3   5 113 780]]
epoch  5


100%|██████████| 1125/1125 [05:35<00:00,  3.35it/s, train_loss=0.0117, train_acc=0.939]
100%|██████████| 125/125 [00:11<00:00, 10.90it/s, val_loss=0.0637, val_acc=0.718]


              precision    recall  f1-score   support

           0       0.84      0.82      0.83       282
           1       0.44      0.48      0.46       136
           2       0.59      0.56      0.58       212
           3       0.58      0.53      0.56       466
           4       0.81      0.86      0.83       904

    accuracy                           0.72      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.71      0.72      0.72      2000




[[230  45   5   0   2]
 [ 36  65  31   3   1]
 [  3  32 119  53   5]
 [  3   2  42 248 171]
 [  2   3   4 121 774]]


In [None]:
torch.save(model.state_dict(), 'checkpoints/model_bs{}_lr{}_dop{}.pth'.format(batch_size, learning_rate, dropout_prob))

In [None]:
# model = torch.load('model1.pth')

### Model Validation Evaluation

In [18]:
model.eval()
y_pred = []
y_true = []
val_acc = 0
val_count = 0
val_loss = 0
with tqdm.tqdm(valid_dataloader) as t:
    for input_ids, attention_mask, labels in t:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids,attention_mask,labels)
            loss = outputs.loss
        val_acc += (torch.argmax(outputs.logits,dim=-1) == labels).sum().item()
        val_count += len(labels)
        val_loss += loss.item()
        y_pred += torch.argmax(outputs.logits,dim=-1).tolist()
        y_true += labels.tolist()

        t.set_postfix({'val_loss': val_loss/val_count, 'val_acc': val_acc/val_count})

100%|██████████| 125/125 [00:12<00:00,  9.92it/s, val_loss=0.0475, val_acc=0.728]


In [19]:
len(y_pred)

2000

In [21]:
with open('logs/model_bs{}_lr{}_dop{}.txt'.format(batch_size, learning_rate, dropout_prob), 'w') as f:
    # f.write(str(classification_report(y_true, y_pred)))
    # f.write("\n\n")
    # f.write(str(confusion_matrix(y_true, y_pred)))
    print(str(classification_report(y_true, y_pred)))
    print("\n\n")
    print(str(confusion_matrix(y_true, y_pred)))

              precision    recall  f1-score   support

           0       0.86      0.83      0.85       282
           1       0.50      0.51      0.51       136
           2       0.58      0.56      0.57       212
           3       0.59      0.56      0.57       466
           4       0.82      0.86      0.84       904

    accuracy                           0.73      2000
   macro avg       0.67      0.66      0.67      2000
weighted avg       0.72      0.73      0.73      2000




[[235  37   7   1   2]
 [ 28  70  35   3   0]
 [  2  29 118  59   4]
 [  2   2  39 259 164]
 [  5   3   5 117 774]]


### B. Saving predictions to file

Your submitted predictions are supposed to be a .csv file containing two columns, i.e. (``review_id`` and ``stars``). 

Here, as an example, we generate some random predictions as our answer, which are put in a DataFrame and output to a .csv file

After getting your model predictions on the test set, you may follow these steps to generate your ``pred.csv`` file. (By replacing the random predictions with your model predictions)

In [22]:
class MyTestDataset(Dataset):
    def __init__(self, df):
        assert len(df['input_ids']) == len(df['attention_mask'])
        self.input_ids = df['input_ids']
        self.attention_mask = df['attention_mask']
    
    def __getitem__(self, idx):
        return np.asarray(self.input_ids[idx]), np.asarray(self.attention_mask[idx])

    def __len__(self):
        return len(self.input_ids)

In [23]:
batch_size = 16
test_dataloader = DataLoader(MyTestDataset(test_df), batch_size=batch_size)

In [24]:
model.eval()
y_pred = []
with tqdm.tqdm(test_dataloader) as t:
    for input_ids, attention_mask in t:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        with torch.no_grad():
            outputs = model(input_ids,attention_mask)
        y_pred += torch.argmax(outputs.logits,dim=-1).tolist()

100%|██████████| 250/250 [00:22<00:00, 11.09it/s]


In [25]:
test_df['stars'] = [x+1 for x in y_pred]

In [26]:
test_df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,text,useful,user_id,input_ids,attention_mask,stars
0,IKcZpSuELli7DUjU2fKGNg,1,2015-04-07 17:17:39,0,I77zZlSdCFAClxdjHwPcxw,OMG! I'm an avid spray tanner and have been al...,2,tUZtqzqE0bIOcLelcR4opg,"[0, 3765, 534, 328, 38, 437, 41, 20137, 11782,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
1,vbVJzKDhHlhMnKRpES5QzQ,1,2017-06-30 17:42:40,0,ioFNKarf29KGjRZdH0qC8Q,Sets the standard. Authentic. Outstanding. Cou...,1,Gwvrebru-kDM1N51aeJiFg,"[0, 104, 2580, 5, 2526, 4, 41808, 636, 4, 2548...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
2,GdPWJo3z4ySEXpF7Wkn3FA,0,2014-08-02 05:53:47,2,9429anmcYIcaEcMptJCNKQ,Came on 7/23/2014 with a group of 10 - service...,1,at7dS8gtLiEwd_4uHv231A,"[0, 347, 4344, 15, 262, 73, 1922, 73, 16310, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,BYc5IFQPq-PLVXnYjDp6vw,0,2015-02-20 19:31:21,0,PsUCdt7PKjzgBC0c7xXhJA,I love Bobs Subs! Tasty n made to order...yum!...,0,vaQxpV8IXqRmCIAHovP4NA,"[0, 100, 657, 3045, 29, 4052, 29, 328, 255, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
4,Wxxvi3LZbHNIDwJ-ZimtnA,0,2012-06-27 00:44:08,0,GQBlykKyShQcNeu2ivLdSA,This is my hotel of choice on the strip. I re...,0,dy_4NAZ0KR2bDoB9qAOMRg,"[0, 713, 16, 127, 2303, 9, 2031, 15, 5, 9572, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4


In [27]:
test_df.to_csv(f'test_df.csv',index=False)

In [28]:
pred_df = pd.DataFrame(data={
    'review_id': test_df['review_id'],
    'stars': test_df['stars']
})

In [29]:
pred_df.head()

Unnamed: 0,review_id,stars
0,I77zZlSdCFAClxdjHwPcxw,5
1,ioFNKarf29KGjRZdH0qC8Q,5
2,9429anmcYIcaEcMptJCNKQ,1
3,PsUCdt7PKjzgBC0c7xXhJA,5
4,GQBlykKyShQcNeu2ivLdSA,4


In [30]:
pred_df.to_csv(f'pred.csv', index=False)