In [1]:
# coding = utf-8
import pandas as pd
import numpy as np
import torch
import argparse
import os
import datetime
import traceback
import model
import random

DATA_PATH = '../Dyadic_PELD.tsv'

# identify and specify the GPU as the device, later in training loop we will load data into device
SEED = 19

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

MAX_LEN = 256

df = pd.read_csv(DATA_PATH, sep='\t').fillna('Nan')
    
Utterance_1   = df['Utterance_1'].values 
Utterance_2   = df['Utterance_2'].values
Utterance_3   = df['Utterance_3'].values

label_1 = df['Emotion_1']
label_2 = df['Emotion_2']
label_3 = df['Emotion_3']


VAD_Lexicons = pd.read_csv('NRC-VAD-Lexicon.txt', sep='\t')
VAD_dict = {}
for r in VAD_Lexicons.iterrows():
    VAD_dict[r[1]['Word']] = [r[1]['Valence'], r[1]['Arousal'], r[1]['Dominance']]

from utils import get_vad, Emotion_dict, Emotion_Senti

'''
Emotion_dict = {
    'anger': [-0.51, 0.59, 0.25],
    'disgust': [-0.60, 0.35, 0.11],
    'fear': [-0.62, 0.82, -0.43],
    'joy': [0.81, 0.51, 0.46],
    'neutral': [0.0, 0.0, 0.0],
    'sadness': [-0.63, -0.27, -0.33],
    'surprise': [0.40, 0.67, -0.13]
}
'''


Mood_dict = {}
Mood_dict['negative'] = np.average([
                np.array(Emotion_dict['anger']), 
                np.array(Emotion_dict['disgust']), 
                np.array(Emotion_dict['fear']), 
                np.array(Emotion_dict['sadness'])],axis=0)
Mood_dict['positive'] =  np.average([
                np.array(Emotion_dict['joy']),
                np.array(Emotion_dict['surprise'])],axis=0)
Mood_dict['neutral'] = np.array([0,0,0])

print(Mood_dict)


# Emo recognition
# sentences = list(Utterance_1) + list(Utterance_2) + list(Utterance_3)
# labels = list(label_1) + list(label_2) + list(label_3)
# df = pd.DataFrame([])
# df['sents'] = sentences
# df['labels'] = labels
# print(df.shape)
# df = df.drop_duplicates(subset=['sents'], keep='first', inplace=False)
# print(df.shape)
# df = df[df['labels'] != 'neutral']
# print(df.shape)

# sentences = df['sents'].values
# labels = df['labels'].values
# init_emo = get_vad(VAD_dict, sentences)


# sentences = list(Utterance_1)
# labels = list(label_1)

sentences = Utterance_1 + ' [SEP] ' + Utterance_2
labels = list(label_3)
df = pd.DataFrame([])
df['sents'] = sentences
df['labels'] = labels
df['init_emo'] = label_1
print(df.shape)
df = df.drop_duplicates(subset=['sents'], keep='first', inplace=False)
print(df.shape)
# df = df[df['labels'] != 'neutral']
print(df.shape)

sentences = df['sents'].values
labels = df['labels'].values
init_emo = list(df['init_emo'].apply(lambda x: Emotion_dict[x]))



from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
labels = labelencoder.fit_transform(labels)
labels = list(labels)


print(len(sentences), len(set(sentences)))
print(len(labels))



{'negative': array([-0.59  ,  0.3725, -0.1   ]), 'positive': array([0.605, 0.59 , 0.165]), 'neutral': array([0, 0, 0])}
(6510, 3)
(6185, 3)
(6185, 3)
6185 6185
6185


In [4]:
import numpy as np

Emotion_dict = {
    'anger': [-0.51, 0.59, 0.25],
    'disgust': [-0.60, 0.35, 0.11],
    'fear': [-0.62, 0.82, -0.43],
    'joy': [0.81, 0.51, 0.46],
    'neutral': [0.0, 0.0, 0.0],
    'sadness': [-0.63, -0.27, -0.33],
    'surprise': [0.40, 0.67, -0.13]
}

np.average([
                np.array(Emotion_dict['joy']), 
                np.array(Emotion_dict['surprise']), 
],axis=0)

array([0.605, 0.59 , 0.165])

In [2]:
init_emo

[[0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.4, 0.67, -0.13],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [-0.62, 0.82, -0.43],
 [0.4, 0.67, -0.13],
 [-0.63, -0.27, -0.33],
 [0.4, 0.67, -0.13],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.81, 0.51, 0.46],
 [-0.63, -0.27, -0.33],
 [0.4, 0.67, -0.13],
 [0.0, 0.0, 0.0],
 [-0.6, 0.35, 0.11],
 [-0.63, -0.27, -0.33],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.81, 0.51, 0.46],
 [0.4, 0.67, -0.13],
 [0.4, 0.67, -0.13],
 [0.81, 0.51, 0.46],
 [0.4, 0.67, -0.13],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.81, 0.51, 0.46],
 [0.81, 0.51, 0.46],
 [0.81, 0.51, 0.46],
 [-0.63, -0.27, -0.33],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.81, 0.51, 0.46],
 [0.4, 0.67, -0.13],
 [0.81, 0.51, 0.46],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [-0.51, 0.59, 0.25],
 [0.81, 0.51, 0.46],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.81, 0.51, 0.46],
 [0.81, 0.51, 0.46],
 [0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [0.81, 0.51, 0.46],
 [0.0, 0.0, 0.0],
 [0.

In [3]:
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup

MAX_LEN = 256

## Import BERT tokenizer, that is used to convert our text into tokens that corresponds to BERT library
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
input_ids = [tokenizer.encode(sent, add_special_tokens=True,max_length=MAX_LEN,pad_to_max_length=True) for sent in sentences]

print("Actual sentence before tokenization: ",sentences[2])
print("Encoded Input from dataset: ",input_ids[2])

## Create attention mask
attention_masks = []
## Create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
print(attention_masks[2])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Actual sentence before tokenization:  My duties?  All right. [SEP] Now you ll be heading a whole division, so you ll have a lot of duties.
Encoded Input from dataset:  [101, 2026, 5704, 1029, 2035, 2157, 1012, 102, 2085, 2017, 2222, 2022, 5825, 1037, 2878, 2407, 1010, 2061, 2017, 2222, 2031, 1037, 2843, 1997, 5704, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook

import random
import os
import io

train_inputs,test_inputs,train_labels,test_labels = train_test_split(input_ids,labels,random_state=41,test_size=0.1)
train_masks,test_masks,_,_ = train_test_split(attention_masks,input_ids,random_state=41,test_size=0.1)
train_init_emos,test_init_emos,_,_ = train_test_split(init_emo,input_ids,random_state=41,test_size=0.1)

train_set_labels = train_labels


train_inputs,validation_inputs,train_labels,validation_labels = train_test_split(train_inputs,train_set_labels,random_state=41,test_size=0.1)
train_masks,validation_masks,_,_ = train_test_split(train_masks,train_set_labels,random_state=41,test_size=0.1)
train_init_emos,validation_init_emos,_,_ = train_test_split(train_init_emos,train_set_labels,random_state=41,test_size=0.1)


In [5]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# convert all our data into torch tensors, required data type for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
test_inputs = torch.tensor(test_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)

train_init_emos = torch.tensor(train_init_emos)
validation_init_emos = torch.tensor(validation_init_emos)
test_init_emos = torch.tensor(test_init_emos)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
test_masks = torch.tensor(test_masks)

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs,train_masks,train_init_emos,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(validation_inputs,validation_masks,validation_init_emos,validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

test_data = TensorDataset(test_inputs,test_masks,test_init_emos,test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data,sampler=test_sampler,batch_size=batch_size)


In [6]:
train_data[0]

(tensor([  101, 10166,  1010,  2008,  2003,  1037,  4474,  1012,  2074,  2028,
          2210,  3160,  1010,  7910,  1010,  2339,  2025,  5811,  1005,  1055,
          2282,  1029,   102,  2175,  4095,  1010,  2057,  5720,  2055,  2008,
          2021,  2115,  2567,  2038,  2061,  2116,  2671, 22236,  1998, 28487,
          1998,  7857, 23433,  1010,  2092,  2057,  2134,  1005,  1056,  2215,
          2000, 22995,  2068,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [7]:
from transformers import BertConfig, BertModel, BertPreTrainedModel
import torch.nn as nn
import torch.nn.functional as F
import torch


class ClassificationHead(nn.Module):
    """
    Head for sentence-level classification tasks.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()        
        self.dense = nn.Linear(input_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.out_proj = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class Emo_Generation(BertPreTrainedModel):
    def __init__(self, config, mode):
        super().__init__(config)
        self.num_labels = 7
        self.mid_size = 100 
        self.bert = BertModel(config)
        self.mode = mode

        # self.reduce = nn.Linear(config.hidden_size*2, config.hidden_size)
        
        
        if mode == 1: # mode 1: directlly classify with bert embedding;
            self.utter_classifier = nn.Linear(config.hidden_size, 7)
        elif mode == 2: # mode 2: concat bert embedding and personality;
            self.personality_trans = nn.Linear(5, self.mid_size) # 5-d personality vec     
        elif mode == 3: # mode 3: personality-based emotion transition;
            self.utter_classifier = nn.Linear(config.hidden_size, 3)
            # self.vad_para_trans = nn.Linear(3, 3) 
            # self.vad_to_hidden = nn.Linear(3, config.hidden_size)
            
            self.init_transfer = ClassificationHead(3, config.hidden_size, 3)
            self.cls_head = ClassificationHead(3, config.hidden_size, 7)
            

    def personality_to_vad(self, personality):
        O, C, E, A, N = personality[:, 0], personality[:, 1], personality[:, 2], personality[:, 3], personality[:, 4]
        
        valence = 0.21 * E + 0.59 * A + 0.19 * N
        arousal = 0.15 * O + 0.30 * A - 0.57 * N
        dominance = 0.25 * O + 0.17 * C + 0.60 * E - 0.32 * A

        # valence = E + A + N
        # arousal = O + A - N
        # dominance = O + C + E - A

        return torch.cat((valence.unsqueeze(-1), arousal.unsqueeze(-1), dominance.unsqueeze(-1)), 1)
    
    def forward(self, input_ids, attention_mask, init_emo=None):
        bert_outputs = self.bert(input_ids, attention_mask)
        bert_hidden = bert_outputs[1]
        
        if self.mode == 1:
            logits = self.utter_classifier(bert_hidden)
        elif self.mode == 2: 
            personality = self.personality_trans(personality.cuda(device))
            logits = self.utter_classifier(bert_hidden, personality)        
        elif self.mode == 3:
            utter_emo = self.utter_classifier(bert_hidden) # delta of v, a, d
            init_emo = self.init_transfer(init_emo)
            target_emo = init_emo + utter_emo# * personality_influence
            logits = self.cls_head(utter_emo)
        
        return logits

        


In [8]:


# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=7).cuda(1)


model = Emo_Generation.from_pretrained('bert-base-uncased', mode=3).cuda(1)
# model = Emo_Generation(mode=3).cuda(1)



# Parameters:
lr = 1e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 50

num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs


# for name, param in model.named_parameters():
#         if name.startswith('bert'):
#             param.requires_grad = False
#         else:
#             pass
#         if name.startswith('bert.encoder.layer.11') or name.startswith('bert.pooler'):
#             param.requires_grad = True
              


### In Transformers, optimizer and schedules are splitted and instantiated like this:
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

Some weights of the model checkpoint at bert-base-uncased were not used when initializing Emo_Generation: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing Emo_Generation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Emo_Generation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Emo_Generation were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['init_transfe

In [None]:

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report

## Store our loss and accuracy for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
model.zero_grad()

macro_list = []

# tnrange is a tqdm wrapper around the normal python range
for _ in tnrange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    # Calculate total loss for this epoch
    batch_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()
        
        # Add batch to GPU
        batch = tuple(t.cuda(1) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_init_emo, b_labels = batch

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask, init_emo=b_init_emo)
        
        loss_fct = nn.CrossEntropyLoss(weight = torch.FloatTensor([0.6342, 5.9110, 0.8695, 0.5490, 0.4640, 0.8700, 0.7023]).cuda(1))
        
        # loss_fct = nn.CrossEntropyLoss()        
        loss     = loss_fct(outputs, b_labels)
        # loss = outputs[0]
        
        # Backward pass
        loss.backward()
        
        # Clip the norm of the gradients to 1.0
        # Gradient clipping is not in AdamW anymore
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
        # Update learning rate schedule
        scheduler.step()

        # Clear the previous accumulated gradients
        optimizer.zero_grad()
        
        # Update tracking variables
        batch_loss += loss.item()

    # Calculate the average loss over the training data.
    avg_train_loss = batch_loss / len(train_dataloader)

    #store the current learning rate
    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ",param_group['lr'])
        learning_rate.append(param_group['lr'])
      
    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')
      
    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()
    
    pred_list = []
    labels_list = []
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.cuda(1) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_init_emo, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
          # Forward pass, calculate logit predictions
          logits = model(b_input_ids, attention_mask=b_input_mask, init_emo=b_init_emo)
        
        # Move logits and labels to CPU
        logits = logits.to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()
  
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        pred_list = np.append(pred_list, pred_flat)
        labels_list = np.append(labels_list, labels_flat)


    result = classification_report(pred_list, labels_list, digits=4, output_dict=False)
    # print(result)
    
    print('Test')
    pred_list = []
    labels_list = []
    # Evaluate data for one epoch
    for batch in test_dataloader:
        # Add batch to GPU
        batch = tuple(t.cuda(1) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_init_emo, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
          # Forward pass, calculate logit predictions
          logits = model(b_input_ids, attention_mask=b_input_mask, init_emo=b_init_emo)
        
        # Move logits and labels to CPU
        logits = logits.to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()
  
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        
        pred_list = np.append(pred_list, pred_flat)
        labels_list = np.append(labels_list, labels_flat)


    result = classification_report(pred_list, labels_list, digits=4, output_dict=False)
    dic = classification_report(pred_list, labels_list, digits=4, output_dict=True)
    macro_list.append(dic['macro avg']['f1-score'])
    print(result)
    
    

  for _ in tnrange(1,epochs+1,desc='Epoch'):


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]


	Current Learning rate:  9.800000000000001e-06

	Average Training loss: 1.8749165225560498


  _warn_prf(average, modifier, msg_start, len(result))


Test
              precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000         0
         1.0     0.0000    0.0000    0.0000         0
         2.0     0.0000    0.0000    0.0000         0
         3.0     0.0000    0.0000    0.0000         0
         4.0     1.0000    0.4297    0.6011       619
         5.0     0.0000    0.0000    0.0000         0
         6.0     0.0000    0.0000    0.0000         0

    accuracy                         0.4297       619
   macro avg     0.1429    0.0614    0.0859       619
weighted avg     1.0000    0.4297    0.6011       619


	Current Learning rate:  9.600000000000001e-06

	Average Training loss: 1.8663953565488196
Test
              precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000         0
         1.0     0.0000    0.0000    0.0000         0
         2.0     0.0000    0.0000    0.0000         0
         3.0     0.0000    0.0000    0.0000         0
         4.0     1.0000    0.4


	Current Learning rate:  7.4e-06

	Average Training loss: 1.1024649481105198
Test
              precision    recall  f1-score   support

         0.0     0.4675    0.1417    0.2175       254
         1.0     0.0625    0.0909    0.0741        11
         2.0     0.0244    0.1111    0.0400         9
         3.0     0.0000    0.0000    0.0000         7
         4.0     0.6090    0.4793    0.5364       338
         5.0     0.0000    0.0000    0.0000         0
         6.0     0.0000    0.0000    0.0000         0

    accuracy                         0.3231       619
   macro avg     0.1662    0.1176    0.1240       619
weighted avg     0.5259    0.3231    0.3841       619


	Current Learning rate:  7.2000000000000005e-06

	Average Training loss: 1.0626954407828628
Test
              precision    recall  f1-score   support

         0.0     0.4156    0.1410    0.2105       227
         1.0     0.0000    0.0000    0.0000        10
         2.0     0.0000    0.0000    0.0000         0
     

In [11]:
macro_list

[0.08587570621468925,
 0.08587570621468925,
 0.11466490657407771,
 0.09872862346220392,
 0.09204270906398568,
 0.09796813462226997,
 0.09924577149555765,
 0.10137038165450687,
 0.11662261664892273,
 0.11958529138561512,
 0.12301217718072155,
 0.12961078918525729,
 0.12400293910627837,
 0.12277937490785788,
 0.14193439778843456,
 0.1697131876185775,
 0.15785813539081728,
 0.1759443854859024,
 0.15793485763076415,
 0.14285940188140941,
 0.1611964277392315,
 0.1594931913040873,
 0.14799881725920586,
 0.14848455616446157,
 0.14233899708634262,
 0.15587599020332324,
 0.15985353723695317,
 0.16713670117082788,
 0.1605265270814844,
 0.16646151298850392,
 0.16521365132013993,
 0.15744437112259477,
 0.1618604790066016,
 0.14860607728866262,
 0.13913450944089284,
 0.15609911631465728,
 0.15717668801315152,
 0.15978426937381626,
 0.158577142899096,
 0.16898885308797046,
 0.16997583401813035,
 0.15875757476289776,
 0.17548121473765502,
 0.17204696115981713,
 0.16248104150730455,
 0.178158876630468