In [18]:
import numpy as np
import torch
import random
import pickle
import pandas as pd

import torch.nn as nn
import torch.optim as optim

from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

from datasets import load_dataset


### https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [2]:
# Fix every source of randomness used in this notebook so that a
# Restart & Run All reproduces the same results.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Seed every visible GPU, not only the current one.
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# cuDNN autotuning may select different (possibly non-deterministic) kernels
# from run to run; disable it alongside `deterministic = True`.
torch.backends.cudnn.benchmark = False

In [3]:
# Pick the compute device once and keep it in `device` so later cells can
# move models/tensors with `.to(device)`. Previously the device objects were
# constructed and immediately discarded, so nothing was actually selected.
if torch.cuda.is_available():
    dev = torch.cuda.current_device()
    device = torch.device("cuda", dev)
    # Report the device that is actually in use rather than hard-coded index 0.
    print("Using", torch.cuda.get_device_name(dev))
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GeForce GTX 1060


### Processing the data [DailyDialog] to make it task specific

In [4]:
# Load the 'daily_dialog' dataset through the Hugging Face `datasets` library;
# `dataset` is read as a module-level global by split_data_into_lists below.
dataset = load_dataset('daily_dialog')
# Show the available splits together with their features and row counts.
print(dataset)

Using custom data configuration default
Reusing dataset daily_dialog (/home/sambit/.cache/huggingface/datasets/daily_dialog/default/1.0.0/c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c)


DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})


In [5]:
# Peek at the first two training dialogs; slicing returns a column-wise
# mapping (one list per feature: 'act', 'dialog', 'emotion').
dataset['train'][:2]

OrderedDict([('act', [[3, 4, 2, 2, 2, 3, 4, 1, 3, 4], [2, 1, 2, 2, 1, 1]]),
             ('dialog',
              [['Say , Jim , how about going for a few beers after dinner ? ',
                ' You know that is tempting but is really not good for our fitness . ',
                ' What do you mean ? It will help us to relax . ',
                " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
                " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
                ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
                " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
                ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
                " Good.Let ' s go now . ",
     

In [8]:
def split_data_into_lists(mode):
    """Flatten one dataset split into (previous, current) utterance pairs and save it as CSV.

    For every dialog in ``dataset[mode]``, each utterance from the second one
    onwards produces a row holding the preceding utterance, the utterance
    itself, and the act and emotion labels of the current utterance. Dialogs
    with a single utterance therefore contribute no rows.

    Parameters
    ----------
    mode : str
        Key of the split to process in the module-level ``dataset``
        (e.g. "train", "validation", "test"); also used as the CSV file name.

    Returns
    -------
    None
        The result is written to ``data/daily_dialog/csv/<mode>.csv``.
    """
    import os

    out_dir = "data/daily_dialog/csv/"
    # Create the output directory on first use so a fresh checkout does not
    # fail with FileNotFoundError inside to_csv.
    os.makedirs(out_dir, exist_ok=True)

    rows = []
    for d in dataset[mode]:
        acts = d["act"]
        dialogs = d["dialog"]
        emotions = d["emotion"]

        # Start at 1: the first utterance has no predecessor to pair with.
        for idx in range(1, len(acts)):
            rows.append([dialogs[idx-1], dialogs[idx], acts[idx], emotions[idx]])

    df = pd.DataFrame(data=rows, columns=['previous_dialog', 'current_dialog', 'act', 'emotion'])
    df.to_csv(out_dir + mode + ".csv", index=False)

In [9]:
### Regenerates the per-split CSV files; this only needs to be re-run
### if the data splitting process is changed.
for split_name in ("test", "validation", "train"):
    split_data_into_lists(split_name)

### Pre-processing for BERT

In [10]:
# Dialogue-act ids: { 1: inform, 2: question, 3: directive, 4: commissive }
X = [[act_id] for act_id in range(1, 5)]
ohe_act = OneHotEncoder(handle_unknown='ignore').fit(X)

# Emotion ids: { 0: no emotion, 1: anger, 2: disgust, 3: fear, 4: happiness, 5: sadness, 6: surprise }
X = [[emotion_id] for emotion_id in range(7)]
ohe_emo = OneHotEncoder(handle_unknown='ignore').fit(X)

# Quick sanity check: encode one act id and one emotion id.
print(ohe_act.transform([[1]]).toarray())
print(ohe_emo.transform([[4]]).toarray())

[[1. 0. 0. 0.]]
[[0. 0. 0. 0. 1. 0. 0.]]


In [39]:
def load_csv(mode):
    """Read ``data/daily_dialog/csv/<mode>.csv`` and return it as a DataFrame."""
    csv_path = "data/daily_dialog/csv/" + mode + ".csv"
    return pd.read_csv(csv_path)

def get_cell(df, row, column_name):
    """Return the value of ``column_name`` at positional row ``row`` of ``df``.

    ``row`` is a position (0-based), not an index label: it is first mapped
    to the corresponding index label, which is then looked up with ``.loc``.
    """
    row_label = df.index[row]
    return df.loc[row_label, column_name]

### Loading and tokenizing the data

In [12]:
# Load the three flattened splits, in the same order as before.
train, val, test = (
    load_csv(split_name) for split_name in ("train", "validation", "test")
)

In [13]:
# Load the pretrained uncased BERT WordPiece tokenizer; it is used as a
# module-level global by get_processed_row below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Demonstrate the two tokenisation steps on the first training utterance.
sentence = get_cell(train, 0, "current_dialog")
print("Original:", sentence)
# Step 1: split the text into tokens.
tokenized = tokenizer.tokenize(sentence)
print("Tokenised:", tokenized)
# Step 2: map each token to its vocabulary id.
tokenided = tokenizer.convert_tokens_to_ids(tokenized)
print("Token ID:", tokenided)

Original:  You know that is tempting but is really not good for our fitness . 
Tokenised: ['you', 'know', 'that', 'is', 'tempting', 'but', 'is', 'really', 'not', 'good', 'for', 'our', 'fitness', '.']
Token ID: [2017, 2113, 2008, 2003, 23421, 2021, 2003, 2428, 2025, 2204, 2005, 2256, 10516, 1012]


In [42]:
# Running maxima of the token counts seen by get_processed_row; reset them
# to 0 before a pass if per-pass figures are wanted.
max_previous_length = 0
max_current_length = 0

def get_processed_row(df, row, mode="emo"):
    """Tokenise the previous and current dialog of one row and one-hot encode its label.

    Parameters
    ----------
    df : DataFrame with 'previous_dialog', 'current_dialog', 'act' and
        'emotion' columns.
    row : positional row number within ``df``.
    mode : "act" selects the act label (encoded with ``ohe_act``); any other
        value selects the emotion label (encoded with ``ohe_emo``).

    Returns
    -------
    tuple
        ``(previous_token_ids, current_token_ids, one_hot_label)`` where the
        label is a flat Python list.

    Side effects
    ------------
    Updates the module-level ``max_previous_length`` / ``max_current_length``
    with the longest token sequences seen so far.
    """
    global max_previous_length, max_current_length

    previous_ids = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize(get_cell(df, row, "previous_dialog"))
    )
    current_ids = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize(get_cell(df, row, "current_dialog"))
    )

    max_current_length = max(max_current_length, len(current_ids))
    max_previous_length = max(max_previous_length, len(previous_ids))

    # Choose the encoder/column pair matching the requested label type.
    if mode == "act":
        encoder, label_column = ohe_act, "act"
    else:
        encoder, label_column = ohe_emo, "emotion"

    raw_label = get_cell(df, row, label_column)
    label = encoder.transform([[raw_label]]).toarray().flatten().tolist()

    return previous_ids, current_ids, label


def get_unprocessed_row(df, row, mode="emo"):
    """Return the untokenised sentences of one row together with its raw label.

    Unlike get_processed_row, nothing is tokenised or one-hot encoded here:
    the label is returned exactly as stored in the DataFrame column, so the
    result can be fed to a tokenizer call such as ``encode_plus`` later.

    Parameters
    ----------
    df : DataFrame with 'previous_dialog', 'current_dialog', 'act' and
        'emotion' columns.
    row : positional row number within ``df``.
    mode : "act" returns the 'act' column value; any other value returns the
        'emotion' column value.

    Returns
    -------
    tuple
        ``(previous_sentence, current_sentence, label)``.
    """
    label_column = "act" if mode == "act" else "emotion"

    previous_text = get_cell(df, row, "previous_dialog")
    current_text = get_cell(df, row, "current_dialog")

    return previous_text, current_text, get_cell(df, row, label_column)

In [43]:
### Save tokenised data

def save_to_pkl(data, mode, dataset="emo"):
    """Pickle tokenised ``data`` to ``data/daily_dialog/pkl/<mode>_<dataset>.pkl``.

    Parameters
    ----------
    data : any picklable object (here: a list of processed rows).
    mode : str, split name used in the file name (e.g. "train").
    dataset : str, label kind used in the file name (default "emo").
    """
    import os

    out_dir = "data/daily_dialog/pkl/"
    # Create the target directory if needed so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + mode + "_" + dataset + ".pkl", "wb") as f:
        pickle.dump(data, f)

def save_to_sent(data, mode, dataset="emo"):
    """Pickle sentence-level ``data`` to ``data/daily_dialog/sent/<mode>_<dataset>.pkl``.

    Parameters
    ----------
    data : any picklable object (here: a list of unprocessed rows).
    mode : str, split name used in the file name (e.g. "train").
    dataset : str, label kind used in the file name (default "emo").
    """
    import os

    out_dir = "data/daily_dialog/sent/"
    # Create the target directory if needed so a fresh checkout does not
    # fail with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + mode + "_" + dataset + ".pkl", "wb") as f:
        pickle.dump(data, f)

### Generating PKLs for Emotion Data & Act Data (Sentences are tokenised)

In [40]:
# Tokenise every split twice: first with emotion labels, then with act labels.
for label_kind in ("emo", "act"):
    # The running maxima are reset at the start of each pass, so the figures
    # printed below come from the final ("act") pass.
    max_previous_length = 0
    max_current_length = 0

    train_data = [get_processed_row(train, i, label_kind) for i in range(train.shape[0])]
    save_to_pkl(train_data, "train", label_kind)

    val_data = [get_processed_row(val, i, label_kind) for i in range(val.shape[0])]
    save_to_pkl(val_data, "val", label_kind)

    test_data = [get_processed_row(test, i, label_kind) for i in range(test.shape[0])]
    save_to_pkl(test_data, "test", label_kind)

print("Max Input Lengths:")
print("Previous Sentence:", max_previous_length)
print("Current Sentence:", max_current_length)

Max Input Lengths:
Previous Sentence: 275
Current Sentence: 294


### Generating PKLs for Sentences

In [44]:
# Save the raw sentence pairs for every split: emotion labels first, then
# act labels.
for label_kind in ("emo", "act"):
    train_data = [get_unprocessed_row(train, i, label_kind) for i in range(train.shape[0])]
    save_to_sent(train_data, "train", label_kind)

    val_data = [get_unprocessed_row(val, i, label_kind) for i in range(val.shape[0])]
    save_to_sent(val_data, "val", label_kind)

    test_data = [get_unprocessed_row(test, i, label_kind) for i in range(test.shape[0])]
    save_to_sent(test_data, "test", label_kind)