In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import sklearn
import pandas as pd
import numpy as np

from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix,classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm, trange,tnrange,tqdm_notebook
from transformers import BertTokenizer, BertConfig
import random
import shutil

## Load T5 Model

In [2]:
# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Load Data 

In [26]:
import string
punctuation = string.punctuation
import nltk

import re
def sent_tokenize(df,sent_method):
    sents = []
    for index,row in df.iterrows():
        sent = []
        tempSent = row['utterance'].split('|||')
        if sent_method == 'nltk':
            for i in tempSent:
                sent = sent + nltk.sent_tokenize(i)
        elif sent_method == 'comma':
            for i in tempSent:
                sent = sent + [j.strip().lower() for j in re.split(r"[,,.,?,!,;()]",i) if len(j.strip().split(' '))>1]
        sents = sents + sent
    return sents


data_paths = [
#     '../data/FriendsPersona/Friends_A_whole.tsv',
#     '../data/myPersonality/MyPersonality_A_whole.tsv',
#     '../data/pan2015/Pan_A_whole.tsv',
    '../data/Essay/Essay_A_whole.tsv',
    # '../data/Kaggle_mbti/Kaggle_map_label_words_comma.tsv'  
]

df_data = pd.DataFrame([])
for path in data_paths:
    df = pd.read_csv(path,  sep='\t')
    sents = sent_tokenize(df,'comma')
    df_sents = pd.DataFrame(sents)
    df_data = pd.concat([df_data, df_sents], axis=0)
    print(df_data.shape)
df_data = df_data.drop_duplicates()
print(df_data.shape)

(158419, 1)
(149305, 1)


In [23]:
label_words = []
for trait in ['A','C','E','O','N']:
    with open('label_words/'+trait+'_words.txt', 'r') as f:
        pos = f.readline().split(',')
        neg = f.readline().split(',')
    label_words += pos
    label_words += neg
print(len(label_words))
label_words = set(label_words)
print(len(label_words))

7610
1434


In [18]:
refined_label_words = label_words - set(('think', 'about'))

In [27]:
import datetime
starttime = datetime.datetime.now()
context_dict = {}

sent_with_label_word = []
for sent in df_data[0]: ## 200015
    for word in refined_label_words: ## 1433
        if word in sent.split(' '):
            sent_with_label_word.append([sent, word])
            w_list = sent.split(' ')
            for w in w_list:
                if not w == word:
                    try:
                        context_dict[w] += 1
                    except:
                        context_dict[w] = 1
print(len(sent_with_label_word))

endtime = datetime.datetime.now()
print((endtime - starttime))


41741
0:01:01.012563


In [28]:
sorted_dict = {k: v for k, v in sorted(context_dict.items(), key=lambda item: item[1], reverse=True)}
cnt = 0
for k, v in sorted_dict.items():
    print(k,v)
    if cnt > 20:
        break
    cnt += 1

i 57807
to 32087
and 23516
the 21009
that 16525
is 15529
a 15257
my 14533
it 13617
of 13206
in 9602
but 9044
so 8359
 8013
be 7981
me 7712
have 7377
this 6663
am 6303
was 6287
for 6127
not 6033


In [6]:
df_data = pd.DataFrame([])
df_data['sent'] = [i[0] for i in sent_with_label_word]
df_data['label_word'] = [i[1] for i in sent_with_label_word]
df_data.head()

Unnamed: 0,sent,label_word
0,and everything was fine until,fine
1,lying on massage table,lying
2,moving his hands up phoebe's legs,moving
3,just some basic dehydrating of a few fruits an...,basic
4,i'm nervous,nervous


In [7]:
def get_span(pos, row):
    if pos == 1:
        return row['sent'].split(row['label_word'])[0]
    elif pos == 2:
        try:
            return row['sent'].split(row['label_word'])[1]
        except:
            return ""

df_data['span_1'] = df_data.apply(lambda x: get_span(1, x), axis=1)
df_data['span_2'] = df_data.apply(lambda x: get_span(2, x), axis=1)
df_data.head()

Unnamed: 0,sent,label_word,span_1,span_2
0,and everything was fine until,fine,and everything was,until
1,lying on massage table,lying,,on massage table
2,moving his hands up phoebe's legs,moving,,his hands up phoebe's legs
3,just some basic dehydrating of a few fruits an...,basic,just some,dehydrating of a few fruits and vegetables
4,i'm nervous,nervous,i'm,


In [9]:
df_data.head(1000).to_csv('test.tsv', sep='\t')

## Construct Training Samples

In [38]:
def get_input_and_target(df):
    span_1   = '<extra_id_0> '
    span_2   = ' <extra_id_1> '
    span_end = ' <extra_id_2>'
    tmp_df = df.fillna(" ")
    tmp_df['inputs'] = span_1 + tmp_df['label_word'] + span_2
    tmp_df['target'] = span_1 + tmp_df['span_1'] + span_2 + tmp_df['span_2'] + span_end
    return list(tmp_df['inputs']), list(tmp_df['target'])

inputs, targets = get_input_and_target(df_data)


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

max_len    = 20
batch_size = 16

inputs  = [tokenizer.encode(sent, add_special_tokens=True, max_length=max_len, pad_to_max_length=True) for sent in inputs]
targets = [tokenizer.encode(sent, add_special_tokens=True, max_length=max_len, pad_to_max_length=True) for sent in targets]


## TO DO ::: construct input mask
input_attention_masks = [[float(i>0) for i in seq] for seq in inputs]



train_inputs  = inputs
train_masks   = input_attention_masks
train_targets = targets


train_inputs     = torch.tensor(train_inputs)
train_targets    = torch.tensor(train_targets)
train_masks      = torch.tensor(train_masks)

train_data       = TensorDataset(train_inputs, train_masks, train_targets)
train_sampler    = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## Fine-tuning T5 model

In [39]:
num_epoch          = 100
learning_rate      = 1e-4
adam_epsilon       = 1e-8
num_warmup_steps   = 0
num_training_steps = len(train_dataloader)*num_epoch


optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr = learning_rate, eps = adam_epsilon, correct_bias = False)  
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  


model.cuda()
for _ in tnrange(1, num_epoch+1, desc='Epoch'):
    print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    batch_loss = 0
    for step, batch in enumerate(train_dataloader):
        model.train()
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_input_masks, b_labels = batch

        loss = model(input_ids=b_input_ids, attention_mask=b_input_masks, labels=b_labels).loss
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()
        
        batch_loss += loss.item()
        
        ## evaluation
        
    print(batch_loss)

  for _ in tnrange(1, num_epoch+1, desc='Epoch'):


Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

9295.930498123169
8639.428654670715
8267.243554472923
7954.965410113335
7687.0647040605545
7440.246058940887
7212.2961703538895
7003.700336933136
6803.423740327358
6616.095346331596
6440.434916853905
6269.80981528759
6101.765981674194
5947.186869859695
5795.640866398811
5652.573081374168
5514.282096624374
5375.085220873356
5251.261933624744
5123.383871793747
5006.344127833843
4891.139159560204
4782.074482083321
4675.641802370548
4567.492575705051
4473.959973037243
4384.846105277538
4289.998818397522
4199.032634437084
4119.709498345852
4039.796664059162
3951.1289554834366
3885.654219150543
3814.5815757513046
3744.602669239044
3677.2528742551804
3613.4548812508583
3552.9242827892303
3494.76059705019
3437.736697435379
3387.597365438938
3338.465136706829
3285.2657828330994
3238.224386870861
3194.032771408558
3147.442440867424
3108.299083471298
3069.962757885456
3026.3040993511677
2993.383432030678
2952.718172043562
2922.073758125305
2889.356928765774
2853.512800067663
2829.212549418211
279

## Save Model

In [40]:
model_path = 'Friends_template_t5_base/'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('Friends_template_t5_base/tokenizer_config.json',
 'Friends_template_t5_base/special_tokens_map.json',
 'Friends_template_t5_base/spiece.model',
 'Friends_template_t5_base/added_tokens.json')

## Save Prior Logits

#### construct input samples for all the label words:

In [None]:
# df_verbalizer = pd.read_csv('big_five_cleaned.tsv', sep='\t')
# pos = [a.lower() for a in list(df_verbalizer['word'][df_verbalizer['A']>0])]
# neg = [a.lower() for a in list(df_verbalizer['word'][df_verbalizer['A']<0])]
# label_words = pos + neg

# span_1   = '<extra_id_0> '
# span_2   = ' <extra_id_1> '

# inputs = []
# for w in label_words:    
#      inputs.append(span_1 + w + span_2)

# inputs  = [tokenizer.encode(sent, add_special_tokens=True, max_length=max_len, pad_to_max_length=True) for sent in inputs]
# input_attention_masks = [[float(i>0) for i in seq] for seq in inputs]


#### generate the logits of each position in the template

In [None]:
# # inputs = torch.tensor(inputs)
# # input_attention_masks = torch.tensor(input_attention_masks)

# label_word_logits = []
# for input_, mask_ in zip(inputs, input_attention_masks):
#     input_ = torch.tensor(input_).cuda()
#     mask_  = torch.tensor(mask_).cuda()
#     logits = model(input_ids=input_, attention_mask=mask_)[0]
#     label_word_logits.append(logits)
#     label_word_logits = torch.cat(label_word_logits, 0)

# print(label_word_logits.shape)