In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import sklearn
import pandas as pd
import numpy as np

from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm, trange,tnrange,tqdm_notebook
from transformers import BertTokenizer, BertConfig
import random
import shutil



## Load Data 

In [2]:
import string
punctuation = string.punctuation
import nltk

import re
def sent_tokenize(df,sent_method):
    sents = []
    for index,row in df.iterrows():
        sent = []
        tempSent = row['utterance'].split('|||')
        if sent_method == 'nltk':
            for i in tempSent:
                sent = sent + nltk.sent_tokenize(i)
        elif sent_method == 'comma':
            for i in tempSent:
                sent = sent + [j.strip().lower() for j in re.split(r"[,,.,?,!,;()]",i) if len(j.strip().split(' '))>1]
        sents = sents + sent
    return sents


data_paths = [
    '../data/FriendsPersona/Friends_A_whole.tsv',
    '../data/myPersonality/MyPersonality_A_whole.tsv',
    '../data/pan2015/Pan_A_whole.tsv',
    '../data/Essay/Essay_A_whole.tsv',
#     '../data/Kaggle_mbti/Kaggle_map_label_words_comma.tsv'  
]

df_data = pd.DataFrame([])
for path in data_paths:
    df = pd.read_csv(path,  sep='\t')
    sents = sent_tokenize(df,'comma')
    df_sents = pd.DataFrame(sents)
    df_data = pd.concat([df_data, df_sents], axis=0)
    print(df_data.shape)
df_data = df_data.drop_duplicates()
print(df_data.shape)

(5631, 1)
(24140, 1)
(54665, 1)
(213084, 1)
(200015, 1)


In [3]:
def get_span(pos, row):
    if pos == 1:
        return row['sent'].split(row['label_word'])[0]
    elif pos == 2:
        try:
            return row['sent'].split(row['label_word'])[1]
        except:
            return ""

## Construct Training Samples

In [4]:

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

def get_input_and_target(df):
    span_1   = '<extra_id_0> '
    span_2   = ' <extra_id_1> '
    span_end = ' <extra_id_2>'
    tmp_df = df.fillna(" ")
    tmp_df['inputs'] = span_1 + tmp_df['label_word'] + span_2
    tmp_df['target'] = span_1 + tmp_df['span_1'] + span_2 + tmp_df['span_2'] + span_end
    return list(tmp_df['inputs']), list(tmp_df['target'])



def construct_training(df_data_, tokenizer):
    inputs, targets = get_input_and_target(df_data_)
    max_len    = 20
    batch_size = 16

    inputs  = [tokenizer.encode(sent, add_special_tokens=True, max_length=max_len, pad_to_max_length=True) for sent in inputs]
    targets = [tokenizer.encode(sent, add_special_tokens=True, max_length=max_len, pad_to_max_length=True) for sent in targets]

    input_attention_masks = [[float(i>0) for i in seq] for seq in inputs]



    train_inputs  = inputs
    train_masks   = input_attention_masks
    train_targets = targets


    train_inputs     = torch.tensor(train_inputs)
    train_targets    = torch.tensor(train_targets)
    train_masks      = torch.tensor(train_masks)

    train_data       = TensorDataset(train_inputs, train_masks, train_targets)
    train_sampler    = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return train_dataloader


In [5]:
import datetime
label_words = []
for trait in ['A', 'C','E','O','N']:
    print('Processing trait ', trait, '...')
    label_words = []
    with open('label_words/'+trait+'_words.txt', 'r') as f:
        pos = f.readline().split(',')
        neg = f.readline().split(',')
    label_words += pos
    label_words += neg
    
    label_words = set(label_words)
    
    refined_label_words = label_words - set(('think', 'about', 'really', 'work', 'mean', 'kind'))
    
    starttime = datetime.datetime.now()
    context_dict = {}
    label_word_dict = {}

    sent_with_label_word = []
    for sent in df_data[0]: 
        for word in refined_label_words: 
            if word in sent.split(' '):
                try:
                    label_word_dict[word] += 1
                except:
                    label_word_dict[word] = 1
                sent_with_label_word.append([sent, word])
                w_list = sent.split(' ')
                for w in w_list:
                    if not w == word:
                        try:
                            context_dict[w] += 1
                        except:
                            context_dict[w] = 1
    
    
    label_word_dict = {k: v for k, v in sorted(label_word_dict.items(), key=lambda item: abs(item[1]), reverse=True)}
    print('Number of matched label words: ', len(label_word_dict))
    print('Top 10 matched label words: ', list(label_word_dict.items())[:10])
    
    print('Number of sents containing label words: ', len(sent_with_label_word))
    print('Top 10 sents containing label words: ', sent_with_label_word[:10])
    
    
    
    df_data_ = pd.DataFrame([])
    df_data_['sent'] = [i[0] for i in sent_with_label_word]
    df_data_['label_word'] = [i[1] for i in sent_with_label_word]
    
    df_data_['span_1'] = df_data_.apply(lambda x: get_span(1, x), axis=1)
    df_data_['span_2'] = df_data_.apply(lambda x: get_span(2, x), axis=1)
    
    
    tokenizer = T5Tokenizer.from_pretrained("t5-large")
    model = T5ForConditionalGeneration.from_pretrained("t5-large")
    
    print('Construct Training Samples...')
    train_dataloader = construct_training(df_data_, tokenizer)
    
    print('Second phase Pre-training...')    
    
    
    num_epoch          = 20
    learning_rate      = 1e-4
    adam_epsilon       = 1e-8
    num_warmup_steps   = 0
    num_training_steps = len(train_dataloader)*num_epoch


    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr = learning_rate, eps = adam_epsilon, correct_bias = False)  
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  


    model.cuda()
    for _ in tnrange(1, num_epoch+1, desc='Epoch'):
        print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
        batch_loss = 0
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.cuda() for t in batch)
            b_input_ids, b_input_masks, b_labels = batch

            loss = model(input_ids=b_input_ids, attention_mask=b_input_masks, labels=b_labels).loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

            batch_loss += loss.item()

        print(batch_loss)
    
    model_path = 'Adapted_t5_large_'+trait+'/'
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    
    endtime = datetime.datetime.now()
    print((endtime - starttime))
    print()
    

Processing trait  A ...
Number of matched label words:  654
Top 10 matched label words:  [('great', 1553), ('happy', 1276), ('nice', 1207), ('able', 998), ('funny', 779), ('reason', 706), ('cold', 516), ('easy', 504), ('free', 501), ('ready', 463)]
Number of sents containing label words:  23979
Top 10 sents containing label words:  [["i'm nervous", 'nervous'], ['you look so young', 'young'], ['very nice touch', 'nice'], ["these'll go great in my new place", 'great'], ['based on serious stuff', 'serious'], ["i'd be able to be a stand-up guy and go the distance", 'able'], ["i don't mean to be disrespectful", 'disrespectful'], ['this is great', 'great'], ['very sexy', 'sexy'], ['joey gets her something really great', 'great']]


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Construct Training Samples...




Second phase Pre-training...


  for _ in tnrange(1, num_epoch+1, desc='Epoch'):


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

3286.6157561540604
2841.5701818466187
2586.7103914022446
2365.190559208393
2177.101964890957
2012.837809085846
1864.817135334015
1731.005232334137
1613.3676114678383
1503.6499709486961
1407.6912552118301
1315.3482387065887
1236.8762281537056
1163.870855331421
1101.2429710924625
1044.7848420739174
992.5419355928898
948.064116448164
911.4604422152042
875.5463183820248
1:48:44.164257

Processing trait  C ...
Number of matched label words:  646
Top 10 matched label words:  [('great', 1553), ('happy', 1276), ('nice', 1207), ('able', 998), ('funny', 779), ('reason', 706), ('cold', 516), ('easy', 504), ('free', 501), ('ready', 463)]
Number of sents containing label words:  23696
Top 10 sents containing label words:  [["i'm nervous", 'nervous'], ['you look so young', 'young'], ['very nice touch', 'nice'], ["these'll go great in my new place", 'great'], ['based on serious stuff', 'serious'], ["i'd be able to be a stand-up guy and go the distance", 'able'], ["i don't mean to be disrespectful", '

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Construct Training Samples...




Second phase Pre-training...


  for _ in tnrange(1, num_epoch+1, desc='Epoch'):


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

3252.3868144750595
2803.1521847248077
2549.990755200386
2333.9466296434402
2153.1524339318275
1990.0079635977745
1847.2088550329208
1717.2773668169975
1599.8275401592255
1490.571525633335
1390.704477250576
1308.6300648450851
1226.904787659645
1155.464589357376
1095.9962752461433
1037.0590411424637
986.0407022833824
943.413983643055
902.4983675181866
867.0297521352768
1:47:22.305013

Processing trait  E ...
Number of matched label words:  628
Top 10 matched label words:  [('great', 1553), ('happy', 1276), ('nice', 1207), ('able', 998), ('funny', 779), ('reason', 706), ('cold', 516), ('easy', 504), ('free', 501), ('ready', 463)]
Number of sents containing label words:  23375
Top 10 sents containing label words:  [["i'm nervous", 'nervous'], ['very nice touch', 'nice'], ["these'll go great in my new place", 'great'], ['based on serious stuff', 'serious'], ["i'd be able to be a stand-up guy and go the distance", 'able'], ['this is great', 'great'], ['very sexy', 'sexy'], ['joey gets her so

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Construct Training Samples...




Second phase Pre-training...


  for _ in tnrange(1, num_epoch+1, desc='Epoch'):


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

3194.416847229004
2757.4696955680847
2507.75208568573
2293.8839213848114
2113.081297814846
1953.9884468317032
1812.9346996545792
1685.0640798807144
1568.052683711052
1461.1664459109306
1366.8325026631355
1279.8108813762665
1204.1760628819466
1141.2586366534233
1082.0844423174858
1019.8563286960125
967.3558802604675
927.601091325283
887.2434900999069
854.3406876921654
1:45:52.556360

Processing trait  O ...
Number of matched label words:  635
Top 10 matched label words:  [('great', 1553), ('happy', 1276), ('nice', 1207), ('able', 998), ('funny', 779), ('reason', 706), ('cold', 516), ('easy', 504), ('free', 501), ('ready', 463)]
Number of sents containing label words:  23433
Top 10 sents containing label words:  [["i'm nervous", 'nervous'], ['you look so young', 'young'], ['very nice touch', 'nice'], ["these'll go great in my new place", 'great'], ['based on serious stuff', 'serious'], ["i'd be able to be a stand-up guy and go the distance", 'able'], ["i don't mean to be disrespectful", 

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Construct Training Samples...




Second phase Pre-training...


  for _ in tnrange(1, num_epoch+1, desc='Epoch'):


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

3213.8445814847946
2772.5325511693954
2521.254425048828
2305.2346140146255
2122.528953373432
1968.2328466176987
1823.2791233062744
1692.8604972958565
1578.5596642494202
1474.279801785946
1373.6594161391258
1290.5531959533691
1211.780412375927
1141.7948338389397
1077.1719652414322
1023.2682588994503
978.5466766059399
932.1611191928387
894.0443823933601
857.4327992796898
1:46:10.771707

Processing trait  N ...
Number of matched label words:  642
Top 10 matched label words:  [('great', 1553), ('happy', 1276), ('nice', 1207), ('able', 998), ('funny', 779), ('reason', 706), ('cold', 516), ('easy', 504), ('free', 501), ('ready', 463)]
Number of sents containing label words:  23369
Top 10 sents containing label words:  [["i'm nervous", 'nervous'], ['you look so young', 'young'], ['very nice touch', 'nice'], ["these'll go great in my new place", 'great'], ['based on serious stuff', 'serious'], ["i'd be able to be a stand-up guy and go the distance", 'able'], ["i don't mean to be disrespectful"

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Construct Training Samples...




Second phase Pre-training...


  for _ in tnrange(1, num_epoch+1, desc='Epoch'):


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

3186.9186861515045
2754.942104101181
2507.4792774915695
2293.4754366874695
2111.269564270973
1951.4770336151123
1807.9176329374313
1679.0132877230644
1563.3644071221352
1457.059626340866
1363.3138824105263
1279.610775232315
1199.4739170074463
1132.27404910326
1070.22325360775
1013.7126913070679
968.5972692370415
922.7060555815697
886.5315509438515
850.4257380366325
1:45:54.675341

