In [1]:
import torch
from pytorch_pretrained_bert.modeling import Bert_CRF, BertForTokenClassification
from pytorch_pretrained_bert.tokenization import BertTokenizer

## load model

In [2]:
class TEMP:
    pass
args = TEMP()
args.output_dir = './saved_models/qq/'
args.do_lower_case = True
bio_num_labels = 3
type_num_labels = 3
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
model = Bert_CRF.from_pretrained(args.output_dir, bio_num_labels=bio_num_labels, type_num_labels=type_num_labels)
model.to(args.device)

Bert_CRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(

## prepare data

In [4]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        raise ValueError('the length should larger than all max {0} now {1}'.format(max_length, total_length))
        tokens_b.pop()

In [53]:
def convert_examples_to_features(examples, tokenizer,
                  max_seq_length=128,
                  label_list=None, output_mode=None,
                  pad_on_left=False,
                  pad_token=0,
                  pad_token_segment_id=0,
                  mask_padding_with_zero=True):
    """
    Loads a data file into a list of `InputBatch`s
    """
    
    features = []
    print('length:', len(examples))
    for (ex_index, example) in enumerate(examples):
        if ex_index % 100 == 0:
            print("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example['context'])
        #print(labels)

        tokens_b = None
        if(example['topic'] is not None):
            tokens_b = tokenizer.tokenize(example['topic'])
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_mask_a = [1] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        input_mask_a += [0] * (max_seq_length - len(input_mask_a))

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        recover = []
        now, index = 0, 0
        example['context'] = example['context'].split()
        for a in tokens_a:
            if(a[:2]=='##'):
                a = a[2:]
            recover.append(now)
            now += 1
            example['context'][index] = example['context'][index][len(a):]
            if(len(example['context'][index])==0):
                index += 1
                now = 0
                
        recover = [1] + recover
        recover.extend([2]*(max_seq_length-len(recover)))

        # prepare label for type classification
        # since there will be extending words, we need to check the excat
        #######################################################################
        if ex_index < 5:
            print("*** Example ***")
            print(len(tokens_a), tokens_a)
            print(len(example['context']), example['context'])
            print("guid: %s" % (example['uid']))
            print("tokens: %s" % " ".join([str(x) for x in tokens]))
            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            print("recover: %s" % " ".join([str(x) for x in recover]))
            print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            print("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

        # ['input_ids', 'attention_mask', 'crf_mask', 'segment_ids', 'label_id', 'recover']
        features.append({
            'input_ids':input_ids,
            'attention_mask':input_mask,
            'crf_mask':input_mask_a,
            'segment_ids':segment_ids,
            'recover':recover
            })
    return features

## predict function

In [6]:
def pred(features):
    total_pred = {}
    for key in features:
        total_pred[key] = []
        for i, f_data in enumerate(features[key]):

            tmp_data = {}
            for data_key in f_data:
                tmp_data[data_key] = torch.tensor(f_data[data_key]).cuda().unsqueeze(0)

            with torch.no_grad():
                pred = model(tmp_data['input_ids'], token_type_ids=tmp_data['segment_ids'], attention_mask=tmp_data['attention_mask'], 
                             crf_mask=tmp_data['crf_mask'])
            total_pred[key].append(pred)
    return total_pred

In [7]:
# recover to origin function
def recover(pad_pred, re):
    pred = []
    for p, r in zip(pad_pred, re):
        if(r==0):
            pred.append(p)
    return pred

## origin data

In [10]:
import json
total = {'data':{}, 'features':{}}

In [11]:
for key in ['train', 'eval']:
    total['data'][key] = []
    with open('./../preprocess/parsing/ArgumentEssays_'+key) as f:
        label_list = f.readline()
        for line in f:
            temp = json.loads(line)
            for index, (bio, context, label) in enumerate(zip(temp['bio'],temp['context'], temp['label'])):
                total['data'][key].append(
                    {
                        'uid':'{0}_{1}'.format(key, index),
                        'topic':temp['topic'],
                        'context':context,
                        'bio':bio,
                        'type':label
                    }.copy()
                )

In [12]:
for key in ['train', 'eval']:
    total['features'][key] = convert_examples_to_features(total['data'][key].copy(), tokenizer,
                    max_seq_length=220,
                    label_list=None, output_mode=None,
                    pad_on_left=False,
                    pad_token=0,
                    pad_token_segment_id=0,
                    mask_padding_with_zero=True)

length: 1443
Writing example 0 of 1443
*** Example ***
96 ['it', 'is', 'always', 'said', 'that', 'competition', 'can', 'effectively', 'promote', 'the', 'development', 'of', 'economy', '.', 'in', 'order', 'to', 'survive', 'in', 'the', 'competition', ',', 'companies', 'continue', 'to', 'improve', 'their', 'products', 'and', 'service', ',', 'and', 'as', 'a', 'result', ',', 'the', 'whole', 'society', 'pro', '##sper', '##s', '.', 'however', ',', 'when', 'we', 'discuss', 'the', 'issue', 'of', 'competition', 'or', 'cooperation', ',', 'what', 'we', 'are', 'concerned', 'about', 'is', 'not', 'the', 'whole', 'society', ',', 'but', 'the', 'development', 'of', 'an', 'individuals', 'whole', 'life', '.', 'from', 'this', 'point', 'of', 'view', ',', 'i', 'firmly', 'believe', 'that', 'we', 'should', 'attach', 'more', 'importance', 'to', 'cooperation', 'during', 'primary', 'education', '.']
94 ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',

Writing example 300 of 1443
Writing example 400 of 1443
Writing example 500 of 1443
Writing example 600 of 1443
Writing example 700 of 1443
Writing example 800 of 1443
Writing example 900 of 1443
Writing example 1000 of 1443
Writing example 1100 of 1443
Writing example 1200 of 1443
Writing example 1300 of 1443
Writing example 1400 of 1443
length: 390
Writing example 0 of 390
*** Example ***
98 ['technology', 'accelerate', '##s', 'humans', 'evolving', 'pace', '.', 'with', 'advanced', 'technology', ',', 'many', 'things', 'that', 'seemed', 'impossible', 'in', 'the', 'past', ',', 'have', 'become', 'realities', '.', 'for', 'example', ',', 'people', 'in', 'the', 'past', 'never', 'dreamed', 'of', 'talking', 'to', 'anyone', 'whenever', 'they', 'wanted', 'or', 'see', 'someone', 'overseas', 'on', 'a', 'computer', 'screen', ',', 'both', 'of', 'these', 'can', 'be', 'achieved', 'with', 'cell', '##phone', 'and', 'internet', '.', 'however', ',', 'some', 'people', 'point', 'that', 'technology', 'has',

Writing example 200 of 390
Writing example 300 of 390


In [13]:
total['pred'] = pred(total['features'])

In [14]:
for key in total['features']:    
    for index in range(len(total['pred'][key])):
        total['pred'][key][index]['bio'] = recover(total['pred'][key][index]['bio']['output'][0], total['features'][key][index]['recover'])

In [15]:
for key in total['features']:    
    print(total['data'][key][0]['context'])
    for _ in total['pred'][key][0]['bio']:
        print(_, end=' ')
    print('\n','-'*10)
    for _ in total['data'][key][0]['bio'].split():
        print(_, end=' ')
    print()
    print()

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 1 1 1 1 1 1 1 1 2 
 ----------
O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B I I I I I I I I I O 

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',

## check without marker
### here we want to check how will the performance decay without marker

In [16]:
import json
part = {'data':{}, 'features':{}}

In [54]:
for key in ['train', 'eval']:
    part['data'][key] = []
    with open('./../preprocess/parsing/ArgumentEssays_'+key) as f:
        label_list = f.readline()
        for line in f:
            temp = json.loads(line)
            for index, (bio, context) in enumerate(zip(temp['bio'],temp['context'])):
                tmp_bio, tmp_context = [], []
                for b, c in zip(bio.split(), context.split()):
                    if(b=='O' and len(tmp_bio)>0):
                        # print(tmp_context)
                        # print(tmp_bio)
                        part['data'][key].append({
                                'uid':'{0}_{1}'.format(key, index),
                                'topic':temp['topic'],
                                'context':' '.join(tmp_context),
                                'bio':' '.join(tmp_bio)
                            }.copy())
                        tmp_bio, tmp_context = [], []
                    elif(b=='I' or b=='B'):
                        tmp_bio.append(b)
                        tmp_context.append(c)

In [55]:
for key in ['train', 'eval']:
    part['features'][key] = convert_examples_to_features(part['data'][key].copy(), tokenizer,
                max_seq_length=128,
                label_list=None, output_mode=None,
                pad_on_left=False,
                pad_token=0,
                pad_token_segment_id=0,
                mask_padding_with_zero=True)

length: 4748
Writing example 0 of 4748
*** Example ***
10 ['we', 'should', 'attach', 'more', 'importance', 'to', 'cooperation', 'during', 'primary', 'education']
10 ['', '', '', '', '', '', '', '', '', '']
guid: train_0
tokens: [CLS] we should attach more importance to cooperation during primary education [SEP]
input_ids: 101 2057 2323 22476 2062 5197 2000 6792 2076 3078 2495 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
recover: 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Writing example 1700 of 4748
Writing example 1800 of 4748
Writing example 1900 of 4748
Writing example 2000 of 4748
Writing example 2100 of 4748
Writing example 2200 of 4748
Writing example 2300 of 4748
Writing example 2400 of 4748
Writing example 2500 of 4748
Writing example 2600 of 4748
Writing example 2700 of 4748
Writing example 2800 of 4748
Writing example 2900 of 4748
Writing example 3000 of 4748
Writing example 3100 of 4748
Writing example 3200 of 4748
Writing example 3300 of 4748
Writing example 3400 of 4748
Writing example 3500 of 4748
Writing example 3600 of 4748
Writing example 3700 of 4748
Writing example 3800 of 4748
Writing example 3900 of 4748
Writing example 4000 of 4748
Writing example 4100 of 4748
Writing example 4200 of 4748
Writing example 4300 of 4748
Writing example 4400 of 4748
Writing example 4500 of 4748
Writing example 4600 of 4748
Writing example 4700 of 4748
length: 1329
Writing example 0 of 1329
*** Example ***
8 ['technology', 'may', 'have', 'some', 'negat

In [56]:
part['pred'] = pred(part['features'])

In [57]:
for key in total['features']:    
    for index in range(len(part['pred'][key])):
        part['pred'][key][index]['bio'] = recover(part['pred'][key][index]['bio']['output'][0], part['features'][key][index]['recover'])

In [58]:
for key in total['features']:    
    print(part['data'][key][0]['context'])
    for _ in part['pred'][key][0]['bio']:
        print(_, end=' ')
    print('\n','-'*10)
    for _ in part['data'][key][0]['bio'].split():
        print(_, end=' ')
    print()
    print()

['', '', '', '', '', '', '', '', '', '']
0 1 1 1 1 1 1 1 1 1 
 ----------
B I I I I I I I I I 

['', '', '', '', '', '', '', '']
0 1 1 1 1 1 1 1 
 ----------
B I I I I I I I 



In [60]:
## performance function
from sklearn.metrics import f1_score
def check(pred, label, check=False):
    result = {}
    for key in pred:
        temp = [[], []]
        for p, l in zip(pred[key], label[key]):
            for p_b, l_b in zip(p['bio'], l['bio'].split()):
                if(l_b == 'B'):
                    temp[0].append(p_b)
                    temp[1].append(0)
                elif(l_b == 'I'):
                    temp[0].append(p_b)
                    temp[1].append(1)
                elif(l_b == 'O' and check):
                    temp[0].append(p_b)
                    temp[1].append(2)
        result[key] = f1_score(y_true=temp[1], y_pred=temp[0] , average='macro')
    return result

In [48]:
check(total['pred'], total['data'], True)

{'train': array([0.89009662, 0.96826568, 0.93176122]),
 'eval': array([0.82902655, 0.9375289 , 0.85734311])}

In [62]:
check(total['pred'], total['data'], False)

{'train': 0.6364547177183598, 'eval': 0.6189040459971215}

In [50]:
check(part['pred'], part['data'])

{'train': array([0.77743902, 0.81653188, 0.        ]),
 'eval': array([0.72922465, 0.76705113, 0.        ])}

In [61]:
check(part['pred'], part['data'])

{'train': 0.5386032263228308, 'eval': 0.5072624362307513}

## check dissent

In [65]:
import pandas as pd
dissent = {'data':{}, 'features':{}}

In [63]:
part['data']['train'][0].keys()

dict_keys(['uid', 'topic', 'context', 'bio'])

In [71]:
for key in ['train', 'eval']:
    dissent['data'][key] = []
    with open('./../dissent_bert/data/discourse_'+key) as f:
        for index, (sent1, sent2) in enumerate(zip(temp['0'].tolist()[:128], temp['1'].tolist()[:128])):
            dissent['data'][key].append({
                'uid':'{0}_{1}'.format(key, index*2), 
                'topic':None, 
                'context':sent1
            })
            dissent['data'][key].append({
                'uid':'{0}_{1}'.format(key, index*2+1), 
                'topic':None, 
                'context':sent2
            })
    
    

In [74]:
for key in ['train', 'eval']:
    dissent['features'][key] = convert_examples_to_features(dissent['data'][key], tokenizer,
                    max_seq_length=128,
                    label_list=None, output_mode=None,
                    pad_on_left=False,
                    pad_token=0,
                    pad_token_segment_id=0,
                    mask_padding_with_zero=True)

length: 256
Writing example 0 of 256
*** Example ***
19 ['neither', 'was', 'he', ',', 'their', 'lips', 'crushing', 'together', ',', 'their', 'tongues', 'not', 'so', 'much', 'meeting', 'as', 'duel', '##ing', '.']
18 ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
guid: train_0
tokens: [CLS] neither was he , their lips crushing together , their tongues not so much meeting as duel ##ing . [SEP]
input_ids: 101 4445 2001 2002 1010 2037 2970 14527 2362 1010 2037 19677 2025 2061 2172 3116 2004 14216 2075 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
recover: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 

In [75]:
dissent['pred'] = pred(dissent['features'])

In [78]:
for key in dissent['features']:    
    for index in range(len(dissent['pred'][key])):
        dissent['pred'][key][index]['bio'] = recover(dissent['pred'][key][index]['bio']['output'][0], dissent['features'][key][index]['recover'])

In [80]:
for index in range(16):
    for _ in dissent['pred'][key][index]['bio']:
        print(_, end=' ')
    print()
    if(index%2 == 1):
        print()

0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 
2 2 2 2 2 2 2 2 

2 2 0 1 1 1 1 1 1 1 1 1 2 
2 2 2 2 2 2 2 2 2 2 

0 1 1 1 2 
0 1 1 1 1 1 2 

2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 

2 2 2 2 2 2 2 
2 2 2 2 2 

0 1 1 1 1 1 2 
0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 

0 1 1 1 1 1 1 1 2 
2 2 2 2 2 2 2 2 2 2 2 

0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
0 1 1 1 1 2 

