In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
gc.collect()

  '"sox" backend is being deprecated. '


22

# Set the random seed (42) for reproducibility

In [2]:
def set_seed(seed: int = 42):
    '''Seed every random number generator used in the notebook.

    This is for REPRODUCIBILITY: Python's ``random``, NumPy, PyTorch (CPU and
    CUDA) and pytorch-lightning are all seeded with the same value, and cuDNN
    is switched to deterministic mode.

    Parameters
    ----------
    seed : int, default 42
        Seed value. (The previous default was the ``int`` *type* itself, which
        made a bare ``set_seed()`` call fail inside ``np.random.seed``.)

    Returns
    -------
    numpy.random.RandomState
        A dedicated generator seeded with ``seed``.
    '''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN auto-tuning speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: hash randomisation is fixed at interpreter start-up, so this only
    # affects subprocesses spawned after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
# Seed all RNGs once, before the data is split and the model is created;
# the returned RandomState is kept for any code that wants its own generator.
random_state = set_seed(42)

Global seed set to 42


# LOAD DATA

In [3]:
# Read the pre-processed SQuAD-style table; the first CSV column becomes the index.
# The displayed columns are: string_X_train (context text), Y_label (answer text),
# string_Y_1 / string_Y_2 (character offsets used to slice the answer out of the context).
df = pd.read_csv('preprocess_for_SQUAD_銀行.csv',index_col=0)
df

Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
16,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDIC...,CTBC BANK CO LTD,1860,1876
22,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALINDICAT...,CTBC BANK CO LTD,4924,4940
23,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALINDICAT...,CTBC BANK CO LTD,4924,4940
30,1SIGNED COMMERCIAL INVOICE IN DUPLICATE SHOWIN...,CHANG HWA COMMERCIAL BANK LTD,5345,5374
41,1FULL SET OF ORIGINALS AND 2 NON-NEGOTIABLE CO...,STANDARD CHARTERED BANK,1596,1619
...,...,...,...,...
8264,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDIC...,CTBC BANK CO LTD,2379,2395
8266,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDIC...,CTBC BANK CO LTD,2379,2395
8274,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDIC...,CTBC BANK CO LTD,2372,2388
8275,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDIC...,CTBC BANK CO LTD,2372,2388


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Report the (rows, columns) of each split, one per line.
print(train_df.shape, val_df.shape, sep='\n')

(1160, 4)
(291, 4)


# Model and Tokenizer Initialization

In [5]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

# Tokenizer and QA model are both initialised from the same pretrained
# checkpoint so that their vocabularies match.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)
model = DistilBertForQuestionAnswering.from_pretrained(PRETRAINED_CHECKPOINT)

# add new_tokens
#new_tokens = df['Y_label'].values.tolist()
#num_added_toks = tokenizer.add_tokens(new_tokens)
# resize_token_embeddings
#model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

# Data preprocessing

In [6]:
def preprocessing(df):
    """Turn the labelled frame into parallel SQuAD-style lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``string_X_train`` (context),
        ``Y_label`` (answer text), ``string_Y_1`` (answer start offset) and
        ``string_Y_2`` (answer end offset).

    Returns
    -------
    contexts : list
        One context string per row.
    questions : list of str
        The same fixed question repeated once per row.
    answers : list of dict
        One ``{'text', 'answer_start', 'answer_end'}`` dict per row.
    """
    contexts = df['string_X_train'].values.tolist()
    questions = ['What is the bank name?'] * len(df)
    # Iterate the columns positionally instead of doing df.loc[idx, col] per
    # row: a label lookup returns a Series (not a scalar) when index labels
    # repeat, and .tolist() also yields plain Python scalars.
    answers = [
        {'text': text, 'answer_start': start, 'answer_end': end}
        for text, start, end in zip(
            df['Y_label'].tolist(),
            df['string_Y_1'].tolist(),
            df['string_Y_2'].tolist(),
        )
    ]
    return contexts ,questions ,answers

def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings``.

    For every answer the character offsets ``answer_start`` and
    ``answer_end - 1`` are mapped to token indices with
    ``encodings.char_to_token``. When no token is found (the original note
    attributes this to the answer passage having been truncated) the position
    falls back to ``tokenizer.model_max_length``.

    ``encodings`` is updated in place with ``start_positions`` and
    ``end_positions`` and is also returned.
    """

    def _token_index(batch_idx, char_idx):
        # Map one character offset to its token index, with the fallback.
        token_idx = encodings.char_to_token(batch_idx, char_idx)
        return tokenizer.model_max_length if token_idx is None else token_idx

    start_positions, end_positions = [], []
    for batch_idx, answer in enumerate(answers):
        start_positions.append(_token_index(batch_idx, answer['answer_start']))
        # answer_end is exclusive, so the last answer character is end - 1.
        end_positions.append(_token_index(batch_idx, answer['answer_end'] - 1))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long, per-example sequences.

    ``encodings`` maps field names (e.g. ``input_ids``, ``start_positions``)
    to per-example values. It must contain an ``input_ids`` entry, which
    defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        # Key access (not attribute access) matches __getitem__ and lets plain
        # dicts be used as encodings as well as tokenizer outputs.
        return len(self.encodings['input_ids'])

def df2DataLoader(df):
    """Build a ``SquadDataset`` from a labelled frame.

    Despite the name, the return value is a dataset, not a ``DataLoader``.
    Steps: split the frame into contexts/questions/answers, tokenise the
    (context, question) pairs with truncation and padding, then attach the
    token-level answer positions.
    """
    contexts, questions, answers = preprocessing(df)
    # The context is passed as the first sequence and the question as the second.
    # NOTE(review): confirm the inference pipeline tokenises the pair in the same order.
    tokenized = tokenizer(contexts, questions, truncation=True, padding=True)
    return SquadDataset(add_token_positions(tokenized, answers))

In [7]:
# Encode both splits the same way (train first, then validation).
train_dataset, val_dataset = map(df2DataLoader, (train_df, val_df))

In [8]:
# Checkpoints/logs go to "test-trainer"; evaluate on the validation set after
# every epoch; train for three epochs. All other settings keep their defaults.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

# Wire the model, the arguments and both datasets into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

trainer.train()

***** Running training *****
  Num examples = 1160
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 111


Epoch,Training Loss,Validation Loss
1,No log,0.506757
2,No log,0.233445
3,No log,0.23105


***** Running Evaluation *****
  Num examples = 291
  Batch size = 32
***** Running Evaluation *****
  Num examples = 291
  Batch size = 32
***** Running Evaluation *****
  Num examples = 291
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=111, training_loss=0.9177645434130419, metrics={'train_runtime': 48.2854, 'train_samples_per_second': 72.071, 'train_steps_per_second': 2.299, 'total_flos': 454672788111360.0, 'train_loss': 0.9177645434130419, 'epoch': 3.0})

# Save the fine-tuned model weights

In [9]:
# Persist only the fine-tuned weights (the model's state_dict), not the Trainer object.
torch.save(trainer.model.state_dict(),'Product_Data_SQuAD_model_bank.pt')

# Reload the saved weights and build the question-answering pipeline

In [10]:
# Rebuild the architecture from the base checkpoint, then overwrite its
# weights with the fine-tuned state_dict saved above.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# map_location='cpu' lets weights saved from a GPU run be restored on a
# machine without CUDA; the pipeline below runs on CPU anyway.
model.load_state_dict(torch.load('Product_Data_SQuAD_model_bank.pt', map_location='cpu'))
# Inference mode: disables dropout.
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.9.1",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9a

In [11]:
def test_model(nlp,df,test_n=30):
    table = pd.DataFrame()
    idx_list = df.sample(test_n).index.tolist()
    for i in tqdm(idx_list):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        string_Y_1 = sample['string_Y_1'].values[0]
        string_Y_2 = sample['string_Y_2'].values[0]
        QA_input = {
            'question': 'What is the bank name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        predict = predict.split('x000D')[0]
        row = pd.DataFrame({
            'label':string_X_train[string_Y_1:string_Y_2],
            'predict:':predict},index=[i])
        if string_X_train[string_Y_1:string_Y_2] == predict:
            row['是否全對'] = 'Yes'
        else:
            row['是否全對'] = 'No'
        table = table.append(row)
        
        # make persudo label by nlp output 
        persudo_val_df = df.copy()
        persudo_val_df.loc[i]['string_Y_1'] = res['start']
        persudo_val_df.loc[i]['string_Y_2'] = res['end']
        persudo_val_df.loc[i]['Y_label'] = predict
    
    return table,persudo_val_df

In [12]:
# Spot-check the reloaded pipeline on a random sample of the validation split
# (test_model's default test_n=30); the second return value is discarded here.
all_res,_ = test_model(nlp,val_df)
all_res

  0%|          | 0/30 [00:00<?, ?it/s]

Unnamed: 0,label,predict:,是否全對
6275,MIZUHO BANK LTD,MIZUHO BANK LTD,Yes
1647,CTBC BANK CO LTD,CTBC BANK,No
4820,CITIBANK NA,OF,No
6579,BANGKOK BANK PUBLIC COMPANY LIMITED,BANGKOK BANK PUBLIC COMPANY LIMITED,Yes
7828,TAICHUNG COMMERCIAL BANK,TAICHUNG COMMERCIAL BANK,Yes
6190,BANK SINOPACOBU,BANK SINOPACOBU,Yes
6679,THE KOREA DEVELOPMENT BANK,KOREA DEVELOPMENT BANK,No
1291,CHINA EVERBRIGHT BANK HANGZHOU BRANCH,SERVICE,No
7547,SUMITOMO MITSUI BANKING CORPORATION,SUMITOMO MITSUI BANKING CORPORATIONBANGKOK,No
3394,BANQUE SAUDI FRANSI,BANQUE SAUDI FRANSI,Yes


In [13]:
def get_jaccard_sim(str1, str2):
    """Jaccard similarity between the whitespace-separated token sets of two strings.

    Returns ``len(A & B) / len(A | B)`` as a float in ``[0, 1]``. When neither
    string contains a token (both empty or whitespace-only) the sets are
    identical, so ``1.0`` is returned instead of dividing by zero.
    """
    tokens_a = set(str1.split())
    tokens_b = set(str2.split())
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        return 1.0
    return len(tokens_a & tokens_b) / union_size


def get_acc(df,t=1):
    """Fraction of rows whose best prediction reaches Jaccard similarity ``t``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``label`` and ``predict:``. As before, a ``predict``
        column is added to ``df`` in place, holding each prediction wrapped in
        a one-element candidate list.
    t : float, default 1
        Minimum Jaccard similarity (against ``label``) for a row to count as
        correct; ``1`` means the token sets must be identical.

    Returns
    -------
    float
        Share of correct rows in ``[0, 1]``; ``0.0`` for an empty frame.
    """
    df['predict'] = [[i] for i in df['predict:']]
    if len(df) == 0:
        return 0.0
    n_correct = 0
    # Iterate positionally so repeated index labels cannot break the lookup.
    for label, candidates in zip(df['label'], df['predict']):
        if max(get_jaccard_sim(label, cand) for cand in candidates) >= t:
            n_correct += 1
    # Counting directly also covers the "no row is correct" case, which used
    # to raise KeyError on value_counts()['yes'].
    return n_correct / len(df)

# Strict accuracy: predicted token set must equal the label's token set.
acc = get_acc(all_res, t=1)
print('acc:', acc)

# Lenient accuracy: a Jaccard similarity of at least 0.75 counts as correct.
acc_放水 = get_acc(all_res, t=0.75)
print('acc 放水:', acc_放水)

# Mean label-vs-prediction Jaccard similarity over the sampled rows.
jaccard_avg_score = np.mean([
    get_jaccard_sim(all_res.loc[row, 'label'], all_res.loc[row, 'predict:'])
    for row in all_res.index
])
print('jaccard_avg_score:', jaccard_avg_score)

acc: 0.6
acc 放水: 0.6333333333333333
jaccard_avg_score: 0.7427777777777778
