In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
import numpy as np
import re
gc.collect()

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)

# 預處理x
def preprocess_x(x):
    x = str(x) # 0.轉字串
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
    x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
    x = x.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') # 3.去除換行符號
    x = x.replace('x000D','') # 4.移除'x000D'
    x = ' ' + str.strip(x) + ' ' # 5.移除左右空白 在左右各加一格空白
    return x

def model_predict(nlp,df):
    table = pd.DataFrame()
    for i in tqdm(df.index):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        QA_input = {
            'question': 'What is the bank name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'predict:':predict},index=[i])
        table = table.append(row)
    return table

尾綴列表 = np.load('..\data\寶典\銀行尾綴.npy')
split_keywords = np.load('..\data\寶典\銀行關鍵字.npy')
銀行列表 = np.load('..\data\寶典\銀行寶典.npy')

# 人工規則找銀行
def get_bank(text,split_keywords,尾綴列表):
    text = preprocess_x(text)
    def 根據尾綴找答案(text,尾綴):
        if 尾綴 in text:
            result = preprocess_x(text[:text.find(尾綴)+len(尾綴)]).strip()
            if (尾綴 in result) and (len(result) > len(尾綴)):
                return result
            else:
                return 'not find'
        else:
            return 'not find'
    for keyword in split_keywords:
        if keyword in text:
            result_list = []
            for 尾綴 in 尾綴列表:
                result = 根據尾綴找答案(text.split(keyword)[1],尾綴)
                if result != 'not find':
                    return result
                else:
                    return 'not find'
        else:
            return 'not find'

# 根據寶典找銀行
def get_bank_寶典(x,寶典):
    for p in 寶典:
        if p in x:
            return p
    return 'not find'

def get_acc(df,t=0.75):
    correct = []
    correct_label = []
    for i in df.index:
        jac = get_jaccard_sim(df.loc[i,'Y_label'],df.loc[i,'predict'])
        if jac >= t:
            correct.append('yes')
        else:
            correct.append('no')
    result = pd.Series(correct)
    return result.value_counts()['yes']/len(result)

def get_jac(df):
    all_jacs = []
    for i in df.index:
        all_jacs.append(get_jaccard_sim(str(df.loc[i,'Y_label']),str(df.loc[i,'predict'])))
    return np.sum(all_jacs)/len(all_jacs)

Global seed set to 42


# LOAD DATA

In [2]:
df = pd.read_csv('../data/preprocess_for_SQUAD_銀行.csv',index_col=0)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)
display(val_df)
display(val_df['Y_label'].apply(lambda x:x.split(' ')[-1]).value_counts().head(10))

(2984, 10)
(747, 10)


Unnamed: 0,46A,47A,78,LCBK,string_X_train,Y_label,string_Y_1,string_Y_2,predict,correct
7209,+SIGNED COMMERCIAL INVOICE IN 3 COPIES\r\n+FUL...,+DISCOUNT CHARGES ARE FOR SELLER'S ACCOUNT\r\n...,ALL DOCUMENTS MUST BE FORWARDED TO KEB HANA B...,KEB HANA BANK,SIGNED COMMERCIAL INVOICE IN 3 COPIES FULL S...,KEB HANA BANK,115,128,KEB HANA BANK,yes
19459,1.SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,1.T/T REIMBURSEMENTS ARE NOT ALLOWED.\r\n2.DRA...,1.THE AMOUNT OF EACH PRESENTATION UNDER THIS C...,CHINA CITIC BANK,1SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,CHINA CITIC BANK,1949,1965,CHINA CITIC BANK,yes
7585,+SIGNED COMMERCIAL INVOICE IN TRIPLICATE\r\n+P...,DOCUMENTS MUST BE PRESENTED PRIOR TO CREDIT EX...,T/T REIMBURSEMENT ALLOWED\r\nTHE DRAFTS MUST B...,SHINHAN BANK,SIGNED COMMERCIAL INVOICE IN TRIPLICATE PACK...,SHINHAN BANK,146,158,SHINHAN BANK,yes
15330,+SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS.+INS...,+ALL DOCUMENTS MUST BE ISSUED IN ENGLISH LANGU...,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD.,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALSINSUR...,MUFG BANK LTD,734,747,IE MUFG BANK LTD,no
1644,+SIGNED COMMERCIAL INVOICE IN 3 COPIES INDICAT...,+THIRD PARTY DOCUMENTS EXCEPT INVOICE AND DRAF...,+TO NEGOTIATING BANK ONLY: PLEASE FORWARD ALL ...,"CTBC BANK CO., LTD.",SIGNED COMMERCIAL INVOICE IN 3 COPIES INDICAT...,CTBC BANK CO LTD,2151,2167,CTBC BANK CO LTD,yes
...,...,...,...,...,...,...,...,...,...,...
5404,+SIGNED COMMERCIAL INVOICE IN 3 COPIES\r\n+FUL...,+APPLICANT'S CONTACT DETAILS (IN 50 FIELD): TE...,+ALL DOCUMENTS SHOULD BE FORWARDED TO NONGHYUP...,NONGHYUP BANK,SIGNED COMMERCIAL INVOICE IN 3 COPIES FULL S...,NONGHYUP BANK,116,129,NONGHYUP BANK,yes
15244,1.SIGNED COMMERCIAL INVOICES IN THREE ORIGINAL...,1.INVOICE IN EXCESS OF DRAFT AMOUNTARE NOT ACC...,1.ALL NEGOTIATIONS UNDER THIS CREDIT MUST BEEN...,BANK OF BARODA,1SIGNED COMMERCIAL INVOICES IN THREE ORIGINAL...,BANK OF BARODA,217,231,BANK OF BARODA,yes
15834,+ SIGNED COMMERCIAL INVOICE IN TRIPLICATE+ FUL...,+A DISCREPANCY FEE OF USD80.00(OR EQUIVALENT) ...,+ALL DOCUMENTS MUST BE FORWARDED DIRECTLY TO O...,"INDUSTRIAL BANK OF KOREA, SEOUL",SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL ...,INDUSTRIAL BANK OF KOREA SEOUL,875,905,INDUSTRIAL BANK OF KOREA SEOUL,yes
934,+SIGNED COMMERCIAL INVOICE IN TRIPLICATE.\r\n+...,+APPLICANT'S ADDRESS:NO.M-05B YINGFU INDUSTRIA...,+THE NEGOTIATING BANK IS TO FORWARD DIRECT TO:...,OCBC WING HANG BANK (CHINA)\nLIMITED,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL...,OCBC WING HANG BANK CHINA LIMITED,144,177,OCBC WING HANG BANK CHINA LIMITED,no


BANK       263
LTD        124
LIMITED     70
BHD         36
NA          30
BERHAD      23
SEOUL       22
CHINA       21
BRANCH      13
BM          13
Name: Y_label, dtype: int64

# Load Model

In [3]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load('../models/Product_Data_SQuAD_model_銀行.pt'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
gc.collect()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

0

# 先規則

In [4]:
result = pd.DataFrame()
result['string_X_train'] = val_df['string_X_train']
result['Y_label'] = val_df['Y_label']
result['predict'] = [ get_bank_寶典(i,銀行列表) for i in val_df['string_X_train'].values]
result.loc[result['predict'] != 'not find','class'] = 'rule'
result.loc[result['predict'] != 'not find',:]

Unnamed: 0,string_X_train,Y_label,predict,class
7209,SIGNED COMMERCIAL INVOICE IN 3 COPIES FULL S...,KEB HANA BANK,KEB HANA BANK,rule
19459,1SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,CHINA CITIC BANK,CHINA CITIC BANK,rule
7585,SIGNED COMMERCIAL INVOICE IN TRIPLICATE PACK...,SHINHAN BANK,SHINHAN BANK,rule
15330,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALSINSUR...,MUFG BANK LTD,IE MUFG BANK LTD,rule
1644,SIGNED COMMERCIAL INVOICE IN 3 COPIES INDICAT...,CTBC BANK CO LTD,CTBC BANK CO LTD,rule
...,...,...,...,...
5404,SIGNED COMMERCIAL INVOICE IN 3 COPIES FULL S...,NONGHYUP BANK,NONGHYUP BANK,rule
15244,1SIGNED COMMERCIAL INVOICES IN THREE ORIGINAL...,BANK OF BARODA,BANK OF BARODA,rule
15834,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL ...,INDUSTRIAL BANK OF KOREA SEOUL,INDUSTRIAL BANK OF KOREA SEOUL,rule
934,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL...,OCBC WING HANG BANK CHINA LIMITED,OCBC WING HANG BANK CHINA LIMITED,rule


# 後接bert

In [5]:
not_find_df = result.loc[result['predict']=='not find',:]
display(not_find_df)

Unnamed: 0,string_X_train,Y_label,predict,class


In [6]:
bert_predict = model_predict(nlp,not_find_df)
bert_predict

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [7]:
try:
    result.loc[bert_predict.index,'predict'] = bert_predict['predict:']
    result.loc[bert_predict.index,'class'] = 'bert'
    result
except:
    pass

In [8]:
try:
    display(result[result['Y_label']==result['predict']].sample(10).append(result[result['Y_label']!=result['predict']].sample(10)))
except:
    display(result.sample(10))

Unnamed: 0,string_X_train,Y_label,predict,class
5780,SIGNED COMMERCIAL INVOICE IN 3 COPIES FULL S...,KOREA DEVELOPMENT BANK,KOREA DEVELOPMENT BANK,rule
927,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FUL...,INDUSTRIAL BANK OF KOREA SEOUL,INDUSTRIAL BANK OF KOREA SEOUL,rule
5421,1 SIGNED COMMERCIAL INVOICE 6 ORIGINALS 2 PA...,RAIFFEISENVERBAND SALZBURG,RAIFFEISENVERBAND SALZBURG,rule
6437,DOCUMENTS IN TRIPLICATE UNLESS OTHERWISE STIP...,UNITED OVERSEAS BANK MALAYSIA BHD,UNITED OVERSEAS BANK MALAYSIA BHD,rule
5976,SIGNED COMMERCIAL INVOICE IN TRIPLICATE PACK...,SHINHAN BANK,SHINHAN BANK,rule
2649,1 BENEFICIARYS SIGNED ORIGINAL COMMERCIAL INV...,HABIB METROPOLITAN BANK LTD,HABIB METROPOLITAN BANK LTD,rule
20588,1 SIGNED COMMERICAL INVOICE IN THREE ORIGINAL...,MUFG BANK LTD,MUFG BANK LTD,rule
6229,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,BANK CENTRAL ASIA,BANK CENTRAL ASIA,rule
4300,1HANDSIGNED AND DATED COMMERCIAL INVOICE IN T...,CREDITO EMILIANO SPA,CREDITO EMILIANO SPA,rule
7955,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALS AND ...,FIRST COMMERCIAL BANK,FIRST COMMERCIAL BANK,rule


In [9]:
get_acc(result,1),get_acc(result,0.75),get_jac(result)

(0.8393574297188755, 0.8848728246318608, 0.9315441660823186)