In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
import numpy as np
import re
gc.collect()

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)

def preprocess(x):
    x = str(x)
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
    x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
    x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 3.去除換行符號
    x = str.strip(x) # 4.移除左右空白
    x = x.replace('x000D','') # 5.移除'x000D'
    return x

def model_predict(nlp,df):
    table = pd.DataFrame()
    for i in tqdm(df.index):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        QA_input = {
            'question': 'What is the bank name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'predict:':predict},index=[i])
        table = table.append(row)
    return table

def get_bank(text):
    text = preprocess(str(text))
    keywords = ['TO ORDER OF','TO THEORDER OF','TO THE ORDER OF','TOTHE ORDER OF','TO THE ORDER+OF','TOORDER OF']
    for i in keywords:
        if i in text:
            idx = text.split(i)[1].find('BANK')
            result = preprocess(text.split(i)[1][:idx+len('BANK')])
            if ('BANK' in result) and (len(result) > 4): # 包含BANK且字串超過四個字
                return result
            
            elif str.strip(result) == 'BANK': # 只有BANK當作沒找到
                return 'not find'
            
            else:
                return 'not find'
        else:
            return 'not find'

def get_acc(df,t=0.75):
    correct = []
    correct_label = []
    for i in df.index:
        jac = get_jaccard_sim(df.loc[i,'Y_label'],df.loc[i,'predict'])
        if jac >= t:
            correct.append('yes')
        else:
            correct.append('no')
    result = pd.Series(correct)
    return result.value_counts()['yes']/len(result)

def get_jac(df):
    all_jacs = []
    for i in df.index:
        all_jacs.append(get_jaccard_sim(str(df.loc[i,'Y_label']),str(df.loc[i,'predict'])))
    return np.sum(all_jacs)/len(all_jacs)

Global seed set to 42


# LOAD DATA

In [9]:
df = pd.read_csv('../data/preprocess_for_SQUAD_銀行.csv',index_col=0)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)
val_df

(2993, 4)
(749, 4)


Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
19801,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,WOORI BANK,111,121
3132,1 SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS 2 ...,VIETINBANK,163,173
1247,1DRAFT FOR 100 PCT OF INVOICE VALUE2COMPLETE S...,INDUSIND BANK LIMITED,2111,2132
5193,SIGNED COMMERCIAL INVOICE IN 3 FOLDFULL SET OF...,KOOKMIN BANK,108,120
5923,1 SIGNED COMMERCIAL INVOICE IN TRIPLICATE2 SIG...,CATHAY UNITED BANK,2443,2461
...,...,...,...,...
19522,1 SEALED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK OF CHINA LTD,679,696
19114,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL LETTER OF CREDI...,864,948
2087,SIGNED COMMERCIAL INVOICE IN 1 COPY INDICATING...,CTBC BANK CO LTD,2718,2734
4167,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALSPACKIN...,TAIPEI FUBON COMMERCIAL BANK,435,463


# Load Model

In [10]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load('../models/Product_Data_SQuAD_model_銀行.pt'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
gc.collect()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

317

# 先規則

In [11]:
result = pd.DataFrame()
result['string_X_train'] = val_df['string_X_train']
result['Y_label'] = val_df['Y_label']
result['predict'] = val_df['string_X_train'].apply(get_bank)
result.loc[result['predict'] != 'not find','class'] = 'rule'
result

Unnamed: 0,string_X_train,Y_label,predict,class
19801,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,WOORI BANK,not find,
3132,1 SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS 2 ...,VIETINBANK,VIETINBANK,rule
1247,1DRAFT FOR 100 PCT OF INVOICE VALUE2COMPLETE S...,INDUSIND BANK LIMITED,INDUSIND BANK,rule
5193,SIGNED COMMERCIAL INVOICE IN 3 FOLDFULL SET OF...,KOOKMIN BANK,not find,
5923,1 SIGNED COMMERCIAL INVOICE IN TRIPLICATE2 SIG...,CATHAY UNITED BANK,not find,
...,...,...,...,...
19522,1 SEALED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK OF CHINA LTD,not find,
19114,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL LETTER OF CREDI...,not find,
2087,SIGNED COMMERCIAL INVOICE IN 1 COPY INDICATING...,CTBC BANK CO LTD,not find,
4167,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALSPACKIN...,TAIPEI FUBON COMMERCIAL BANK,not find,


# 後接bert

In [12]:
not_find_df = result.loc[result['predict']=='not find',:]
display(not_find_df)

Unnamed: 0,string_X_train,Y_label,predict,class
19801,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,WOORI BANK,not find,
5193,SIGNED COMMERCIAL INVOICE IN 3 FOLDFULL SET OF...,KOOKMIN BANK,not find,
5923,1 SIGNED COMMERCIAL INVOICE IN TRIPLICATE2 SIG...,CATHAY UNITED BANK,not find,
5503,SIGNED COMMERCIAL INVOICE IN 2 FOLD PACKING LI...,KOOKMIN BANK,not find,
315,1 MULTIMODAL BILLS OF LADING FULL SET REQUIRED...,CITIBANK NA,not find,
...,...,...,...,...
19522,1 SEALED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK OF CHINA LTD,not find,
19114,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL LETTER OF CREDI...,not find,
2087,SIGNED COMMERCIAL INVOICE IN 1 COPY INDICATING...,CTBC BANK CO LTD,not find,
4167,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALSPACKIN...,TAIPEI FUBON COMMERCIAL BANK,not find,


In [13]:
bert_predict = model_predict(nlp,not_find_df)
bert_predict

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=664.0), HTML(value='')))




Unnamed: 0,predict:
19801,WOORI BANK
5193,KOOKMIN BANKMARKED
5923,CATHAY UNITED BANK
5503,KOOKMIN BANKMAPOYEOK
315,OFCITIBANK NA
...,...
19522,BANK OF CHINA LTD
19114,INDUSTRIAL BANK OF KOREA SEOUL
2087,CTBC BANK CO LTD
4167,HOURSWETAIPEI FUBON COMMERCIAL BANK


In [14]:
result.loc[bert_predict.index] = bert_predict.values
result.loc[bert_predict.index,'class'] = 'bert'
result

Unnamed: 0,string_X_train,Y_label,predict,class
19801,WOORI BANK,WOORI BANK,WOORI BANK,bert
3132,1 SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS 2 ...,VIETINBANK,VIETINBANK,rule
1247,1DRAFT FOR 100 PCT OF INVOICE VALUE2COMPLETE S...,INDUSIND BANK LIMITED,INDUSIND BANK,rule
5193,KOOKMIN BANKMARKED,KOOKMIN BANKMARKED,KOOKMIN BANKMARKED,bert
5923,CATHAY UNITED BANK,CATHAY UNITED BANK,CATHAY UNITED BANK,bert
...,...,...,...,...
19522,BANK OF CHINA LTD,BANK OF CHINA LTD,BANK OF CHINA LTD,bert
19114,INDUSTRIAL BANK OF KOREA SEOUL,INDUSTRIAL BANK OF KOREA SEOUL,INDUSTRIAL BANK OF KOREA SEOUL,bert
2087,CTBC BANK CO LTD,CTBC BANK CO LTD,CTBC BANK CO LTD,bert
4167,HOURSWETAIPEI FUBON COMMERCIAL BANK,HOURSWETAIPEI FUBON COMMERCIAL BANK,HOURSWETAIPEI FUBON COMMERCIAL BANK,bert


In [15]:
result[result['Y_label']==result['predict']].sample(10).append(result[result['Y_label']!=result['predict']].sample(10))

Unnamed: 0,string_X_train,Y_label,predict,class
20011,AGRICULTURAL BANK OF CHINA,AGRICULTURAL BANK OF CHINA,AGRICULTURAL BANK OF CHINA,bert
1999,BANK OF AMERICA NATIONALASSOCIATION,BANK OF AMERICA NATIONALASSOCIATION,BANK OF AMERICA NATIONALASSOCIATION,bert
2196,KOREAIBK FINANCE,KOREAIBK FINANCE,KOREAIBK FINANCE,bert
15840,FIRST COMMERCIAL BANKAND,FIRST COMMERCIAL BANKAND,FIRST COMMERCIAL BANKAND,bert
7086,CITIBANK N A,CITIBANK N A,CITIBANK N A,bert
3979,KOOKMIN BANKMAPOYEOK,KOOKMIN BANKMAPOYEOK,KOOKMIN BANKMAPOYEOK,bert
4259,KEB HANA BANKMARKED,KEB HANA BANKMARKED,KEB HANA BANKMARKED,bert
4925,BANK SINOPAC2DISCREPANCY,BANK SINOPAC2DISCREPANCY,BANK SINOPAC2DISCREPANCY,bert
78,VERBUNDVOLKSBANK OWL EGINTERNATIONALES,VERBUNDVOLKSBANK OWL EGINTERNATIONALES,VERBUNDVOLKSBANK OWL EGINTERNATIONALES,bert
1989,BANK OF AMERICA NATIONALASSOCIATION,BANK OF AMERICA NATIONALASSOCIATION,BANK OF AMERICA NATIONALASSOCIATION,bert


In [18]:
get_acc(result,1),get_acc(result,0.75),get_jac(result)

(0.910547396528705, 0.9132176234979973, 0.940575828670158)

In [19]:
result.to_csv('../submit/submit_銀行.csv')