In [44]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
import numpy as np
import re
gc.collect()

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)

def preprocess(x):
    x = str(x) # 0.轉字串
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
    x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
    x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 3.去除換行符號
    x = str.strip(x) # 4.移除左右空白
    x = x.replace('x000D','') # 5.移除'x000D'
    return x

def model_predict(nlp,df):
    table = pd.DataFrame()
    for i in tqdm(df.index):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        QA_input = {
            'question': 'What is the bank name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'predict:':predict},index=[i])
        table = table.append(row)
    return table

def get_bank(text):
    text = preprocess(str(text))
    split_keywords = ['TO ORDER OF','TO THEORDER OF','TO THE ORDER OF','TOTHE ORDER OF','TO THE ORDER+OF','TOORDER OF']
    for i in split_keywords:
        if i in text: # 若找到關鍵分割字串
            text = text.split(i)[1] # 取後半段文章
            
            # ==========各種case==================
            # case 1 LIMITED
            if 'LIMITED' in text:
                idx = text.find('LIMITED')
                result = preprocess(text[:idx+len('LIMITED')])
                if ('LIMITED' in result) and (len(result) > len('LIMITED')):
                    return result
            
            # case 2 LTD
            elif 'LTD' in text:
                idx = text.find('LTD')
                result = preprocess(text[:idx+len('LTD')])
                if ('LTD' in result) and (len(result) > len('LTD')):
                    return result
            
            # case 3 BANK
            elif 'BANK' in text:
                idx = text.find('BANK')
                result = preprocess(text[:idx+len('BANK')])
                if ('BANK' in result) and (len(result) > len('BANK')):
                    return result 
            
            # case 4 BANKMARKED
            elif 'BANKMARKED' in text:
                idx = text.find('BANKMARKED')
                result = preprocess(text[:idx+len('BANKMARKED')])
                if ('BANKMARKED' in result) and (len(result) > len('BANKMARKED')):
                    return result 
            
            # case 5 NA
            elif 'NA' in text:
                idx = text.find('NA')
                result = preprocess(text[:idx+len('NA')])
                if ('NA' in result) and (len(result) > len('NA')):
                    return result 

            # case 6 CHINA
            elif 'CHINA' in text:
                idx = text.find('CHINA')
                result = preprocess(text[:idx+len('CHINA')])
                if ('CHINA' in result) and (len(result) > len('CHINA')):
                    return result 
            
            # case 7 BRANCH
            elif 'BRANCH' in text:
                idx = text.find('BRANCH')
                result = preprocess(text[:idx+len('BRANCH')])
                if ('BRANCH' in result) and (len(result) > len('BRANCH')):
                    return result
            
            # case 8 找到關鍵分隔字 但是找不到上面的模式
            else:
                return 'not find'
        
        # case 9 找不到關鍵分隔字
        else:
            return 'not find'

def get_acc(df,t=0.75):
    correct = []
    correct_label = []
    for i in df.index:
        jac = get_jaccard_sim(df.loc[i,'Y_label'],df.loc[i,'predict'])
        if jac >= t:
            correct.append('yes')
        else:
            correct.append('no')
    result = pd.Series(correct)
    return result.value_counts()['yes']/len(result)

def get_jac(df):
    all_jacs = []
    for i in df.index:
        all_jacs.append(get_jaccard_sim(str(df.loc[i,'Y_label']),str(df.loc[i,'predict'])))
    return np.sum(all_jacs)/len(all_jacs)

Global seed set to 42


# LOAD DATA

In [45]:
df = pd.read_csv('../data/preprocess_for_SQUAD_銀行.csv',index_col=0)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)
display(val_df)
display(val_df['Y_label'].apply(lambda x:x.split(' ')[-1]).value_counts().head(10))

(2993, 4)
(749, 4)


Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
19801,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,WOORI BANK,111,121
3132,1 SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS 2 ...,VIETINBANK,163,173
1247,1DRAFT FOR 100 PCT OF INVOICE VALUE2COMPLETE S...,INDUSIND BANK LIMITED,2111,2132
5193,SIGNED COMMERCIAL INVOICE IN 3 FOLDFULL SET OF...,KOOKMIN BANK,108,120
5923,1 SIGNED COMMERCIAL INVOICE IN TRIPLICATE2 SIG...,CATHAY UNITED BANK,2443,2461
...,...,...,...,...
19522,1 SEALED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK OF CHINA LTD,679,696
19114,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL LETTER OF CREDI...,864,948
2087,SIGNED COMMERCIAL INVOICE IN 1 COPY INDICATING...,CTBC BANK CO LTD,2718,2734
4167,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALSPACKIN...,TAIPEI FUBON COMMERCIAL BANK,435,463


BANK          269
LTD           135
LIMITED        53
NA             40
BRANCH         25
CHINA          22
SEOUL          20
BERHAD         20
BM             16
VIETINBANK     14
Name: Y_label, dtype: int64

# Load Model

In [46]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load('../models/Product_Data_SQuAD_model_銀行.pt'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
gc.collect()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

93

# 先規則

In [47]:
result = pd.DataFrame()
result['string_X_train'] = val_df['string_X_train']
result['Y_label'] = val_df['Y_label']
result['predict'] = val_df['string_X_train'].apply(get_bank)
result.loc[result['predict'] != 'not find','class'] = 'rule'
result

Unnamed: 0,string_X_train,Y_label,predict,class
19801,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,WOORI BANK,not find,
3132,1 SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS 2 ...,VIETINBANK,VIETINBANK,rule
1247,1DRAFT FOR 100 PCT OF INVOICE VALUE2COMPLETE S...,INDUSIND BANK LIMITED,INDUSIND BANK LTD QUOTING OUR LC AND DATE MARK...,rule
5193,SIGNED COMMERCIAL INVOICE IN 3 FOLDFULL SET OF...,KOOKMIN BANK,not find,
5923,1 SIGNED COMMERCIAL INVOICE IN TRIPLICATE2 SIG...,CATHAY UNITED BANK,not find,
...,...,...,...,...
19522,1 SEALED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK OF CHINA LTD,not find,
19114,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL LETTER OF CREDI...,not find,
2087,SIGNED COMMERCIAL INVOICE IN 1 COPY INDICATING...,CTBC BANK CO LTD,not find,
4167,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALSPACKIN...,TAIPEI FUBON COMMERCIAL BANK,not find,


# 後接bert

In [48]:
not_find_df = result.loc[result['predict']=='not find',:]
display(not_find_df)

Unnamed: 0,string_X_train,Y_label,predict,class
19801,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,WOORI BANK,not find,
5193,SIGNED COMMERCIAL INVOICE IN 3 FOLDFULL SET OF...,KOOKMIN BANK,not find,
5923,1 SIGNED COMMERCIAL INVOICE IN TRIPLICATE2 SIG...,CATHAY UNITED BANK,not find,
5503,SIGNED COMMERCIAL INVOICE IN 2 FOLD PACKING LI...,KOOKMIN BANK,not find,
315,1 MULTIMODAL BILLS OF LADING FULL SET REQUIRED...,CITIBANK NA,not find,
...,...,...,...,...
19522,1 SEALED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK OF CHINA LTD,not find,
19114,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL LETTER OF CREDI...,not find,
2087,SIGNED COMMERCIAL INVOICE IN 1 COPY INDICATING...,CTBC BANK CO LTD,not find,
4167,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALSPACKIN...,TAIPEI FUBON COMMERCIAL BANK,not find,


In [49]:
bert_predict = model_predict(nlp,not_find_df)
bert_predict

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=660.0), HTML(value='')))




Unnamed: 0,predict:
19801,WOORI BANK
5193,KOOKMIN BANKMARKED
5923,CATHAY UNITED BANK
5503,KOOKMIN BANKMAPOYEOK
315,OFCITIBANK NA
...,...
19522,BANK OF CHINA LTD
19114,INDUSTRIAL BANK OF KOREA SEOUL
2087,CTBC BANK CO LTD
4167,HOURSWETAIPEI FUBON COMMERCIAL BANK


In [50]:
result.loc[bert_predict.index] = bert_predict.values
result.loc[bert_predict.index,'class'] = 'bert'
result

Unnamed: 0,string_X_train,Y_label,predict,class
19801,WOORI BANK,WOORI BANK,WOORI BANK,bert
3132,1 SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS 2 ...,VIETINBANK,VIETINBANK,rule
1247,1DRAFT FOR 100 PCT OF INVOICE VALUE2COMPLETE S...,INDUSIND BANK LIMITED,INDUSIND BANK LTD QUOTING OUR LC AND DATE MARK...,rule
5193,KOOKMIN BANKMARKED,KOOKMIN BANKMARKED,KOOKMIN BANKMARKED,bert
5923,CATHAY UNITED BANK,CATHAY UNITED BANK,CATHAY UNITED BANK,bert
...,...,...,...,...
19522,BANK OF CHINA LTD,BANK OF CHINA LTD,BANK OF CHINA LTD,bert
19114,INDUSTRIAL BANK OF KOREA SEOUL,INDUSTRIAL BANK OF KOREA SEOUL,INDUSTRIAL BANK OF KOREA SEOUL,bert
2087,CTBC BANK CO LTD,CTBC BANK CO LTD,CTBC BANK CO LTD,bert
4167,HOURSWETAIPEI FUBON COMMERCIAL BANK,HOURSWETAIPEI FUBON COMMERCIAL BANK,HOURSWETAIPEI FUBON COMMERCIAL BANK,bert


In [51]:
result[result['Y_label']==result['predict']].sample(10).append(result[result['Y_label']!=result['predict']].sample(10))

Unnamed: 0,string_X_train,Y_label,predict,class
5040,HDFC BANK LTD,HDFC BANK LTD,HDFC BANK LTD,bert
5795,KOREA DEVELOPMENT BANK,KOREA DEVELOPMENT BANK,KOREA DEVELOPMENT BANK,bert
4305,1 SIGNED COMMERCIAL INVOICE ISSUED BY BENEFICI...,VIETINBANK,VIETINBANK,rule
695,ATCITIBANK NA,ATCITIBANK NA,ATCITIBANK NA,bert
431,CATHAY UNITEDBANK,CATHAY UNITEDBANK,CATHAY UNITEDBANK,bert
2196,KOREAIBK FINANCE,KOREAIBK FINANCE,KOREAIBK FINANCE,bert
5217,TOBANK HAPOALIM BM,TOBANK HAPOALIM BM,TOBANK HAPOALIM BM,bert
7647,HDFC BANK LTDE,HDFC BANK LTDE,HDFC BANK LTDE,bert
3905,BANK CO LTD,BANK CO LTD,BANK CO LTD,bert
6957,ADDRESSBANK OF CHINA LTDXIAMEN,ADDRESSBANK OF CHINA LTDXIAMEN,ADDRESSBANK OF CHINA LTDXIAMEN,bert


In [52]:
get_acc(result,1),get_acc(result,0.75),get_jac(result)

(0.9172229639519359, 0.9185580774365821, 0.928675990852709)

In [53]:
result.to_csv('../submit/submit_銀行.csv')

In [54]:
result['Y_label'].values.tolist()

['WOORI BANK',
 'VIETINBANK',
 'INDUSIND BANK LIMITED',
 'KOOKMIN BANKMARKED',
 'CATHAY UNITED   BANK',
 'KOOKMIN BANKMAPOYEOK',
 'OFCITIBANK NA',
 'SERVICETOCHINA EVERBRIGHT BANK',
 'WOORI BANK',
 'SHANGHAIBANKING CORPORATION LTD',
 'DRAWN UNDER THEMIZUHO BANKLTDHEAD OFFICECREDIT NOLC053300001724DATED JUNE 22021',
 'BANK OF AMERICA NATIONALASSOCIATION',
 'TMB BANK PUBLIC COMPANY LIMITED',
 'FIRST COMMERCIAL BANK',
 'VIETINBANK',
 'MIZUHO BANK LTD',
 'VERBUNDVOLKSBANK OWL EGINTERNATIONALES',
 'SHINHAN BANK',
 'FIRST COMMERCIAL BANK HEAD OFFICE',
 'BANK OFAYUDHYA PUBLIC COMPANY LIMITED',
 'MIZUHO BANK LTD',
 'KOREA DEVELOPMENT BANK',
 'HABIB BANK LIMITED',
 'SHINHAN BANK',
 'SHINHAN BANK',
 'BANK OF INDIA',
 'SERVICETOCHINA EVERBRIGHT BANK',
 'PT BANK OCBC',
 'BANK OF NOVA SCOTIA TORONTO ONTARIO M4T 1Z3 CANADAPLEASE',
 'SHINHAN BANK',
 'BANK OFAYUDHYA PUBLIC COMPANY LIMITED',
 'BDO UNIBANK INC',
 'SHINHAN BANK',
 'BNP PARIBAS',
 'HDFC BANK LIMITEDTRADE',
 'THEMIZUHO BANKLTDHEAD OFFICECR

In [55]:
result.loc[result['class']=='rule','predict']

3132                                            VIETINBANK
1247     INDUSIND BANK LTD QUOTING OUR LC AND DATE MARK...
20560                                           WOORI BANK
7453     SHIPPER AND ENDORSED IN BLANKMARKED FREIGHT PR...
1725                                            VIETINBANK
                               ...                        
3238                                  ASIA COMMERCIAL BANK
1670     ICICI BANK LTD ICICICENTRE 163 HTPAREKH MARG B...
5726                   BANGKOK BANK PUBLIC COMPANY LIMITED
3960     SHIPPER ANDENDORSED TO CITIBANK NA DUBAI EVIDE...
8276     ISSUING BANKMARKED FRIGHT PREPAID NOTIFING APP...
Name: predict, Length: 89, dtype: object

In [56]:
result['Y_label'].apply(lambda x:x.split(' ')[-1]).value_counts().head(10)

BANK                   205
LTD                    131
LIMITED                 47
BANKMARKED              32
NA                      29
CHINA                   18
BRANCH                  16
BERHAD13491             14
VIETINBANK              14
NATIONALASSOCIATION     13
Name: Y_label, dtype: int64