In [99]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
gc.collect()

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# set_seed(42)

In [100]:
def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)

Global seed set to 42


# LOAD DATA

In [101]:
df = pd.read_csv('46A47A78LCBK.csv',index_col=0)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)
val_df

(1160, 4)
(291, 4)


Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
538,1THE AMOUNT OF EACH PRESENTATION UNDER THIS CR...,CHINA CITIC BANK,279,295
754,SIGNED COMMERCIAL INVOICE IN 3 COPIESx000DFULL...,KEB HANA BANK,120,133
49,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,133,146
1447,APPLICANTS ADDRESSx000D NO33 KEFENG ROAD SCIEN...,CTBC BANK CO LTD,1048,1064
141,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,133,146
...,...,...,...,...
1299,IMMEDIATELY AFTER NEGOTIATION DOCUMENTS MUST B...,SAUDI BRITISH BANK,106,124
1293,SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL INDICA...,TAISHIN INTERNATIONAL BANK,185,211
714,1 ALL DOCUMENTS TO BE FORWARDED IN ONE COVERx0...,BANK OF CHINA LTD,62,79
254,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA,214,231


# Load Model

In [102]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load('Product_Data_SQuAD_model_bank.pt'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
gc.collect()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

244

# rule base

In [103]:
import numpy as np

import re

def preprocess(x):
    x = str(x)
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
    x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
    x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 3.去除換行符號
    x = str.strip(x) # 4.移除左右空白
    if 'x000D' in x:
        x = x.replace('x000D','')
    return x

def get_bank(text):
    text = str(text)
    text = preprocess(text)
    keywords = ['TO ORDER OF','TO THEORDER OF','TO THE ORDER OF','TOTHE ORDER OF','TO THE ORDER+OF','TOORDER OF']
    for i in keywords:
        if i in text:
            idx = text.split(i)[1].find('BANK')
            result = preprocess(text.split(i)[1][:idx+len('BANK')])
            if 'BANK' in result:
                return result
            else:
                return None
        else:
            return None

In [104]:
result = pd.DataFrame()
result['string_X_train'] = val_df['string_X_train']
result['Y_label'] = val_df['Y_label']
result

Unnamed: 0,string_X_train,Y_label
538,1THE AMOUNT OF EACH PRESENTATION UNDER THIS CR...,CHINA CITIC BANK
754,SIGNED COMMERCIAL INVOICE IN 3 COPIESx000DFULL...,KEB HANA BANK
49,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
1447,APPLICANTS ADDRESSx000D NO33 KEFENG ROAD SCIEN...,CTBC BANK CO LTD
141,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
...,...,...
1299,IMMEDIATELY AFTER NEGOTIATION DOCUMENTS MUST B...,SAUDI BRITISH BANK
1293,SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL INDICA...,TAISHIN INTERNATIONAL BANK
714,1 ALL DOCUMENTS TO BE FORWARDED IN ONE COVERx0...,BANK OF CHINA LTD
254,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA


In [105]:
a = val_df['string_X_train'].apply(get_bank)
a = a.dropna(axis=0)
a

1006                            VIETINBANK
1324                  SUMITOMO MITSUI BANK
602                               YES BANK
168                        UNITEDARAB BANK
643                             STATE BANK
809                                   BANK
239                             ICICI BANK
1088                  ASIA COMMERCIAL BANK
240                             ICICI BANK
528                             VIETINBANK
677                                   BANK
599                   SUMITOMO MITSUI BANK
630                                   BANK
796                             VIETINBANK
576                                   BANK
1208                          BANGKOK BANK
619                                   BANK
261                             VIETINBANK
351     MEGA INTERNATIONAL COMMERCIAL BANK
184                        UNITEDARAB BANK
1030                          BANGKOK BANK
1187                                  BANK
1029                          BANGKOK BANK
371        

In [106]:
result['predict'] = val_df['string_X_train'].apply(get_bank)
result

Unnamed: 0,string_X_train,Y_label,predict
538,1THE AMOUNT OF EACH PRESENTATION UNDER THIS CR...,CHINA CITIC BANK,
754,SIGNED COMMERCIAL INVOICE IN 3 COPIESx000DFULL...,KEB HANA BANK,
49,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,
1447,APPLICANTS ADDRESSx000D NO33 KEFENG ROAD SCIEN...,CTBC BANK CO LTD,
141,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,
...,...,...,...
1299,IMMEDIATELY AFTER NEGOTIATION DOCUMENTS MUST B...,SAUDI BRITISH BANK,
1293,SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL INDICA...,TAISHIN INTERNATIONAL BANK,
714,1 ALL DOCUMENTS TO BE FORWARDED IN ONE COVERx0...,BANK OF CHINA LTD,
254,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA,BANK


In [107]:
result

Unnamed: 0,string_X_train,Y_label,predict
538,1THE AMOUNT OF EACH PRESENTATION UNDER THIS CR...,CHINA CITIC BANK,
754,SIGNED COMMERCIAL INVOICE IN 3 COPIESx000DFULL...,KEB HANA BANK,
49,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,
1447,APPLICANTS ADDRESSx000D NO33 KEFENG ROAD SCIEN...,CTBC BANK CO LTD,
141,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,
...,...,...,...
1299,IMMEDIATELY AFTER NEGOTIATION DOCUMENTS MUST B...,SAUDI BRITISH BANK,
1293,SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL INDICA...,TAISHIN INTERNATIONAL BANK,
714,1 ALL DOCUMENTS TO BE FORWARDED IN ONE COVERx0...,BANK OF CHINA LTD,
254,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA,BANK


# 接bert

In [108]:
not_find = []
for j,i in enumerate(result.iloc[:,2].values):
    if i == None:
        not_find.append(j)
len(not_find)

249

In [109]:
not_find_df = result.iloc[not_find]
not_find_df

Unnamed: 0,string_X_train,Y_label,predict
538,1THE AMOUNT OF EACH PRESENTATION UNDER THIS CR...,CHINA CITIC BANK,
754,SIGNED COMMERCIAL INVOICE IN 3 COPIESx000DFULL...,KEB HANA BANK,
49,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,
1447,APPLICANTS ADDRESSx000D NO33 KEFENG ROAD SCIEN...,CTBC BANK CO LTD,
141,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,
...,...,...,...
573,1 ALL DOCUMENTS TO BE FORWARDED IN ONE COVERx0...,BANK OF CHINA,
1299,IMMEDIATELY AFTER NEGOTIATION DOCUMENTS MUST B...,SAUDI BRITISH BANK,
1293,SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL INDICA...,TAISHIN INTERNATIONAL BANK,
714,1 ALL DOCUMENTS TO BE FORWARDED IN ONE COVERx0...,BANK OF CHINA LTD,


In [112]:
def model_predict(nlp,df):
    table = pd.DataFrame()
    for i in tqdm(df.index):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        QA_input = {
            'question': 'What is the bank name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'predict:':predict},index=[i])
        table = table.append(row)
    return table

In [113]:
bert_predict = model_predict(nlp,not_find_df)
bert_predict

  0%|          | 0/249 [00:00<?, ?it/s]

Unnamed: 0,predict:
538,SERVICECHINA CITIC BANKGUANGZHOU
754,KEB HANA BANKx000DMARKED
49,MUFG BANK LTD
1447,CTBC BANK CO LTD
141,MUFG BANK LTD
...,...
573,BANK OF CHINA
1299,SAUDI BRITISH BANK
1293,TAISHIN INTERNATIONAL BANK
714,BANK OF CHINA


In [116]:
result.loc[bert_predict.index] = bert_predict.values
result

Unnamed: 0,string_X_train,Y_label,predict
538,SERVICECHINA CITIC BANKGUANGZHOU,SERVICECHINA CITIC BANKGUANGZHOU,SERVICECHINA CITIC BANKGUANGZHOU
754,KEB HANA BANKx000DMARKED,KEB HANA BANKx000DMARKED,KEB HANA BANKx000DMARKED
49,MUFG BANK LTD,MUFG BANK LTD,MUFG BANK LTD
1447,CTBC BANK CO LTD,CTBC BANK CO LTD,CTBC BANK CO LTD
141,MUFG BANK LTD,MUFG BANK LTD,MUFG BANK LTD
...,...,...,...
1299,SAUDI BRITISH BANK,SAUDI BRITISH BANK,SAUDI BRITISH BANK
1293,TAISHIN INTERNATIONAL BANK,TAISHIN INTERNATIONAL BANK,TAISHIN INTERNATIONAL BANK
714,BANK OF CHINA,BANK OF CHINA,BANK OF CHINA
254,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA,BANK


# acc,放水acc

In [121]:
def get_acc(df,t=0.75):
    correct = []
    correct_label = []
    for i in df.index:
        jac = get_jaccard_sim(df.loc[i,'Y_label'],df.loc[i,'predict'])
        if jac >= t:
            correct.append('yes')
        else:
            correct.append('no')
    result = pd.Series(correct)
    return result.value_counts()['yes']/len(result)

In [122]:
get_acc(result,1),get_acc(result,0.75)

(0.8865979381443299, 0.8900343642611683)