In [5]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm

def preprocess(x):
    x = str(x)
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
    x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
    x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 3.去除換行符號
    x = str.strip(x) # 4.移除左右空白
    return x

def get_bank(text):
    text = preprocess(text)
    keywords = ['TO ORDER OF','TO THEORDER OF','TO THE ORDER OF','TOTHE ORDER OF','TO THE ORDER+OF','TOORDER OF']
    for i in keywords:
        if i in text:
            idx = text.split(i)[1].find('BANK')
            result = preprocess(text.split(i)[1][:idx+len('BANK')])
            if 'BANK' in result:
                return result
            else:
                return ''
        else:
            return ''

def str2index(context,string):
    ys = context.find(string)
    ye = ys + len(string)
    return ys,ye

df = pd.read_excel('../data/combined_excel.xlsx',index_col=0)[['46A','47A','78','LCBK']]
df.head()

Unnamed: 0,46A,47A,78,LCBK
0,1. FULL SET OF(3 NEGOTIABLE COPIES PLUS 3 NON-...,1.ORIGINAL DOCUMENTS TO BE SENT IN ONE LOT BY ...,WE SHALL REMIT THE PROCEEDS TO YOU UPON RECEIP...,
1,1.DRAFTS FOR 100PCT OF INVOICE VALUE..2.COMPLE...,1.ALL DOCUMENT MUST MENTION OUR LC NUMBER AND ...,REFER FIELD 47A.,
2,+1. SHIPPED ON BOARD OCEAN BILLS OF LADING (FU...,+1. DOCUMENTS TO BE SENT DIRECTLY TO US IN ONE...,WE HEREBY UNDERTAKE WITH DRAWERS AND/OR BONAFI...,
3,1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,"ICICI BANK LTD.,\nSHALIMAR TOWER, 31/54 M.G.MA..."
4,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,"NATIONAL COMMERCIAL BANK, THE\n(HEAD OFFICE)"


In [6]:
# 預處理
df['46A'] = df['46A'].apply(preprocess)
df['47A'] = df['47A'].apply(preprocess)
df['78'] = df['78'].apply(preprocess)
df['LCBK'] = df['LCBK'].apply(preprocess)

# X,Y製作
table = pd.DataFrame()
table['string_X_train'] = df['46A'] +df['47A'] +df['78']
table['Y_label'] = df['LCBK']
df = table

# Y1,Y2製作
ys_lst = []
ye_lst = []
for i in range(len(df)):
    context= df['string_X_train'].values[i]
    string = df['Y_label'].values[i]
    ys,ye = str2index(context,string)
    ys_lst.append(ys)
    ye_lst.append(ye)
df['string_Y_1'] = ys_lst
df['string_Y_2'] = ye_lst

# 清洗
df = df[df['string_Y_1']!=-1]
df = df[df['Y_label']!='']
df = df[df['Y_label']!=str('nan')]
print(df.shape)
df.head(20)

(3742, 4)


Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
16,SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDIC...,CTBC BANK CO LTD,1859,1875
18,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL,873,903
19,SIGNED COMMERCIAL INVOICE IN TRIPLICATE FULL S...,INDUSTRIAL BANK OF KOREA SEOUL,873,903
22,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALINDICAT...,CTBC BANK CO LTD,4923,4939
23,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALINDICAT...,CTBC BANK CO LTD,4923,4939
24,SIGNED COMMERCIAL INVOICE IN 2 ORIGINALINDICAT...,CTBC BANK CO LTD,4923,4939
30,1SIGNED COMMERCIAL INVOICE IN DUPLICATE SHOWIN...,CHANG HWA COMMERCIAL BANK LTD,5344,5373
33,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,KEB HANA BANK,111,124
34,SIGNED COMMERCIAL INVOICE IN 03 COPIESFULL SET...,KEB HANA BANK,111,124
41,1FULL SET OF ORIGINALS AND 2 NON-NEGOTIABLE CO...,STANDARD CHARTERED BANK,1595,1618


In [7]:
# 驗證
for i in tqdm(range(len(df))):
    assert df['string_X_train'].values[i][df['string_Y_1'].values[i]:df['string_Y_2'].values[i]] == df['Y_label'].values[i]
find = df['string_X_train'].apply(get_bank)
len(find)/len(df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(df))):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3742.0), HTML(value='')))




1.0

In [8]:
# 保存
df.to_csv('../data/preprocess_for_SQUAD_銀行.csv')