In [1]:
import pandas as pd
import numpy as np
import random
import torch
from pytorch_lightning import seed_everything
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
from transformers import pipeline
from tqdm import tqdm_notebook as tqdm
import re

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)
df = pd.read_excel('..\data\combined_excel.xlsx',index_col=0)
print(df.columns)
df['產品名'] = np.nan#df['SPEC']
df['開狀銀行'] = df['LTADDRESS.1'].apply(lambda x:str(x)[:-3])#LTADDRESS的末三碼可以剃除
df['受益人'] = np.nan
df['開狀人'] = np.nan
df = df[['45A', #產品名輸入
'50', # 開狀人輸入
'59', # 公司名(受益人)輸入
'46A','47A','78', #開狀銀行輸入
'產品名',
'開狀人',
'受益人',
'開狀銀行',
'EXPNO']]
df.head()

Global seed set to 42


Index(['59', '45A', '46A', '47A', '31D', '44C', '48', '71D', '78', '50', '51A',
       '51D', 'LCNO', 'LTADDRESS.1', 'from', 'EXPNO', 'LCBK', 'CU1', 'SPEC'],
      dtype='object')


Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
0,SHIPMENT OFPVC SUSPENSION RESIN S-65D QTY 18 M...,"RYTOIL PETROCHEMICALS LLPBASEMENT 2095, I BLOC...","FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1. FULL SET OF(3 NEGOTIABLE COPIES PLUS 3 NON-...,1.ORIGINAL DOCUMENTS TO BE SENT IN ONE LOT BY ...,WE SHALL REMIT THE PROCEEDS TO YOU UPON RECEIP...,,,,PUNBINBBA,
1,"17 MT PVC COPOLYMER RESIN, C-15 AT USD 1470 ...","S R POLYVINYL LTD4261/3, JAI MATA MARKET TRI N...","FORMOSA PLASTICS CORPORATION201, TUNG HWA NORT...",1.DRAFTS FOR 100PCT OF INVOICE VALUE..2.COMPLE...,1.ALL DOCUMENT MUST MENTION OUR LC NUMBER AND ...,REFER FIELD 47A.,,,,INDBINBBA,
2,"+TERMS OF SALE: CIF NHAVA SHEVA PORT, INDIA++7...",OSWAL CABLE PRODUCTS PVT LTDA 93/1 WAZIRPUR GR...,FORMOSA PLASTICS CORPORATION201. TUNG HWA N. R...,+1. SHIPPED ON BOARD OCEAN BILLS OF LADING (FU...,+1. DOCUMENTS TO BE SENT DIRECTLY TO US IN ONE...,WE HEREBY UNDERTAKE WITH DRAWERS AND/OR BONAFI...,,,,CITIINBXA,
3,"MASS PVC RESIN, B-57QUANTITY 175 MT AT 1300 US...",OJUS PETROCHEMICALS LLPC 289 NIRALA NAGAR LUCK...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA N ROA...",1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,,,,ICICINBBA,11G11F59
4,PHTHALIC ANHYDRIDE (PA)QUANTITY = 306 MT UNIT ...,AHMED SAEED AFIFI FACTORY CO. FORRESINS LTD. P...,NAN YA PLASTICS CORPORATION201 TUN HWA NORTH R...,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,,,,NCBKSAJEA,27P11346


In [2]:

class 產品名預測器(object):
    def __init__(self,data_path,model_path):
        
        # 讀取訓練資料(SPEC)
        train_df = pd.read_csv(data_path)[['string_X_train','Y_label','EXPNO']]
        train_df['Y_label'] = train_df['Y_label'].apply(lambda x:self.product_name_postprocess(x))

        # 讀取台塑網提供之(寶典人工手動修正過刪除線問題)
        root = '../data/寶典/寶典人工處理後/'
        df5 = pd.read_excel(root+'寶典.v6.20211020.xlsx',engine='openpyxl')[['CODIV','DIVNM','ITEMNM']]
        df5 = df5.rename(columns={'ITEMNM':'品名','DIVNM':'公司事業部門','CODIV':'公司代號'})
        df_by_ricky = pd.read_excel(root+'寶典_by_ricky.xlsx',engine='openpyxl')[['CODIV','DIVNM','ITEMNM']]
        df_by_ricky = df_by_ricky.rename(columns={'ITEMNM':'品名','DIVNM':'公司事業部門','CODIV':'公司代號'})
        df = df5.append(df_by_ricky) # 合併官方寶典和我做的寶典
        df['品名'] = df['品名'].apply(lambda x:self.product_name_postprocess(x)) #品名後處理
        
        # 寶典跟bert
        self.產品集合 = set(df['品名'].values.tolist() + train_df['Y_label'].values.tolist())
        self.nlp = self.load_nlp(model_path)
    
    def product_name_postprocess(self,x):
        x = str(x)
        x = x.replace('-',' ')
        x = x.strip()
        x = self.add_space(x)
        x = str(x).upper() # 轉大寫字串
        x = re.sub('[\u4e00-\u9fa5]', '', x) # 去除中文
        x = re.sub(r'[^\w\s]','',x) # 去除標點符號
        x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 換行符號去除
        return x
    
    def add_space(self,x):
        if (' ' not in x)&(len(x)<=5):
            return ' ' + x + ' '
        else:
            return x
    
    def load_nlp(self,path):
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))
        model.eval()
        nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
        return nlp
    
    def preprocess_45(self,x):
        x = str(x).upper() # 轉大寫字串
        x = re.sub('[\u4e00-\u9fa5]', '', x) # 去除中文
        x = re.sub(r'[^\w\s]','',x) # 去除標點符號
        x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 換行符號去除
        str.strip(x) # 移除左右空白
        # 去除多重空白
        x = x.replace('   ', ' ')
        x = x.replace('  ', ' ')
        # 出現在頭的 就不可能對到前後加空格的 這種情形要想想怎麼對照(加上左右空白)
        x = ' ' + x + ' '
        return x

    def bert_postprocess(self,x):
        x = x.replace('QUANTITY','')
        if 'PACKING' in x: #像這個 有辦法將 packing之後的都幹掉嗎
            x = x[:x.find('PACKING')+len('PACKING')]
        return x

    # 寶典比對法
    def Collection_method(self,df,產品集合,x_col):
        labels = {}
        labels_max = {}
        for i in tqdm(df.index):
            products = []
            for p in 產品集合:
                if p in df.loc[i,x_col]:
                    products.append(p) # 加入候選清單
            if len(products) > 0: # 如果有找到產品 
                labels[i] = products # 複數個產品,之後配合公司去篩選出一個
                labels_max[i] = max(products,key=len) # 取長度最長的產品
            else:
                labels[i] = 'not find'
                labels_max[i] = 'not find'
        predict = pd.DataFrame(index=labels.keys(),columns=['預測產品'])
        predict['預測產品'] = labels.values()
        predict['預測產品(取長度最長)'] = labels_max.values()
        predict['預測產品使用方式'] = 'rule'
        return predict
    
    # bert 預測法
    def model_predict(self,nlp,df,question='What is the product name?',start_from0=False,x_col='45A',y_col='預測產品'):
        table = pd.DataFrame()
        idx_list = sorted(df.index.tolist())
        for i in tqdm(idx_list):
            sample = df.loc[[i]]
            string_X_train = sample[x_col].values[0]
            QA_input = {
                'question': question,
                'context': string_X_train
            }
            res = nlp(QA_input)
            if start_from0 == False:
                predict = QA_input['context'][res['start']:res['end']]
            else:
                predict = QA_input['context'][0:res['end']]
            row = pd.DataFrame({y_col:predict},index=[i])
            table = table.append(row)
        table[y_col] = table[y_col].apply(lambda x:[self.bert_postprocess(x)])
        return [ i[0] for i in table[y_col].values.tolist()] # list of string
    
    def predict(self,df):
        df['45A'] = df['45A'].apply(lambda x:self.preprocess_45(x)) 
        output = self.Collection_method(df,self.產品集合,'45A')
        not_find_idx = output.loc[output['預測產品'] == 'not find',:].index
        if len(not_find_idx) > 0:
            bert_predict = self.model_predict(self.nlp,df.loc[not_find_idx])
            output.loc[not_find_idx,'預測產品'] = [ [i] for i in bert_predict]
            output.loc[not_find_idx,'預測產品(取長度最長)'] = bert_predict
            output.loc[not_find_idx,'預測產品使用方式'] = 'bert'
        output.loc[not_find_idx,'預測產品(取長度最長)'] = output.loc[not_find_idx,'預測產品(取長度最長)'].apply(lambda x:self.product_name_postprocess(x))
        return output



    
product_predictor = 產品名預測器('../data/preprocess_for_SQUAD_產品.csv','../models/Product_Data_SQuAD_model_product.pt')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [3]:
error_idx = list(set(df.index) - set(df.dropna(subset=['產品名']).index))
df.loc[error_idx,'產品名'] = product_predictor.predict(df.loc[error_idx,['45A']])['預測產品(取長度最長)'].values.tolist()
df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/20737 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1653 [00:00<?, ?it/s]

  tensor = as_tensor(value)
  for span_id in range(num_spans)


Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
0,SHIPMENT OFPVC SUSPENSION RESIN S-65D QTY 18 M...,"RYTOIL PETROCHEMICALS LLPBASEMENT 2095, I BLOC...","FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1. FULL SET OF(3 NEGOTIABLE COPIES PLUS 3 NON-...,1.ORIGINAL DOCUMENTS TO BE SENT IN ONE LOT BY ...,WE SHALL REMIT THE PROCEEDS TO YOU UPON RECEIP...,PVC SUSPENSION RESIN,,,PUNBINBBA,
1,"17 MT PVC COPOLYMER RESIN, C-15 AT USD 1470 ...","S R POLYVINYL LTD4261/3, JAI MATA MARKET TRI N...","FORMOSA PLASTICS CORPORATION201, TUNG HWA NORT...",1.DRAFTS FOR 100PCT OF INVOICE VALUE..2.COMPLE...,1.ALL DOCUMENT MUST MENTION OUR LC NUMBER AND ...,REFER FIELD 47A.,PVC COPOLYMER RESIN,,,INDBINBBA,
2,"+TERMS OF SALE: CIF NHAVA SHEVA PORT, INDIA++7...",OSWAL CABLE PRODUCTS PVT LTDA 93/1 WAZIRPUR GR...,FORMOSA PLASTICS CORPORATION201. TUNG HWA N. R...,+1. SHIPPED ON BOARD OCEAN BILLS OF LADING (FU...,+1. DOCUMENTS TO BE SENT DIRECTLY TO US IN ONE...,WE HEREBY UNDERTAKE WITH DRAWERS AND/OR BONAFI...,PVC RESIN S60,,,CITIINBXA,
3,"MASS PVC RESIN, B-57QUANTITY 175 MT AT 1300 US...",OJUS PETROCHEMICALS LLPC 289 NIRALA NAGAR LUCK...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA N ROA...",1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,PVC RESIN B57,,,ICICINBBA,11G11F59
4,PHTHALIC ANHYDRIDE (PA)QUANTITY = 306 MT UNIT ...,AHMED SAEED AFIFI FACTORY CO. FORRESINS LTD. P...,NAN YA PLASTICS CORPORATION201 TUN HWA NORTH R...,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,PHTHALIC ANHYDRIDE PA,,,NCBKSAJEA,27P11346
...,...,...,...,...,...,...,...,...,...,...,...
20732,PVC SUSPENSION RESIN S65D FOR 70MT AT USD 1270...,"MOHAN TRADING CO._x000D_\n19 - DADA COLONY, IN...","FORMOSA PLASTICS CORPORATION_x000D_\n201, TUNG...",1.BENEFICIARY'S SIGNED COMMERCIAL INVOICES IN ...,1.ALL DOCUMENTS MUST BE IN ENGLISH._x000D_\n+_...,ON RECEIPT OF DOCUMENTS STRICTLY IN CONFORMITY...,PVC SUSPENSION RESIN,,,KKBKINBBA,
20733,70.00 MT (IN 4X20 FT CONTAINERS)_x000D_\nPVC R...,"YASH PACKAGING_x000D_\nPLOT NO. A2/2220, 3RD P...","FORMOSA PLASTICS CORPORATION_x000D_\n201, TUNG...",1.BENEFICIARYS MANUALLY SIGNED COMMERCIAL INVO...,1. ALL DOCUMENTS MUST BE IN ENGLISH._x000D_\n+...,ON RECEIPT OF DOCUMENTS STRICTLY IN CONFORMITY...,PVC RESIN S60,,,KKBKINBBA,11G11J49
20734,EPOXY RESIN_x000D_\nNPEF-178_x000D_\nFOB ANY P...,"ALLCHEM INTERNATIONAL PTY., LTD._x000D_\n4/F, ...",NAN YA PLASTICS CORP_x000D_\n201 TUNG HWA N. R...,+SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL AND 1...,+DOCUMENTS MUST BE PRESENTED WITHIN 10 DAYS AF...,+TO NEGOTIATING BANK ONLY: PLEASE FORWARD ALL ...,EPOXY RESIN,,,CTCBHKHHA,
20735,EPOXY RESIN_x000D_\nNPEL-128_x000D_\nCIF SAVAN...,"ALLCHEM INTERNATIONAL PTY., LTD._x000D_\n4/F, ...",NAN YA PLASTICS CORP_x000D_\n201 TUNG HWA N. R...,+SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL AND 1...,+DOCUMENTS MUST BE PRESENTED WITHIN 10 DAYS AF...,+TO NEGOTIATING BANK ONLY: PLEASE FORWARD ALL ...,EPOXY RESIN,,,CTCBHKHHA,


In [4]:

class 開狀人預測器(object):
    def __init__(self):
        self.開狀人寶典 = pd.read_csv('../data/寶典/開狀人寶典.csv')
        self.開狀人尾綴 = pd.read_csv('../data/寶典/開狀人尾綴.csv')
        self.nlp = self.load_nlp('../models/Product_Data_SQuAD_model_開狀人.pt')
        
    def load_nlp(self,path):
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))
        model.eval()
        nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
        return nlp
    
    def preprocess_50(self,x):
        x = str(x)
        x = re.sub('[\u4e00-\u9fa5]', '', x) # 去除中文
        x = re.sub(r'[^\w\s]','',x) # 去除標點符號
        x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 換行符號去除
        return str.strip(x) # 移除左右空白

    def model_predict(self,nlp,df,question,x_col,y_col,start_from0=False):
        table = pd.DataFrame()
        idx_list = sorted(df.index.tolist())
        for i in tqdm(idx_list):
            sample = df.loc[[i]]
            string_X_train = sample[x_col].values[0]
            QA_input = {
                'question': question,
                'context': string_X_train
            }
            res = nlp(QA_input)
            if start_from0 == False:
                predict = QA_input['context'][res['start']:res['end']]
            else:
                predict = QA_input['context'][0:res['end']]
            row = pd.DataFrame({y_col:predict},index=[i])
            table = table.append(row)
        return [ i for i in table[y_col].values.tolist()] # list of string

    def predict(self,df):
        df['50'] = df['50'].apply(lambda x:self.preprocess_50(x))
        df['預測開狀人'] = 'not find'
        for i in df.index:
            x = df.loc[i,'50']
            # 1寶典匹配法
            for a in self.開狀人寶典['開狀人'].values.tolist():
                if (a in x) & (df.loc[i,'預測開狀人']=='not find'):
                    df.loc[i,'預測開狀人'] = a
            # 2尾綴匹配法
            for b in self.開狀人尾綴['尾綴'].values.tolist():
                if (b in x) & (df.loc[i,'預測開狀人']=='not find'):
                    df.loc[i,'預測開狀人'] = x[:x.find(b)+len(b)]
            # 若 1,2 方法都不行則用bert
            not_find_idx = df.loc[df['預測開狀人'] == 'not find',:].index
            if len(not_find_idx) > 0:
                bert_predict = self.model_predict(
                    self.nlp,
                    df.rename(columns={'50':'string_X_train'}).loc[not_find_idx],
                    question='What is the Applicant name?',
                    start_from0=True,
                    x_col='string_X_train',
                    y_col='預測開狀人')
                df.loc[not_find_idx,'預測開狀人'] = bert_predict
        return df
開狀人_predictor = 開狀人預測器()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [5]:
error_idx = list(set(df.index) - set(df.dropna(subset=['開狀人']).index))
df.loc[error_idx,'開狀人'] = 開狀人_predictor.predict(df.loc[error_idx,['50']])['預測開狀人'].values.tolist()
df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/20736 [00:00<?, ?it/s]

Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
0,SHIPMENT OFPVC SUSPENSION RESIN S-65D QTY 18 M...,"RYTOIL PETROCHEMICALS LLPBASEMENT 2095, I BLOC...","FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1. FULL SET OF(3 NEGOTIABLE COPIES PLUS 3 NON-...,1.ORIGINAL DOCUMENTS TO BE SENT IN ONE LOT BY ...,WE SHALL REMIT THE PROCEEDS TO YOU UPON RECEIP...,PVC SUSPENSION RESIN,RYTOIL PETROCHEMICALS LLP,,PUNBINBBA,
1,"17 MT PVC COPOLYMER RESIN, C-15 AT USD 1470 ...","S R POLYVINYL LTD4261/3, JAI MATA MARKET TRI N...","FORMOSA PLASTICS CORPORATION201, TUNG HWA NORT...",1.DRAFTS FOR 100PCT OF INVOICE VALUE..2.COMPLE...,1.ALL DOCUMENT MUST MENTION OUR LC NUMBER AND ...,REFER FIELD 47A.,PVC COPOLYMER RESIN,S R POLYVINYL LTD42613,,INDBINBBA,
2,"+TERMS OF SALE: CIF NHAVA SHEVA PORT, INDIA++7...",OSWAL CABLE PRODUCTS PVT LTDA 93/1 WAZIRPUR GR...,FORMOSA PLASTICS CORPORATION201. TUNG HWA N. R...,+1. SHIPPED ON BOARD OCEAN BILLS OF LADING (FU...,+1. DOCUMENTS TO BE SENT DIRECTLY TO US IN ONE...,WE HEREBY UNDERTAKE WITH DRAWERS AND/OR BONAFI...,PVC RESIN S60,OSWAL CABLE PRODUCTS PVT LTDA,,CITIINBXA,
3,"MASS PVC RESIN, B-57QUANTITY 175 MT AT 1300 US...",OJUS PETROCHEMICALS LLPC 289 NIRALA NAGAR LUCK...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA N ROA...",1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,PVC RESIN B57,OJUS PETROCHEMICALS LLPC,,ICICINBBA,11G11F59
4,PHTHALIC ANHYDRIDE (PA)QUANTITY = 306 MT UNIT ...,AHMED SAEED AFIFI FACTORY CO. FORRESINS LTD. P...,NAN YA PLASTICS CORPORATION201 TUN HWA NORTH R...,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,PHTHALIC ANHYDRIDE PA,AHMED SAEED AFIFI FACTORY CO FORRESINS LTD,,NCBKSAJEA,27P11346
...,...,...,...,...,...,...,...,...,...,...,...
20732,PVC SUSPENSION RESIN S65D FOR 70MT AT USD 1270...,"MOHAN TRADING CO._x000D_\n19 - DADA COLONY, IN...","FORMOSA PLASTICS CORPORATION_x000D_\n201, TUNG...",1.BENEFICIARY'S SIGNED COMMERCIAL INVOICES IN ...,1.ALL DOCUMENTS MUST BE IN ENGLISH._x000D_\n+_...,ON RECEIPT OF DOCUMENTS STRICTLY IN CONFORMITY...,PVC SUSPENSION RESIN,MOHAN TRADING CO,,KKBKINBBA,
20733,70.00 MT (IN 4X20 FT CONTAINERS)_x000D_\nPVC R...,"YASH PACKAGING_x000D_\nPLOT NO. A2/2220, 3RD P...","FORMOSA PLASTICS CORPORATION_x000D_\n201, TUNG...",1.BENEFICIARYS MANUALLY SIGNED COMMERCIAL INVO...,1. ALL DOCUMENTS MUST BE IN ENGLISH._x000D_\n+...,ON RECEIPT OF DOCUMENTS STRICTLY IN CONFORMITY...,PVC RESIN S60,YASH PACKAGING_x000D,,KKBKINBBA,11G11J49
20734,EPOXY RESIN_x000D_\nNPEF-178_x000D_\nFOB ANY P...,"ALLCHEM INTERNATIONAL PTY., LTD._x000D_\n4/F, ...",NAN YA PLASTICS CORP_x000D_\n201 TUNG HWA N. R...,+SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL AND 1...,+DOCUMENTS MUST BE PRESENTED WITHIN 10 DAYS AF...,+TO NEGOTIATING BANK ONLY: PLEASE FORWARD ALL ...,EPOXY RESIN,ALLCHEM INTERNATIONAL PTY LTD,,CTCBHKHHA,
20735,EPOXY RESIN_x000D_\nNPEL-128_x000D_\nCIF SAVAN...,"ALLCHEM INTERNATIONAL PTY., LTD._x000D_\n4/F, ...",NAN YA PLASTICS CORP_x000D_\n201 TUNG HWA N. R...,+SIGNED COMMERCIAL INVOICE IN 1 ORIGINAL AND 1...,+DOCUMENTS MUST BE PRESENTED WITHIN 10 DAYS AF...,+TO NEGOTIATING BANK ONLY: PLEASE FORWARD ALL ...,EPOXY RESIN,ALLCHEM INTERNATIONAL PTY LTD,,CTCBHKHHA,


In [6]:

class 公司預測器(object):
    def __init__(self):
        self.公司寶典 = pd.read_csv('../data/寶典/公司寶典加尾綴(擴充版).csv')
        self.nlp = self.load_nlp('../models/Product_Data_SQuAD_model_公司.pt')
        
    def load_nlp(self,path):
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))
        model.eval()
        nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
        return nlp
    
    def preprocess_59(self,x): # 公司59欄位預處理
        x = str(x) #轉str
        x = re.sub('[\u4e00-\u9fa5]', '', x) # 去除中文
        x = re.sub(r'[^\w\s]','',x) # 去除標點符號
        x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 去除換行符號
        x = x.replace('r','').replace('n','')
        return str.strip(x)

    def model_predict(self,nlp,df,question,x_col,y_col,start_from0=False):
        table = pd.DataFrame()
        idx_list = sorted(df.index.tolist())
        for i in tqdm(idx_list):
            sample = df.loc[[i]]
            string_X_train = sample[x_col].values[0]
            QA_input = {
                'question': question,
                'context': string_X_train
            }
            res = nlp(QA_input)
            if start_from0 == False:
                predict = QA_input['context'][res['start']:res['end']]
            else:
                predict = QA_input['context'][0:res['end']]
            row = pd.DataFrame({y_col:predict},index=[i])
            table = table.append(row)
        return [ i for i in table[y_col].values.tolist()] # list of string


    def predict(self,df):
        df['59'] = df['59'].apply(lambda x:self.preprocess_59(x))
        df['受益人'] = 'not find'
        for i in df.index:
            x = df.loc[i,'59']
            # 1寶典匹配法
            for a in self.公司寶典['公司英文名稱'].values.tolist():
                if (a in x) & (df.loc[i,'受益人'] == 'not find'):
                    df.loc[i,'受益人'] = a
            # 2尾綴匹配法
            for b in self.公司寶典['尾綴'].values.tolist():
                if (b in x) & (df.loc[i,'受益人'] == 'not find'):
                    df.loc[i,'受益人'] = x[:x.find(b)+len(b)]
        # 若 1,2 方法都不行則用bert
        not_find_idx = df.loc[df['受益人'] == 'not find',:].index
        if len(not_find_idx) > 0:
            bert_predict = self.model_predict(
                self.nlp,
                df.rename(columns={'59':'string_X_train'}).loc[not_find_idx],
                question = 'What is the company name?',
                start_from0 = True,
                x_col='string_X_train',
                y_col='受益人')
            df.loc[not_find_idx,'受益人'] = bert_predict
        return df
受益人_predictor = 公司預測器()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [7]:
error_idx = list(set(df.index) - set(df.dropna(subset=['受益人']).index))
df.loc[error_idx,'受益人'] = 受益人_predictor.predict(df.loc[error_idx,['59']])['受益人'].values.tolist()
df.sample(15)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/6471 [00:00<?, ?it/s]

Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
4680,CIF NAGOYA_x000D_\n1)PURCHASE CONTRACT NO.JG18...,"MARUBENI INTEX CO.,LTD._x000D_\n2-1 DOJIMAHAMA...","NAN YA PLASTICS CORP_x000D_\n201,TUNG HWA NORT...",+ COMMERCIAL INVOICE IN 1 ORIGINAL 2 COPIES_x0...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ 5 P...,"ALL DOCS TO BE SENT DIRECTLY TO US(3-10-19, MI...",POLYESTER FILAMENT YARN,MARUBENI INTEX COLTD,NAN YA PLASTICS CORP,SMBCJPJTD,
447,NO. COMMODITY QUANTITY UNIT PRICE A...,BINH TIEN BIEN HOA COMPANY LIMITED_x000D_\nADD...,FORMOSA PLASTICS CORPORATION_x000D_\nADD.:201 ...,1. SIGNED COMMERCIAL INVOICE_x000D_\n2. FULL (...,1. ONE ADDITIONAL COPY/PHOTOCOPY OF ALL REQUIR...,+THE AMOUNT AND DATE OF EACH DRAWING MUST BE E...,PACKING,BINH TIEN BIEN HOA COMPANY LIMITED,FORMOSA PLASTICS CORPORATION,ICBVVNVXA,18F12021
14773,1.POLYESTER RECYCLED TEXTURED YARN A GRADE2.PO...,"LIANG WOEI FIBRE CO., LTD.ROOM C,FL.10,NO.63,S...",NAN YA PLASTICS CORP.POLYESTER FIBER DIV.201 T...,+SIGNED COMMERCIAL INVOICE IN 2 ORIGINAL(S) IN...,+AN EXTRA COPY OF ALL DOCUMENTS IS REQUIRED FO...,+A DISCREPANCY FEE OF USD70.00(JPY7700.00 OR E...,POLYESTER,LIANG WOEI FIBRE CO LTDROOM,NAN YA PLASTICS CORP,BANK,
9015,,,200115,ANY PORT IN TAIWAN,"CAT LAI PORT, HOCHIMINH CITY,VIETNAM",10,NAN,,200115,BKTWTWT0A,14C9C060
2751,25.2 MT TAIRILAC ABS RESINS GRADE NO AG12A0 AT...,"MONSTER POLYMERS INC._x000D_\nPO BOX 145, 300 ...","FORMOSA CHEMICALS AND FIBRE CORP_x000D_\n201, ...",+3 ORIGINALS AND 3 COPIES COMMERCIAL INVOICE ...,+CONTAINER SHIPMENT REQUIRED._x000D_\nPO C8012...,UNLESS OTHERWISE SPECIFIED NEGOTIATING BANK WI...,ABS RESIN,MONSTER POLYMERS INC,FORMOSA CHEMICALS AND FIBRE CORP,ROYCCAT2D,4AT11056
911,SHIPMENT OF_x000D_\nPVC SUSPENSION RESIN S-65D...,RYTOIL PETROCHEMICALS LLP_x000D_\nBASEMENT 209...,"FORMOSA PLASTICS CORPORATION_x000D_\n201,TUNG ...",1. FULL SET OF(3 NEGOTIABLE COPIES PLUS 3 NON-...,1.ORIGINAL DOCUMENTS TO BE SENT IN ONE LOT BY ...,WE SHALL REMIT THE PROCEEDS TO YOU UPON RECEIP...,PVC SUSPENSION RESIN,RYTOIL PETROCHEMICALS LLP,FORMOSA PLASTICS CORPORATION,PUNBINBBA,
11114,,,200902,ANY TAIWAN SEAPORT,"KARACHI SEAPORT, PAKISTAN",15/FRM SHIPMENT DATE BUT WITHIN EXPIRY,NAN,,200902,BKTWTWT0A,2PL16168
10690,,,200319,KAOHSIUNG PORT,"NHAVA SHEVA PORT, INDIA",21/DAYS FROM THE DATE OF SHIPMENT,NAN,,200319,BKTWTWT0A,61B15247
6903,COMMODITY QUANTITY UNIT PRICE_x000D...,"SHANTOU LIXIN PLASTIC PRODUCTS_x000D_\nCO.,LTD...","FORMOSA PLASTICS CORPORATION_x000D_\nNO.201, T...",1. SIGNED COMMERCIAL INVOICE IN 6 ORIGINALS IN...,1. ALL DOCUMENTS TO BE FORWARDED IN ONE COVER....,"UPON OUR RECEIPT OF THE DOCUMENTS IN ORDER, WE...",HDPE TAISOX 8010,SHANTOU LIXIN PLASTIC PRODUCTS_x000D_COLTD,FORMOSA PLASTICS CORPORATION,BKCHCNBJA,18F15029
11167,,,200907,ANY TAIWANESE PORT,BUSAN PORT SOUTH KOREA,21,NAN,,200907,BKTWTWT0A,2PL15156


In [8]:
#==================銀行預測部分==================================================================

class 銀行預測器(object):
    def __init__(self):
        self.銀行列表 = np.load('../data/寶典/銀行寶典.npy')
        self.nlp = self.load_nlp('../models/Product_Data_SQuAD_model_銀行.pt')
        
    def load_nlp(self,path):
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
        model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))
        model.eval()
        nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
        return nlp
    
    def preprocess_銀行(self,x):
        x = str(x) # 0.轉字串
        x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
        x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
        x = x.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') # 3.去除換行符號
        x = x.replace('x000D','') # 4.移除'x000D'
        x = ' ' + str.strip(x) + ' ' # 5.移除左右空白 在左右各加一格空白
        return x
    
    def preprocess_銀行輸出(self,x):
        x = str(x) # 0.轉字串
        x = re.sub('[\u4e00-\u9fa5]', '', x) # 1.去除中文
        x = re.sub('[’!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~，。,.]', '', x) # 2.去除標點符號
        x = x.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') # 3.去除換行符號
        x = ' '.join([ i.replace(' ','') for i in x.split(' ')])
        x = str.strip(x) # 4.移除左右空白
        return x

    def model_predict(self,nlp,df,question,x_col,y_col,start_from0=False):
        table = pd.DataFrame()
        idx_list = sorted(df.index.tolist())
        for i in tqdm(idx_list):
            sample = df.loc[[i]]
            string_X_train = sample[x_col].values[0]
            QA_input = {
                'question': question,
                'context': str(string_X_train)
            }
            res = nlp(QA_input)
            if start_from0 == False:
                predict = QA_input['context'][res['start']:res['end']]
            else:
                predict = QA_input['context'][0:res['end']]
            row = pd.DataFrame({y_col:predict},index=[i])
            table = table.append(row)
        return [ i for i in table[y_col].values.tolist()] # list of string

    def predict(self,df):
        df['銀行輸入'] = df['46A'] + ' ' + df['47A'] + ' ' + df['78']
        df['開狀銀行'] = 'not find'
        for i in df.index:
            x = df.loc[i,'銀行輸入']
            # 先試寶典匹配法
            for a in self.銀行列表:
                if (str(a) in str(x)) & (df.loc[i,'開狀銀行'] == 'not find'):
                    df.loc[i,'開狀銀行'] = a
        # 若寶典匹配不到則用bert
        not_find_idx = df.loc[df['開狀銀行'] == 'not find',:].index
        if len(not_find_idx) > 0:
            bert_predict = self.model_predict(
                self.nlp,
                df.rename(columns={'銀行輸入':'string_X_train'}).loc[not_find_idx],
                question = 'What is the bank name?',
                start_from0 = False,
                x_col = 'string_X_train',
                y_col = '開狀銀行',
                )
            df.loc[not_find_idx,'開狀銀行'] = bert_predict
            df.loc[not_find_idx,'開狀銀行'] = df.loc[not_find_idx,'開狀銀行'].apply(lambda x:self.preprocess_銀行輸出(x))
        return df
銀行_predictor = 銀行預測器()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [9]:
#error_idx = list(set(df.index) - set(df.dropna(subset=['開狀銀行']).index))
#df.loc[error_idx,'開狀銀行'] = 銀行_predictor.predict(df.loc[error_idx,['46A','47A','78']])['開狀銀行'].values.tolist()
df.head(10)

Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
0,SHIPMENT OFPVC SUSPENSION RESIN S-65D QTY 18 M...,"RYTOIL PETROCHEMICALS LLPBASEMENT 2095, I BLOC...","FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1. FULL SET OF(3 NEGOTIABLE COPIES PLUS 3 NON-...,1.ORIGINAL DOCUMENTS TO BE SENT IN ONE LOT BY ...,WE SHALL REMIT THE PROCEEDS TO YOU UPON RECEIP...,PVC SUSPENSION RESIN,RYTOIL PETROCHEMICALS LLP,FORMOSA PLASTICS CORPORATION,PUNBINBBA,
1,"17 MT PVC COPOLYMER RESIN, C-15 AT USD 1470 ...","S R POLYVINYL LTD4261/3, JAI MATA MARKET TRI N...","FORMOSA PLASTICS CORPORATION201, TUNG HWA NORT...",1.DRAFTS FOR 100PCT OF INVOICE VALUE..2.COMPLE...,1.ALL DOCUMENT MUST MENTION OUR LC NUMBER AND ...,REFER FIELD 47A.,PVC COPOLYMER RESIN,S R POLYVINYL LTD42613,FORMOSA PLASTICS CORPORATION,INDBINBBA,
2,"+TERMS OF SALE: CIF NHAVA SHEVA PORT, INDIA++7...",OSWAL CABLE PRODUCTS PVT LTDA 93/1 WAZIRPUR GR...,FORMOSA PLASTICS CORPORATION201. TUNG HWA N. R...,+1. SHIPPED ON BOARD OCEAN BILLS OF LADING (FU...,+1. DOCUMENTS TO BE SENT DIRECTLY TO US IN ONE...,WE HEREBY UNDERTAKE WITH DRAWERS AND/OR BONAFI...,PVC RESIN S60,OSWAL CABLE PRODUCTS PVT LTDA,FORMOSA PLASTICS CORPORATION,CITIINBXA,
3,"MASS PVC RESIN, B-57QUANTITY 175 MT AT 1300 US...",OJUS PETROCHEMICALS LLPC 289 NIRALA NAGAR LUCK...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA N ROA...",1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,PVC RESIN B57,OJUS PETROCHEMICALS LLPC,FORMOSA PLASTICS CORPORATION,ICICINBBA,11G11F59
4,PHTHALIC ANHYDRIDE (PA)QUANTITY = 306 MT UNIT ...,AHMED SAEED AFIFI FACTORY CO. FORRESINS LTD. P...,NAN YA PLASTICS CORPORATION201 TUN HWA NORTH R...,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,PHTHALIC ANHYDRIDE PA,AHMED SAEED AFIFI FACTORY CO FORRESINS LTD,NAN YA PLASTICS CORPORATION,NCBKSAJEA,27P11346
5,+ COMMODITY: LLDPE TAISOX 3470+ QUANTITY: 32.0...,MINH KHANG CHEMICAL TRADINGJOINT STOCK COMPANY...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1.SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS AN...,+ALL DOCUMENTS MUST MADE IN ENGLISH.+ALL DOCUM...,+ UPON RECEIPT OF ALL DOCUMENTS SENT TO US(VIE...,CIF HAIPHONG PORT,MINH KHANG CHEMICAL TRADINGJOINT,FORMOSA PLASTICS CORPORATION,VBAAVNVXA,18Q0C097
6,"+TERM OF SALE: CIF MUNDRA SEAPORT, INDIA++70 M...",OSWAL CABLE PRODUCTS PVT LTDA 93/1 WAZIRPUR GR...,"FORMOSA PLASTICS CORPORATION201, TUNG HWA NORT...",+1. SHIPPED ON BOARD OCEAN BILLS OF LADING (FU...,+1. DOCUMENTS TO BE SENT DIRECTLY TO US IN ONE...,WE HEREBY UNDERTAKE WITH DRAWERS AND/OR BONAFI...,PVC SUSPENSION RESIN,OSWAL CABLE PRODUCTS PVT LTDA,FORMOSA PLASTICS CORPORATION,CITIINBXA,
7,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D11126
8,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D11126
9,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D12134


In [10]:
df.shape

(20737, 11)

In [11]:
df.dropna(subset=['產品名','開狀人','受益人','開狀銀行','EXPNO']).shape

(13973, 11)

In [12]:
df = df.dropna(subset=['產品名','開狀人','受益人','開狀銀行','EXPNO'])
df

Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
3,"MASS PVC RESIN, B-57QUANTITY 175 MT AT 1300 US...",OJUS PETROCHEMICALS LLPC 289 NIRALA NAGAR LUCK...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA N ROA...",1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,PVC RESIN B57,OJUS PETROCHEMICALS LLPC,FORMOSA PLASTICS CORPORATION,ICICINBBA,11G11F59
4,PHTHALIC ANHYDRIDE (PA)QUANTITY = 306 MT UNIT ...,AHMED SAEED AFIFI FACTORY CO. FORRESINS LTD. P...,NAN YA PLASTICS CORPORATION201 TUN HWA NORTH R...,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,PHTHALIC ANHYDRIDE PA,AHMED SAEED AFIFI FACTORY CO FORRESINS LTD,NAN YA PLASTICS CORPORATION,NCBKSAJEA,27P11346
5,+ COMMODITY: LLDPE TAISOX 3470+ QUANTITY: 32.0...,MINH KHANG CHEMICAL TRADINGJOINT STOCK COMPANY...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1.SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS AN...,+ALL DOCUMENTS MUST MADE IN ENGLISH.+ALL DOCUM...,+ UPON RECEIPT OF ALL DOCUMENTS SENT TO US(VIE...,CIF HAIPHONG PORT,MINH KHANG CHEMICAL TRADINGJOINT,FORMOSA PLASTICS CORPORATION,VBAAVNVXA,18Q0C097
7,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D11126
8,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D11126
...,...,...,...,...,...,...,...,...,...,...,...
20728,"CFR KOBE, JAPAN_x000D_\nVISCOSE RAYON STAPLE F...","MARUBENI INTEX CO.,LTD._x000D_\n1-2-1 DOJIMAHA...",FORMOSA CHEMICALS AND FIBRE_x000D_\nCORPORATIO...,+ COMMERCIAL INVOICE IN 3 ORIGINAL_x000D_\n+ 2...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ INS...,"ALL DOCS TO BE SENT DIRECTLY TO US (3-10-19, M...",VISCOSE RAYON STAPLE FIBER,MARUBENI INTEX COLTD,FORMOSA CHEMICALS AND FIBRE_x000D_CORPORATION,SMBCJPJTD,41G0C164
20729,"CFR KOBE, JAPAN_x000D_\nVISCOSE RAYON STAPLE F...","MARUBENI INTEX CO.,LTD._x000D_\n1-2-1 DOJIMAHA...",FORMOSA CHEMICALS AND FIBRE_x000D_\nCORPORATIO...,+ COMMERCIAL INVOICE IN 3 ORIGINAL_x000D_\n+ 2...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ INS...,"ALL DOCS TO BE SENT DIRECTLY TO US (3-10-19, M...",VISCOSE RAYON STAPLE FIBER,MARUBENI INTEX COLTD,FORMOSA CHEMICALS AND FIBRE_x000D_CORPORATION,SMBCJPJTD,41G12129
20730,"CFR KOBE, JAPAN_x000D_\nVISCOSE RAYON STAPLE F...","MARUBENI INTEX CO.,LTD._x000D_\n1-2-1 DOJIMAHA...",FORMOSA CHEMICALS AND FIBRE_x000D_\nCORPORATIO...,+ COMMERCIAL INVOICE IN 3 ORIGINAL_x000D_\n+ 2...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ INS...,"ALL DOCS TO BE SENT DIRECTLY TO US (3-10-19, M...",VISCOSE RAYON STAPLE FIBER,MARUBENI INTEX COLTD,FORMOSA CHEMICALS AND FIBRE_x000D_CORPORATION,SMBCJPJTD,41G12129
20731,187 MT PVC RESIN SUSPENSION GRADE S-65D OF FOR...,"PREMIER POLYFILM LTD_x000D_\nA-13, INDUSTRIAL ...","FORMOSA PLASTICS CORPORATION_x000D_\n201, TUNG...",1.BENEFICIARY'S SIGNED COMMERCIAL INVOICE IN T...,1.ALL DOCUMENTS MUST BE IN ENGLISH LANGUAGE._x...,ON RECEIPT OF DOCUMENTS STRICTLY IN CONFORMITY...,PVC RESIN,PREMIER POLYFILM LTD,FORMOSA PLASTICS CORPORATION,KKBKINBBA,11G11G85


In [13]:
df = df.replace('nan',np.nan)
df.dropna(subset=['產品名','開狀人','受益人','開狀銀行','EXPNO']).shape

(8231, 11)

In [14]:
df = df.dropna(subset=['產品名','開狀人','受益人','開狀銀行','EXPNO'])
df

Unnamed: 0,45A,50,59,46A,47A,78,產品名,開狀人,受益人,開狀銀行,EXPNO
3,"MASS PVC RESIN, B-57QUANTITY 175 MT AT 1300 US...",OJUS PETROCHEMICALS LLPC 289 NIRALA NAGAR LUCK...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA N ROA...",1. SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AN...,1. ALL DOCUMENTS MUST BE IN ENGLISH.2. ALL DOC...,+UPON RECEIPT OF CREDIT COMPLIANT DOCUMENTS AT...,PVC RESIN B57,OJUS PETROCHEMICALS LLPC,FORMOSA PLASTICS CORPORATION,ICICINBBA,11G11F59
4,PHTHALIC ANHYDRIDE (PA)QUANTITY = 306 MT UNIT ...,AHMED SAEED AFIFI FACTORY CO. FORRESINS LTD. P...,NAN YA PLASTICS CORPORATION201 TUN HWA NORTH R...,1. SIGNED COMMERCIAL INVOICE IN ORIGINAL AND 4...,(A) ALL DOCUMENTS AND DRAFTS (IF CALLED FOR UN...,IN REIMBURSEMENT OF NEGOTIATION MADE BY YOUIN ...,PHTHALIC ANHYDRIDE PA,AHMED SAEED AFIFI FACTORY CO FORRESINS LTD,NAN YA PLASTICS CORPORATION,NCBKSAJEA,27P11346
5,+ COMMODITY: LLDPE TAISOX 3470+ QUANTITY: 32.0...,MINH KHANG CHEMICAL TRADINGJOINT STOCK COMPANY...,"FORMOSA PLASTICS CORPORATION201,TUNG HWA NORTH...",1.SIGNED COMMERCIAL INVOICE IN 03 ORIGINALS AN...,+ALL DOCUMENTS MUST MADE IN ENGLISH.+ALL DOCUM...,+ UPON RECEIPT OF ALL DOCUMENTS SENT TO US(VIE...,CIF HAIPHONG PORT,MINH KHANG CHEMICAL TRADINGJOINT,FORMOSA PLASTICS CORPORATION,VBAAVNVXA,18Q0C097
7,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D11126
8,COMMODITY ...,"CHORI CO., LTD.TEL NO.81-76-232-3008FAX NO.81-...","NAN YA PLASTICS CORPORATIONNO.201,TUNG HWA N.R...",+SIGNED COMMERCIAL INVOICE IN TRIPLICATE INDIC...,ALL DRAFT(S) DRAWN HEREUNDER MUST BE MARKED ''...,ALL DOCUMENTS INCLUDING BENEFICIARY'S DRAFT(S)...,POLYESTER PARTIALY ORIENTED YARN,CHORI CO LTDTEL,NAN YA PLASTICS CORPORATION,MHCBJPJTD,25D11126
...,...,...,...,...,...,...,...,...,...,...,...
20728,"CFR KOBE, JAPAN_x000D_\nVISCOSE RAYON STAPLE F...","MARUBENI INTEX CO.,LTD._x000D_\n1-2-1 DOJIMAHA...",FORMOSA CHEMICALS AND FIBRE_x000D_\nCORPORATIO...,+ COMMERCIAL INVOICE IN 3 ORIGINAL_x000D_\n+ 2...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ INS...,"ALL DOCS TO BE SENT DIRECTLY TO US (3-10-19, M...",VISCOSE RAYON STAPLE FIBER,MARUBENI INTEX COLTD,FORMOSA CHEMICALS AND FIBRE_x000D_CORPORATION,SMBCJPJTD,41G0C164
20729,"CFR KOBE, JAPAN_x000D_\nVISCOSE RAYON STAPLE F...","MARUBENI INTEX CO.,LTD._x000D_\n1-2-1 DOJIMAHA...",FORMOSA CHEMICALS AND FIBRE_x000D_\nCORPORATIO...,+ COMMERCIAL INVOICE IN 3 ORIGINAL_x000D_\n+ 2...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ INS...,"ALL DOCS TO BE SENT DIRECTLY TO US (3-10-19, M...",VISCOSE RAYON STAPLE FIBER,MARUBENI INTEX COLTD,FORMOSA CHEMICALS AND FIBRE_x000D_CORPORATION,SMBCJPJTD,41G12129
20730,"CFR KOBE, JAPAN_x000D_\nVISCOSE RAYON STAPLE F...","MARUBENI INTEX CO.,LTD._x000D_\n1-2-1 DOJIMAHA...",FORMOSA CHEMICALS AND FIBRE_x000D_\nCORPORATIO...,+ COMMERCIAL INVOICE IN 3 ORIGINAL_x000D_\n+ 2...,+ T.T.REIMBURSEMENT : PROHIBITED_x000D_\n+ INS...,"ALL DOCS TO BE SENT DIRECTLY TO US (3-10-19, M...",VISCOSE RAYON STAPLE FIBER,MARUBENI INTEX COLTD,FORMOSA CHEMICALS AND FIBRE_x000D_CORPORATION,SMBCJPJTD,41G12129
20731,187 MT PVC RESIN SUSPENSION GRADE S-65D OF FOR...,"PREMIER POLYFILM LTD_x000D_\nA-13, INDUSTRIAL ...","FORMOSA PLASTICS CORPORATION_x000D_\n201, TUNG...",1.BENEFICIARY'S SIGNED COMMERCIAL INVOICE IN T...,1.ALL DOCUMENTS MUST BE IN ENGLISH LANGUAGE._x...,ON RECEIPT OF DOCUMENTS STRICTLY IN CONFORMITY...,PVC RESIN,PREMIER POLYFILM LTD,FORMOSA PLASTICS CORPORATION,KKBKINBBA,11G11G85


In [15]:
df.to_csv('../data/對應表/EXPNO對應表.csv')

In [16]:
1+1

2