In [133]:
import pandas as pd
import re
import time

# 基於規則之匹配算法
def matching(sentence,database):
  candidate_list = []
  for word in database:
    if word in sentence: 
      candidate_list.append(word)
  return candidate_list

# rule對出來的產品名若為其他產品名的子集則剔除
def substringSieve(string_list):
    string_list.sort(key=lambda s: len(s), reverse=True)
    out = []
    for s in string_list:
        if not any([s in o for o in out]):
            out.append(s)
    return out

# 輸入sentence前處理
def preprocess_raw_sentence(x):
    x = str(x).upper() # 轉大寫字串
    x = re.sub('[\u4e00-\u9fa5]', '', x) # 去除中文
    x = re.sub(r'[^\w\s]','',x) # 去除標點符號
    x = x.replace('\n', '').replace('\r', '').replace('\t', '') # 去除換行符號
    str.strip(x) # 移除左右空白
    x = x.replace('   ', ' ')# 去除三重空白
    x = x.replace('  ', ' ')# 去除雙重空白
    x = ' ' + x + ' '# 出現在頭的 就不可能對到前後加空格的 這種情形要想想怎麼對照(加上左右空白)
    return x

# 產品後處理
def product_name_postprocess(x):
    x = str(x).upper() # 轉大寫字串
    x = x.replace('-','')# 去除標點符號
    x = x.replace('.','')# 去除標點符號
    x = x.replace(',','')# 去除標點符號
    x = x.strip() # 去除空白
    return x

# 預測函數
def predict_keyword(title,test_df,Unrecognized,input_col,database,output_col):
    output_list = []
    for i in test_df.index:
        candidate_list = matching(
            sentence = test_df.loc[i,input_col],
            database = set(database) - set(Unrecognized)
            )
        try:
            output_list.append(max(substringSieve(candidate_list), key=len))
        except:
            output_list.append(None)
    return output_list

# 讀取產品名資料庫
品名寶典 = pd.read_excel('../data/寶典/寶典人工處理後/寶典.v8.202111202.xlsx',engine='openpyxl')[['CODIV','DIVNM','ITEMNM']]
品名寶典 = 品名寶典.rename(columns={'ITEMNM':'品名','DIVNM':'公司事業部門','CODIV':'公司代號'})
品名寶典['品名'] = 品名寶典['品名'].apply(lambda x:product_name_postprocess(x))

# 上傳檔案
test_df = pd.read_csv('../data/測試數據/0927到2022.csv')

# 針對模型輸入做預處理
產品名輸入 = '45A' #產品名
開狀人輸入 = '50' #開狀人
受益人輸入 = '59' #受益人
開狀銀行輸入 = 'LTADDRESS.1' #銀行輸入
for i in [產品名輸入,開狀人輸入,受益人輸入]:
    test_df[i] = test_df[i].apply(lambda x:preprocess_raw_sentence(x))

# 預測程序
output = pd.DataFrame()
output[產品名輸入] = test_df[產品名輸入]
output['產品名'] = predict_keyword(
    title = '正在預測產品',
    test_df = test_df,
    Unrecognized = ['PE','MA','EA','GRADE','INA','PACK','PP','PA','',' '],
    input_col = 產品名輸入,
    database = 品名寶典['品名'].values.tolist(),
    output_col = '產品名',
)
output = output.dropna(axis=0)
output

Unnamed: 0,45A,產品名
0,48 MT PVC SUSPENSION RESIN GRADE S65AT THE RA...,PVC SUSPENSION RESIN
1,TETRAHYDROFURAN AT USD 5550 TOTAL AMOUNTUSD 1...,TETRAHYDROFURAN
2,A 19800 MT OF TAIRILAC AG15A1HABS ABS RESINAT...,ABS RESIN
3,80 MT PVC SUSPENSION RESIN GRADE S65S AT THE ...,PVC SUSPENSION RESIN
4,PVC SUSPENSION RESIN S70RQUANTITY 72 MT UNIT ...,PVC SUSPENSION RESIN
...,...,...
2421,210 MT OF PVC MASS RESIN B 57 AT THE RATE OF ...,PVC MASS RESIN
2423,WHITE OIL F380N WHITE OIL F550NCONTRACT NO BO...,WHITE OIL
2424,QTY 54 MTS OF PVC SUSPENSION RESIN S65D AT TH...,PVC SUSPENSION RESIN
2425,COMMODITY AMOUNTPOLYESTER TEXTURED YA...,POLYESTER PARTIALY ORIENTED YARN


# 起始結束位置標註

In [134]:
df = output.copy()
df = df.rename(columns = {
    '45A':'string_X_train',
    '產品名':'Y_label'})

def str2index(context,string):
    ys = context.find(string)
    ye = ys + len(string)
    return ys,ye

ys_lst,ye_lst = [],[]
for i in range(len(df)):
    ys,ye = str2index(df['string_X_train'].values[i],df['Y_label'].values[i])
    ys_lst.append(ys),ye_lst.append(ye)
    
df['string_Y_1'] = ys_lst
df['string_Y_2'] = ye_lst
print(1,df.shape)

df = df[df['Y_label']!=''] # 去掉空值
df = df[df['Y_label']!=' '] # 去掉空值
print(2,df.shape)

df = df.loc[df['string_Y_1'] != -1,:] # 去掉找不到答案的
print(3,df.shape)

df = df.dropna(subset=['string_X_train','Y_label'],axis=0) # 去掉NAN
print(4,df.shape)

df.head(10)#這一個dataframe不一定跟當初訓練bert的data呈現iid分布 這是一個問題 會導致準度很差

1 (2323, 4)
2 (2323, 4)
3 (2323, 4)
4 (2323, 4)


Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
0,48 MT PVC SUSPENSION RESIN GRADE S65AT THE RA...,PVC SUSPENSION RESIN,7,27
1,TETRAHYDROFURAN AT USD 5550 TOTAL AMOUNTUSD 1...,TETRAHYDROFURAN,1,16
2,A 19800 MT OF TAIRILAC AG15A1HABS ABS RESINAT...,ABS RESIN,35,44
3,80 MT PVC SUSPENSION RESIN GRADE S65S AT THE ...,PVC SUSPENSION RESIN,7,27
4,PVC SUSPENSION RESIN S70RQUANTITY 72 MT UNIT ...,PVC SUSPENSION RESIN,1,21
5,COMMODITY QUANTITY UNIT PRICE AMOUNT M...,POLYPROPYLENE RESIN,75,94
6,CIP DONGGUANHIPS RESIN GRADE NOHP8250 1800MT...,HIPS RESIN,13,23
7,NBUTANOLQUANTITY 300000000KG5UNIT PRICE USD 2...,NBUTANOL,1,9
8,35 MT OF PVC MASS RESIN GRADE B57 AT USD 1620...,PVC MASS RESIN,10,24
9,525 MT PVC RESIN GRADE B57 AT USD 1620 PER MT...,PVC RESIN,8,17


# 載入模型

In [135]:
def load_nlp(path,model,tokenizer):
    model.load_state_dict(torch.load(path,map_location=torch.device('cpu')))
    model.eval()
    nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
    return nlp
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
nlp = load_nlp('../models/Product_Data_SQuAD_model_product.pt',model,tokenizer)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

# 推論

In [136]:
def test_model(nlp,df,qustion,start_idx_from_0):
    table = pd.DataFrame()
    for i in tqdm(df.index):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        Y_label = sample['Y_label'].values[0]  
        QA_input = {
            'question': qustion,
            'context': string_X_train
        }
        res = nlp(QA_input)
        if start_idx_from_0 == True:
            predict = QA_input['context'][0:res['end']]
        else:
            predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'Y_label':Y_label,'predict':predict},index=[i])
        if Y_label == predict:
            row['是否全對'] = 'Yes'
        else:
            row['是否全對'] = 'No'
        table = table.append(row)
    return table
table = test_model(
    nlp = nlp,
    df = df,
    qustion = 'What is the product name?',
    start_idx_from_0 = False)

100%|██████████| 2323/2323 [01:39<00:00, 23.43it/s]


# 展示結果

In [137]:
table

Unnamed: 0,Y_label,predict,是否全對
0,PVC SUSPENSION RESIN,PVC SUSPENSION RESIN,Yes
1,TETRAHYDROFURAN,TETRAHYDROFURAN,Yes
2,ABS RESIN,RESINAT,No
3,PVC SUSPENSION RESIN,PVC SUSPENSION RESIN,Yes
4,PVC SUSPENSION RESIN,PVC SUSPENSION RESIN S70RQUANTITY,No
...,...,...,...
2421,PVC MASS RESIN,PVC MASS RESIN,Yes
2423,WHITE OIL,F550NCONTRACT,No
2424,PVC SUSPENSION RESIN,PVC MASS RESIN B57,No
2425,POLYESTER PARTIALY ORIENTED YARN,PARTIALY ORIENTED YARN,No


In [143]:
import numpy as np

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def get_acc(df,t=1):
        correct = []
        correct_label = []
        for i in df.index:
            jac = get_jaccard_sim(df.loc[i,'Y_label'],df.loc[i,'predict'])
            if jac >= t:
                correct.append('yes')
            else:
                correct.append('no')
        a = pd.Series(correct)
        return a.value_counts()['yes']/len(a)

result = pd.DataFrame()
result['jaccard'] = [np.mean([ get_jaccard_sim(table['Y_label'].loc[i],table['predict'].loc[i]) for i in table.index])]
result['acc(1.0)'] = [get_acc(table,1)]
result['acc(0.75)'] = [get_acc(table,0.75)]
result['acc(0.5)'] = [get_acc(table,0.5)]
result['acc(0.25)'] = [get_acc(table,0.25)]
result

Unnamed: 0,jaccard,acc(1.0),acc(0.75),acc(0.5),acc(0.25)
0,0.499664,0.247525,0.321997,0.582006,0.734395
