In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
import pandas as pd
from tqdm import tqdm_notebook as tqdm
gc.collect()

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)


model_path = '../models/Product_Data_SQuAD_model_product.pt'
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load(model_path))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)

def model_predict(nlp,df):
    table = pd.DataFrame()
    for i in tqdm(sorted(df.index.tolist())):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        QA_input = {
            'question': 'What is the product name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'predict:':predict},index=[i])
        table = table.append(row)
    return table

def substringSieve(string_list):
    out = []
    for s in string_list:
        if not any([s in r for r in string_list if s != r]):
            out.append(s)
    return out

Global seed set to 42
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should pr

# 製作寶典

In [2]:
# 載入寶典
df1 = pd.read_excel('../data/台塑企業_ 產品寶典20210303.xlsx',engine='openpyxl').iloc[:,:-1]
df2 = pd.read_excel('../data/寶典.v3.台塑網.20210901.xlsx',engine='openpyxl')
df2.columns = df1.columns
df = df1.append(df2)
df['品名'] = df['品名'].apply(lambda x:x.strip())

# 載入產品csv
val_df = pd.read_csv('../data/preprocess_for_SQUAD_產品.csv',index_col=0)[['string_X_train','Y_label','EXPNO','from']]
print(val_df.shape)

# 合併寶典和SPEC(後面四個月不要加)
產品集合 = set(df['品名'].values.tolist() + val_df.loc[val_df['from'].isin(sorted(val_df['from'].unique())[:]),'Y_label'].values.tolist())

# 對應表
品名2部門 = dict(zip(df['品名'],df['公司事業部門']))
品名2代號 = dict(zip(df['品名'],df['公司代號']))

# 驗證集
val_df = val_df.loc[val_df['from'].isin(sorted(val_df['from'].unique())[-4:]),:]
print(val_df.shape)
display(val_df.head(3))

(11200, 4)
(3092, 4)


Unnamed: 0,string_X_train,Y_label,EXPNO,from
765,1 GROUP II 500N BASE OIL 500N QUANTITY 900MT...,BASE OIL 500N,61,20210406.xlsx
766,1 GROUP II 500N BASE OIL 500N QUANTITY 900MT...,BASE OIL 500N,61,20210406.xlsx
767,COMMODITY EVA TAISOX 7470MQUANTITY1 2500MTUNIT...,TAISOX 7470M,18,20210406.xlsx


# 如果品名是單詞的話 前後加個空白

In [45]:
新產品集合 = []
for p in 產品集合:
    if ' ' not in p: # 如果是單詞
        p = f' {p.strip()} ' # 前後加空白
        新產品集合.append(p) # append
    else:
        新產品集合.append(p) # append
產品集合 = list(set(新產品集合))

In [46]:
def Collection_method(df,產品集合):
    labels = {}
    for i in tqdm(df.index):
        products = []
        for p in 產品集合:
            if p in df.loc[i,'string_X_train']:
                products.append(p) # 加入候選清單
        labels[i] = products # 這是一個清單,含多個產品
    predict = pd.DataFrame(index=labels.keys(),columns=['predict'])
    predict['predict'] = list(labels.values())
    return predict
predict = Collection_method(val_df,產品集合)
result = val_df.join(predict)
result['class'] = 'rule'

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3092.0), HTML(value='')))




In [47]:
result

Unnamed: 0,string_X_train,Y_label,EXPNO,from,predict,class
765,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx,"[BASE OIL, BASE OIL 500N]",rule
766,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx,"[BASE OIL, BASE OIL 500N]",rule
767,COMMODITY EVA TAISOX 7470M QUANTITY1 2500MTUN...,TAISOX 7470M,18,20210406.xlsx,"[ TAISOX , TAISOX 7470M, EVA ]",rule
768,COMMODITY EVA TAISOX 7470M QUANTITY1 2500MTUN...,TAISOX 7470M,,20210406.xlsx,"[ TAISOX , TAISOX 7470M, EVA ]",rule
769,480 MT OF PVC RESIN GRADE S65 ATTHE RATE OF ...,GRADE,,20210406.xlsx,"[ GRADE , RESIN , PVC RESIN]",rule
...,...,...,...,...,...,...
8203,1GPPS GP5250 QUANTITY200MT UNIT PRICEUSD13...,GP5250,,202106.xlsx,[ GP5250 ],rule
8204,COMMODITY EVA TAISOX 7350M EVA TAISOX...,TAISOX 7470M,,202106.xlsx,"[TAISOX 7350M, EVA TAISOX, TAISOX , TAISOX 74...",rule
8205,18 MT OF TAIRILAC ABS RESIN AT THE RATE USD 2...,ABS RESIN,,202106.xlsx,"[ABS RESIN, TAIRILAC , RESIN , ABS ]",rule
8206,COMMODITY TAIRILIN BRAND LOW...,GRADE,,202106.xlsx,"[ GRADE , TAIRILIN , FIBER ]",rule


In [48]:
not_find = []
for j,i in enumerate(result['predict'].values):
    if len(i) == 0:
        not_find.append(j)
len(not_find)

3

In [49]:
not_find_df = result.iloc[not_find]
not_find_df

Unnamed: 0,string_X_train,Y_label,EXPNO,from,predict,class
5205,18 MTS POM FORMOCON FM130 UNIT PRICE USD 225...,FM130,1P,202104.xlsx,[],rule
5335,TERMS OF PRICE FOB KAOHSIUNG TAIWAN COUNTRY O...,YUNGSOX 1040F,1P,202104.xlsx,[],rule
6007,TERMS OF PRICE FOB KAOHSIUNG PORT OF TAIWANCO...,YUNGSOX 2080,1P,202104.xlsx,[],rule


In [50]:
bert_predict = model_predict(nlp,not_find_df)
bert_predict

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,predict:
5205,FM130
5335,YUNGSOX
6007,YUNGSOX 2080


In [51]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [52]:
if len(not_find) > 0:
    idx = not_find_df.index
    result.loc[idx,'predict'] = [ [str(i)] for i in bert_predict['predict:'].values]
    result.loc[idx,'class'] = 'bert'
    display(result.loc[idx,:])

Unnamed: 0,string_X_train,Y_label,EXPNO,from,predict,class
5205,18 MTS POM FORMOCON FM130 UNIT PRICE USD 225...,FM130,1P,202104.xlsx,[FM130],bert
5335,TERMS OF PRICE FOB KAOHSIUNG TAIWAN COUNTRY O...,YUNGSOX 1040F,1P,202104.xlsx,[YUNGSOX],bert
6007,TERMS OF PRICE FOB KAOHSIUNG PORT OF TAIWANCO...,YUNGSOX 2080,1P,202104.xlsx,[YUNGSOX 2080],bert


In [53]:
lst = []
for i in result.iloc[not_find].index:
    if result.loc[i,'Y_label'] != result.loc[i,'predict']:
        lst.append(i)

In [54]:
def get_acc(df,t=0.75):
    correct = []
    correct_label = []
    for i in df.index:
        jacs = []
        for j in df.loc[i,'predict']:
            jacs.append(get_jaccard_sim(df.loc[i,'Y_label'],j))
        if max(jacs) >= t:
            correct.append('yes')
        else:
            correct.append('no')
    result = pd.DataFrame({'correct':correct})
    return result['correct'].value_counts()['yes']/len(result)

def get_jac(df):
    all_jacs = []
    for i in df.index:
        jacs = []
        for j in df.loc[i,'predict']:
            jacs.append(get_jaccard_sim(df.loc[i,'Y_label'],j))
        all_jacs.append(max(jacs))
    return np.sum(all_jacs)/len(all_jacs)

# 表現

In [55]:
get_acc(result,t=1),get_acc(result,t=0.75),get_jac(result)

(0.9954721862871928, 0.9954721862871928, 0.9973587753341957)

In [56]:
部門_lst = []
for p_lst in tqdm(result['predict'].values):
    p = max(p_lst,key=len)
    jac_dict = {}
    for i in 品名2代號.keys():
        jac_dict[i] = get_jaccard_sim(i,p)
    部門_lst.append(品名2代號[max(jac_dict, key=jac_dict.get)])
result['預測部門代號'] = 部門_lst
result['predict'] = [substringSieve(i) for i in result['predict']]
result['EXPNO'] = [ str(i).strip() for i in result['EXPNO']]
result = result[result['EXPNO']!=str(np.nan)]
result['預測部門代號'] = [ str(i).strip() for i in result['預測部門代號']]
display(result)
a = len(result[result['EXPNO']==result['預測部門代號']])
b = len(result[result['EXPNO']!=result['預測部門代號']])
print(f'部門預測正確數量:{a} 錯誤數量:{b} 正確率:{a/(a+b)}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3092.0), HTML(value='')))




Unnamed: 0,string_X_train,Y_label,EXPNO,from,predict,class,預測部門代號
765,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx,[BASE OIL 500N],rule,60
766,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx,[BASE OIL 500N],rule,60
767,COMMODITY EVA TAISOX 7470M QUANTITY1 2500MTUN...,TAISOX 7470M,18,20210406.xlsx,"[ TAISOX , TAISOX 7470M, EVA ]",rule,18
770,EVA TAISOX 7470M QUANTITY 10000MTS UNIT ...,TAISOX 7470M,18,20210406.xlsx,"[TAISOX 7350M, TAISOX 7470M, EVA TAISOX 7350]",rule,18
771,EVA TAISOX 7470M QUANTITY 10000MTS UNIT ...,TAISOX 7470M,18,20210406.xlsx,"[TAISOX 7350M, TAISOX 7470M, EVA TAISOX 7350]",rule,18
...,...,...,...,...,...,...,...
8168,ITEM 1180 METRIC TONS OF ABS RESIN GRADE NO ...,GRADE,4A,202106.xlsx,"[ GRADE , ABS RESIN, RESIN , ABS ]",rule,4A
8180,ITEM1 144 METRIC TONS OF ABS RESIN GRADE NO ...,GRADE,4A,202106.xlsx,"[ GRADE , ABS RESIN, RESIN , ABS , PS , PS ...",rule,4A
8181,ITEM1 144 METRIC TONS OF ABS RESIN GRADE NO ...,GRADE,4A,202106.xlsx,"[ GRADE , ABS RESIN, RESIN , ABS , PS , PS ...",rule,4A
8198,COMMODITY MONO ETHYLENE GLYCOL MEGQUANTITY 4...,MONO ETHYLENE GLYCOL,2A,202106.xlsx,[MONO ETHYLENE GLYCOL],rule,2A


部門預測正確數量:2035 錯誤數量:492 正確率:0.8053027305104867


In [57]:
result[result['EXPNO'] != result['預測部門代號']]

Unnamed: 0,string_X_train,Y_label,EXPNO,from,predict,class,預測部門代號
765,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx,[BASE OIL 500N],rule,60
766,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx,[BASE OIL 500N],rule,60
833,PROCESS OIL 380N 60MT 5 PCT AT ICIS PRICING ...,PROCESS OIL 380N,61,20210406.xlsx,[PROCESS OIL],rule,64
876,TERMS OF PRICE CIF BUSAN SOUTH KOREACOUNTRY O...,YARN,XU,20210406.xlsx,"[ GRADE , YARN , FIBER ]",rule,4A
879,TERMS OF PRICE CIF BUSAN PORTCOUNTRY OF ORIGI...,YARN,XU,20210406.xlsx,"[ YARN , FIBER ]",rule,12
...,...,...,...,...,...,...,...
8120,PLASTIC RESIN CIF HAIPHONG VIETNAM1 1 X 20 FCL...,PP 5090T,1P,202106.xlsx,"[ PP , RESIN , PLASTIC RESIN, PP 5090T]",rule,23
8124,COMMODITY PURIFIED ISOPHTHALIC ACID PIAQUANT...,ACID,48,202106.xlsx,[ ACID ],rule,12
8141,COMMODITY PURIFIED TEREPHTHALIC ACID PTA QU...,ACID,48,202106.xlsx,"[ PTA , ACID ]",rule,12
8142,COMMODITY PURIFIED TEREPHTHALIC ACID PTA QU...,ACID,48,202106.xlsx,"[ PTA , ACID ]",rule,12


# 一些小問題 公司代號跟EXPNO對不上

In [58]:

table1 = pd.DataFrame()
table2 = pd.DataFrame()
p = result[result['EXPNO'] != result['預測部門代號']]['predict'].values[0][0]
for p in list(set(品名2代號.keys())&set(result['Y_label'])):
    if 品名2代號[p] != result.loc[result['Y_label']==p,'EXPNO'].values[0]:
        table1 = table1.append(df[df['品名']==p])
        table2 = table2.append(val_df[val_df['Y_label']==p])
display(table1.drop_duplicates(subset=['品名'])),display(table2.drop_duplicates(subset=['Y_label']))

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
227,,,25,纖維事業部,RELEASE FILM
1070,,,60,國外處,BASE OIL 500N
537,,,2P,南亞塑四部化學品部,PC RESIN
1069,,,60,國外處,BASE OIL 150N
387,王睿妍,8328.0,2E,南亞纖維部,PET FILM
298,耿瑞君,8215.0,22,南亞塑一部二處,SYNTHETIC PAPER
433,王淑霓,8347.0,26,台染部,POLYESTER


Unnamed: 0,string_X_train,Y_label,EXPNO,from
5015,TERMS OF PRICE FOB TAIWAN PORTCOUNTRY OF ORIG...,RELEASE FILM,2E,202104.xlsx
765,1 GROUP II 500N BASE OIL 500N QUANTITY 900...,BASE OIL 500N,61,20210406.xlsx
7747,DESCRIPTION QUANTITY UNIT PRICE ...,PC RESIN,4A,202106.xlsx
996,BASE OIL 150N PRICING1000MT5 PCT AT ICIS PRIC...,BASE OIL 150N,61,20210406.xlsx
940,1 100000 KGS OF PP SHEET RIGID PVC SHEET A P...,PET FILM,22,20210406.xlsx
5009,1PP SYNTHETIC PAPER BJE OF5 200UM X 1054MM ...,SYNTHETIC PAPER,22,202104.xlsx
1207,4600000 KILOGRAMS POLYESTER STAPLE HOLLOWCONJ...,POLYESTER,,20210406.xlsx


(None, None)

In [59]:
result.to_csv('submit_product_0916_規則加bert.csv')