In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
import pandas as pd
from tqdm import tqdm_notebook as tqdm
gc.collect()

def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    Args:
        str1: First string; split on whitespace into a token set.
        str2: Second string; split on whitespace into a token set.

    Returns:
        float: |A ∩ B| / |A ∪ B| in [0.0, 1.0]. When neither string contains
        a token (both empty or whitespace-only) the union is empty and 0.0 is
        returned instead of raising ZeroDivisionError.
    """
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Nothing to compare: treat as "no similarity" rather than dividing by zero.
        return 0.0
    return float(len(c)) / union_size

def set_seed(seed: int = 42):
    """Seed every random number generator used in the notebook, for reproducibility.

    Seeds NumPy's global RNG, Python's ``random`` module and PyTorch (CPU and
    CUDA), switches cuDNN to deterministic / non-benchmark mode, sets the
    ``PYTHONHASHSEED`` environment variable and finally calls
    ``seed_everything``.

    Args:
        seed: Integer seed. The original signature read ``seed = int``, which
            made the *type object* the default value and crashed whenever the
            function was called without an argument; it is now a proper
            annotation with a usable default.

    Returns:
        numpy.random.RandomState: A dedicated generator seeded with ``seed``.
    """
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)


# Build the DistilBERT question-answering pipeline from the weights stored at model_path.
model_path = '../models/Product_Data_SQuAD_model_product.pt'
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# map_location='cpu': inference below runs on CPU, so deserialize the tensors
# onto CPU as well. Without it a checkpoint saved from a GPU fails to load on
# a machine that has no CUDA device.
model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)

def model_predict(nlp, df):
    """Ask the QA pipeline for the product name of every row in ``df``.

    Args:
        nlp: Callable that accepts ``{'question': ..., 'context': ...}`` and
            returns a mapping containing ``'start'`` and ``'end'`` character
            offsets into the context.
        df: DataFrame with a ``'string_X_train'`` column holding the context text.

    Returns:
        pandas.DataFrame: One row per index label of ``df`` (in sorted label
        order) with a single column named ``'predict:'`` containing the
        extracted answer span. For an empty ``df`` the frame is empty but
        still carries the ``'predict:'`` column.
    """
    indices = sorted(df.index.tolist())
    predictions = []
    for i in tqdm(indices):
        context = df.loc[[i]]['string_X_train'].values[0]
        res = nlp({
            'question': 'What is the product name?',
            'context': context
        })
        # Slice the answer out of the context using the returned offsets.
        predictions.append(context[res['start']:res['end']])
    # Build the frame once: growing a DataFrame row by row is quadratic, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame({'predict:': predictions}, index=indices)

def substringSieve(string_list):
    """Drop every string that is contained in some *different* string of the list.

    Strings are compared by value, so exact duplicates never eliminate each
    other. The relative order of the surviving strings is preserved.
    """
    def is_covered(candidate):
        # True when another, non-identical entry contains `candidate`.
        for other in string_list:
            if candidate != other and candidate in other:
                return True
        return False

    return [candidate for candidate in string_list if not is_covered(candidate)]

Global seed set to 42
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should pr

# 製作寶典

In [3]:
# Load the two product catalog workbooks; the last column of the first one is dropped.
df1 = pd.read_excel('../data/台塑企業_ 產品寶典20210303.xlsx',engine='openpyxl').iloc[:,:-1]
df2 = pd.read_excel('../data/寶典.v3.台塑網.20210901.xlsx',engine='openpyxl')
# Give the second catalog the same column names so the two can be stacked.
df2.columns = df1.columns
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df1, df2])
df['品名'] = df['品名'].apply(lambda x:x.strip())

# Load the product CSV, keeping only fully populated rows of the three needed columns.
val_df = pd.read_csv('../data/preprocess_for_SQUAD_產品.csv',index_col=0)[['string_X_train','Y_label','EXPNO']].dropna(axis=0)

# Union of the catalog product names and the labels found in the CSV.
# NOTE(review): the labels of val_df are added to the lookup set here, so any
# rule-based matching that is later scored on val_df has already seen its
# answers — confirm this is intended before trusting the reported accuracy.
產品集合 = set(df['品名'].values.tolist() + val_df['Y_label'].values.tolist())

# Lookup tables: product name -> department name / company code.
品名2部門 = dict(zip(df['品名'],df['公司事業部門']))
品名2代號 = dict(zip(df['品名'],df['公司代號']))

train_df = val_df

# 如果品名是單詞的話 前後加個空白

In [4]:
# Single-word product names (no inner space) are stripped and wrapped in one
# space on each side; multi-word names are kept untouched. Presumably this
# keeps the later substring search from matching inside longer words.
新產品集合 = [
    p if ' ' in p else f' {p.strip()} '
    for p in 產品集合
]
# De-duplicate again, since padding can make two names identical.
產品集合 = list(set(新產品集合))

In [5]:
def Collection_method(df,產品集合):
    """Collect, for every row, all known product names that occur in its text.

    Args:
        df: DataFrame with a ``'string_X_train'`` text column.
        產品集合: Iterable of product-name strings to search for (plain
            substring containment, case-sensitive).

    Returns:
        pandas.DataFrame: Indexed like ``df`` with one column ``'predict'``
        whose cells are lists of the matching product names (possibly empty).
    """
    labels = {}
    for i in tqdm(df.index):
        # Look the text up once per row instead of once per (row, product) pair.
        text = df.loc[i,'string_X_train']
        labels[i] = [p for p in 產品集合 if p in text]  # candidate list, may hold several products
    predict = pd.DataFrame(index=list(labels.keys()),columns=['predict'])
    predict['predict'] = list(labels.values())
    return predict
# Run the rule-based matcher and attach its candidates to the labelled rows;
# every row starts out tagged as produced by the 'rule' method.
predict = Collection_method(val_df, 產品集合)
result = val_df.join(predict).assign(**{'class': 'rule'})

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1970.0), HTML(value='')))




In [6]:
# Show the labelled rows together with their rule-based candidate lists.
result

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT P...,PHTHALIC ANHYDRIDE,27,[PHTHALIC ANHYDRIDE],rule
5,COMMODITY LLDPE TAISOX 3470 QUANTITY 320MT 2...,TAISOX 3470,18,"[TAISOX 3470, TAISOX ]",rule
7,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule
8,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule
9,COMMODITY ...,POLYESTER FILAMENT YARN,25,"[ POLYESTER , POLYESTER PARTIALY ORIENTED YARN...",rule
...,...,...,...,...,...
20633,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule
20634,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule
20635,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule
20636,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule


In [7]:
# Positional (not label) indices of the rows for which the rule-based
# matcher produced an empty candidate list.
not_find = [
    position
    for position, candidates in enumerate(result['predict'].values)
    if len(candidates) == 0
]
len(not_find)

0

In [8]:
# Rows without any rule-based candidate; not_find holds positions, hence .iloc.
not_find_df = result.iloc[not_find]
not_find_df

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class


In [9]:
# Fall back to the QA model for the rows the rule-based matcher could not resolve.
bert_predict = model_predict(nlp,not_find_df)
bert_predict

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [10]:
# NOTE(review): this re-defines the helper that the first cell already
# defines; this later definition is the one in effect for the cells below.
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    Args:
        str1: First string; split on whitespace into a token set.
        str2: Second string; split on whitespace into a token set.

    Returns:
        float: |A ∩ B| / |A ∪ B| in [0.0, 1.0]. When neither string contains
        a token (both empty or whitespace-only) the union is empty and 0.0 is
        returned instead of raising ZeroDivisionError.
    """
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Nothing to compare: treat as "no similarity" rather than dividing by zero.
        return 0.0
    return float(len(c)) / union_size

In [12]:
# Fill the rows that had no rule-based candidate with the QA model's answer.
if len(not_find) > 0:
    idx = result.iloc[not_find].index
    # Align by index label: model_predict emits its rows in sorted-label order,
    # which need not match the positional order of not_find.
    answers = bert_predict.loc[idx, 'predict:']
    for label, answer in zip(idx, answers.values):
        # Address the column by name (not by position 3) and keep the
        # one-element-list shape used by the rule-based candidates.
        # Assumes index labels are unique — TODO confirm.
        result.at[label, 'predict'] = [str(answer)]
    result.loc[idx, 'class'] = 'bert'

In [14]:
# Show the rows that were filled in by the QA model. A bare expression nested
# inside `if` is never rendered by the notebook, so display() is called explicitly.
if len(not_find) > 0:
    display(result.loc[idx])

In [15]:
# Index labels of the QA-model rows whose label is not among the predictions.
# 'predict' holds a list of candidates, so comparing it to the label string
# with != was always True; test membership (ignoring padding spaces) instead.
lst = []
for i in result.iloc[not_find].index:
    label = str(result.loc[i,'Y_label']).strip()
    candidates = [str(c).strip() for c in result.loc[i,'predict']]
    if label not in candidates:
        lst.append(i)

In [16]:
def get_acc(df,t=0.75):
    """Fraction of rows whose best candidate reaches Jaccard similarity ``t``.

    Args:
        df: DataFrame with a ``'Y_label'`` string column and a ``'predict'``
            column holding a list of candidate strings per row.
        t: Minimum Jaccard similarity between the label and any candidate for
            the row to count as correct.

    Returns:
        float: Share of correct rows. Rows with an empty candidate list count
        as incorrect; an empty ``df`` yields NaN. (The previous implementation
        raised on an empty candidate list and raised KeyError when no row at
        all was correct.)
    """
    total = len(df.index)
    if total == 0:
        return float('nan')
    hits = 0
    for i in df.index:
        label = df.loc[i,'Y_label']
        best = max(
            (get_jaccard_sim(label, candidate) for candidate in df.loc[i,'predict']),
            default=0.0,
        )
        if best >= t:
            hits += 1
    return hits / total

def get_jac(df):
    """Mean, over all rows, of the best Jaccard similarity between label and candidates.

    Args:
        df: DataFrame with a ``'Y_label'`` string column and a ``'predict'``
            column holding a list of candidate strings per row.

    Returns:
        float: Average of the per-row maximum similarity. A row with an empty
        candidate list contributes 0.0; an empty ``df`` yields NaN.
    """
    all_jacs = [
        max(
            (get_jaccard_sim(df.loc[i,'Y_label'], candidate) for candidate in df.loc[i,'predict']),
            default=0.0,
        )
        for i in df.index
    ]
    if not all_jacs:
        return float('nan')
    return np.sum(all_jacs)/len(all_jacs)

# 表現

In [17]:
# Accuracy at Jaccard >= 1, accuracy at Jaccard >= 0.75, and mean best Jaccard score.
get_acc(result,t=1),get_acc(result,t=0.75),get_jac(result)

(1.0, 1.0, 1.0)

In [18]:
# Predict a department code per row: take the longest candidate product name
# and use the code of the catalog name with the highest Jaccard similarity.
best_code_cache = {}  # candidate string -> code of its closest catalog name
部門_lst = []
for p_lst in tqdm(result['predict'].values):
    p = max(p_lst,key=len)
    if p not in best_code_cache:
        # Many rows share the same candidate; scan the whole catalog only once
        # per distinct string. max() keeps the first best key, as before.
        closest = max(品名2代號, key=lambda name: get_jaccard_sim(name, p))
        best_code_cache[p] = 品名2代號[closest]
    部門_lst.append(best_code_cache[p])
result['預測部門代號'] = 部門_lst
# Remove candidates that are substrings of another candidate of the same row.
result['predict'] = [substringSieve(i) for i in result['predict']]
# Compare codes as stripped strings so that 27 and '27 ' are treated alike.
result['EXPNO'] = [ str(i).strip() for i in result['EXPNO']]
result['預測部門代號'] = [ str(i).strip() for i in result['預測部門代號']]
display(result)
is_match = result['EXPNO'] == result['預測部門代號']
a = int(is_match.sum())
b = int((~is_match).sum())
# Guard the ratio so an empty result does not raise ZeroDivisionError.
accuracy = a/(a+b) if (a+b) > 0 else float('nan')
print(f'部門預測正確數量:{a} 錯誤數量:{b} 正確率:{accuracy}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1970.0), HTML(value='')))




Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class,預測部門代號
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT P...,PHTHALIC ANHYDRIDE,27,[PHTHALIC ANHYDRIDE],rule,27
5,COMMODITY LLDPE TAISOX 3470 QUANTITY 320MT 2...,TAISOX 3470,18,"[TAISOX 3470, TAISOX ]",rule,18
7,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule,25
8,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule,25
9,COMMODITY ...,POLYESTER FILAMENT YARN,25,"[ POLYESTER , POLYESTER PARTIALY ORIENTED YARN...",rule,25
...,...,...,...,...,...,...
20633,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule,14
20634,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule,14
20635,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule,14
20636,PRECIPITATED CALCIUM CARBONATE FP100AQUANTIT...,CALCIUM CARBONATE,14,[CALCIUM CARBONATE],rule,14


部門預測正確數量:1762 錯誤數量:208 正確率:0.8944162436548223


In [19]:
# Rows whose predicted department code differs from the EXPNO column.
result[result['EXPNO'] != result['預測部門代號']]

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class,預測部門代號
432,COMMODITY POLYMER GRADE PROPYLENEQUANTITY 65...,POLYMER GRADE,62,"[POLYMER GRADE, GRADE ]",rule,12
1625,1ABS AX4100 QUANTITY1800MT UNIT PRICEUSD25...,AX4100,4A,[ AX4100 ],rule,11
1635,1GPPS GP550N QUANTITY60480MT UNIT PRICEUSD...,GP550N,4A,[ GP550N ],rule,11
1636,1GPPS GP550N QUANTITY60480MT UNIT PRICEUSD...,GP550N,4A,[ GP550N ],rule,11
1919,TERMS OF PRICE FOB OR FCA TAIWANCOUNTRY OF OR...,GLASS EPOXY PREPREG,2M,"[GLASS EPOXY PREPREG, PREPREG , COPPER CLAD L...",rule,28
...,...,...,...,...,...,...
20438,29Y29BNAN YA 2100ANC1 CIF HAIPHONG PORT VIETNAM,2100ANC1,23,[ 2100ANC1 ],rule,11
20497,TERMS OF PRICE FOB OR FCA TAIWANCOUNTRY OF OR...,GLASS EPOXY PREPREG,2M,"[GLASS EPOXY PREPREG, PREPREG , COPPER CLAD L...",rule,28
20498,TERMS OF PRICE FOB OR FCA TAIWANCOUNTRY OF OR...,GLASS EPOXY PREPREG,2M,"[GLASS EPOXY PREPREG, PREPREG , COPPER CLAD L...",rule,28
20499,TERMS OF PRICE FOB OR FCA TAIWANCOUNTRY OF OR...,GLASS EPOXY PREPREG,2M,"[GLASS EPOXY PREPREG, PREPREG , COPPER CLAD L...",rule,28


# 一些小問題 公司代號跟EXPNO對不上

In [45]:

# For product names present both in the catalog and as a label, list the ones
# whose catalog company code differs from the EXPNO of the first matching row.
catalog_parts = []
spec_parts = []
for p in list(set(品名2代號.keys())&set(result['Y_label'])):
    if 品名2代號[p] != result.loc[result['Y_label']==p,'EXPNO'].values[0]:
        catalog_parts.append(df[df['品名']==p])
        spec_parts.append(val_df[val_df['Y_label']==p])
# Concatenate once (DataFrame.append was removed in pandas 2.0); fall back to
# an empty frame with the right columns when nothing disagrees.
table1 = pd.concat(catalog_parts) if catalog_parts else df.iloc[0:0]
table2 = pd.concat(spec_parts) if spec_parts else val_df.iloc[0:0]
# One display() call with two arguments avoids the stray "(None, None)" output.
display(table1.drop_duplicates(subset=['品名']), table2.drop_duplicates(subset=['Y_label']))

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
433,王淑霓,8347,26,台染部,POLYESTER
320,葉鳳琴,8233,23,南亞塑三部,PP SYNTHETIC PAPER
802,王淑霓,8347,26,台染部,NAN YA RELEASE FILM


Unnamed: 0,string_X_train,Y_label,EXPNO
2984,TAIRILIN BRAND POLYESTER FILM B GRADE ...,POLYESTER,2E
6802,PP SYNTHETIC PAPER BCP 015MM X 935MM X 1500M ...,PP SYNTHETIC PAPER,22
1981,TERMS OF PRICE FOB TAIWAN PORTCOUNTRY OF ORIG...,NAN YA RELEASE FILM,2E


(None, None)

In [47]:
# Write the final table to the working directory.
# NOTE(review): the 'predict' cells are Python lists and will be written as
# their string representation — confirm the consumer expects that.
result.to_csv('submit_product_0909.csv')

In [48]:
# Preview the first 30 rows of the final table.
result.head(30)

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class,預測部門代號
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT P...,PHTHALIC ANHYDRIDE,27,[PHTHALIC ANHYDRIDE],rule,27
5,COMMODITY LLDPE TAISOX 3470 QUANTITY 320MT 2...,TAISOX 3470,18,"[TAISOX 3470, TAISOX ]",rule,18
7,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule,25
8,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule,25
9,COMMODITY ...,POLYESTER FILAMENT YARN,25,"[ POLYESTER , POLYESTER PARTIALY ORIENTED YARN...",rule,25
10,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule,25
11,COMMODITY ...,ORIENTED YARN,25,"[ORIENTED YARN, POLYESTER TEXTURED YARN, POLYE...",rule,25
31,PRODUCT TETRAHYDROFURAN 998 PCT MINQUANTIT...,TETRAHYDROFURAN,2P,[ TETRAHYDROFURAN ],rule,2P
44,COMMODITY LLDPE TAISOX 3470 QUANTITY 11200 MT...,TAISOX 3470,18,"[TAISOX 3470, TAISOX ]",rule,18
62,HDPE TAISOX 9020 1000 MTUNIT PRICE USD106000M...,TAISOX 9020,18,"[TAISOX 9020, TAISOX ]",rule,18
