In [206]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
import pandas as pd
from tqdm import tqdm_notebook as tqdm

gc.collect()

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)



tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load('../models/Product_Data_SQuAD_model_產品.pt'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)

def model_predict(nlp,df):
    table = pd.DataFrame()
    for i in tqdm(sorted(df.index.tolist())):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        QA_input = {
            'question': 'What is the product name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        row = pd.DataFrame({'predict:':predict},index=[i])
        table = table.append(row)
    return table
gc.collect()

Global seed set to 42
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should pr

0

# 寶典

In [207]:
df = pd.read_excel('../data/台塑企業_ 產品寶典20210303.xlsx',engine='openpyxl').iloc[:,:-1]
df2 = pd.read_excel('../data/寶典.v3.台塑網.20210901.xlsx',engine='openpyxl')
df2.columns = df.columns
df = df.append(df2)
df = df.reset_index(drop=True)
display(df[df['品名']=='MA'])
產品集合 = set(df['品名'].values)

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
531,,,2P,南亞塑四部化學品部,MA
1480,,,2P,南亞塑四部化學品部,MA


In [208]:
df[df['品名'] == 'SAN']

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
1047,,,4A,塑膠事業部,SAN
1414,,,4A,塑膠事業部,SAN


In [209]:
df[df['品名'] == 'INA']

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
1925,郭慶怡,6246,27,南亞化一部,INA


In [210]:
df.loc[1925,'品名'] = ' INA '

In [211]:
df[df['品名'] == ' INA ']

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
1925,郭慶怡,6246,27,南亞化一部,INA


In [212]:
品名2部門 = dict(zip(df['品名'],df['公司事業部門']))
品名2代號 = dict(zip(df['品名'],df['公司代號']))

In [213]:
val_df = pd.read_csv('../data/preprocess_for_SQUAD_產品.csv',index_col=0)[['45A','Y_label','EXPNO']]
print(val_df.shape)
val_df.columns = ['string_X_train','Y_label','EXPNO']
val_df

(4059, 3)


Unnamed: 0,string_X_train,Y_label,EXPNO
0,MASS PVC RESIN B-57 QUANTITY 175 MT AT 1300 US...,MASS PVC RESIN B-57,11
1,PHTHALIC ANHYDRIDE PA QUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,27
2,COMMODITY LLDPE TAISOX 3470 QUANTITY 320 MT 2 ...,LLDPE TAISOX,18
5,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18
6,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18
...,...,...,...
4204,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41
4205,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41
4206,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41
4207,187 MT PVC RESIN SUSPENSION GRADE S-65D OF FOR...,PVC RESIN SUSPENSION GRADE S-65D,11


In [214]:
train_df = val_df

# 如果品名是單詞的話 前後加個空白

In [215]:
for i in df.index:
    if ' ' not in df.loc[i,'品名']:
        name = df.loc[i,'品名']
        df.loc[i,'品名'] = f' {name} '
        assert df.loc[i,'品名'][0] == ' '
        assert df.loc[i,'品名'][-1] == ' '

In [216]:
display(df[df['品名']=='MA'])
display(df[df['品名']==' MA '])

Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名


Unnamed: 0,RIGID,分機,公司代號,公司事業部門,品名
531,,,2P,南亞塑四部化學品部,MA
1480,,,2P,南亞塑四部化學品部,MA


In [217]:
產品集合 = set(df['品名'].values)

# find_fail_sample and drop fail_sample

In [218]:
def find_fail_sample(df):
    fails = []
    for i in df.index:
        context = df.loc[i,'string_X_train']
        answer = df.loc[i,'Y_label']
        if str(answer) not in str(context):
            fails.append(i)
    return fails
train_fails = find_fail_sample(train_df)
val_fails = find_fail_sample(val_df)
print(train_fails,val_fails)
display(val_df.loc[val_fails])
print(val_df.shape)
val_df = val_df.drop(val_fails,axis=0)
print(val_df.shape)

[182, 489, 522, 746, 749, 759, 760, 761, 830, 833, 834, 876, 877, 892, 909, 921, 924, 926, 958, 988, 999, 1004, 1005, 1008, 1010, 1047, 1049, 1084, 1133, 1136, 1137, 1155, 1204, 1210, 1267, 1302, 1317, 1319, 1328, 1370, 1400, 1483, 1503, 1504, 1514, 1554, 1555, 1591, 1604, 1646, 1653, 1680, 1682, 1683, 1716, 1717, 1719, 1721, 1724, 1757, 1758, 1783, 1834, 1874, 1875, 1965, 1967, 1968, 2070, 2211, 2213, 2221, 2232, 2236, 2249, 2285, 2327, 2357, 2391, 2393, 2397, 2482, 2489, 2506, 2545, 2557, 2577, 2578, 2600, 2604, 2612, 2684, 2713, 2724, 2772, 2830, 2832, 2837, 2842, 2843, 2844, 2845, 2848, 2851, 2916, 2966, 2989, 2990, 3004, 3028, 3030, 3087, 3088, 3137, 3166, 3170, 3198, 3219, 3220, 3229, 3239, 3252, 3299, 3300, 3343, 3397, 3398, 3403, 3558, 3588, 3591, 3594, 3595, 3599, 3609, 3612, 3761, 3832, 3838, 3957, 3986, 4017, 4036, 4037, 4073] [182, 489, 522, 746, 749, 759, 760, 761, 830, 833, 834, 876, 877, 892, 909, 921, 924, 926, 958, 988, 999, 1004, 1005, 1008, 1010, 1047, 1049, 1084, 11

Unnamed: 0,string_X_train,Y_label,EXPNO
182,PHTHALIC ANHYDRIDE 504 MT AT USD 1010 MT PACKE...,,27
489,108 MT PHTHALIC ANHYDRIDE CI F MELBOURNE AUSTR...,,27
522,PHTHALIC ANHYDRIDE 504 MT AT USD 980 MT 600 KG...,,27
746,9000 MT PHTHALIC ANHYDRIDE PAA T USD 82500 MT ...,,27
749,9000 MT PHTHALIC ANHYDRIDE PAA T USD 82500 MT ...,,27
...,...,...,...
3986,NAME OF GOODS PHTHALIC ANHYDRIDE VOLUME OF GOO...,,27
4017,1 TAIRILIN BRAND POLYETHYLENE TEREPHTHALATE FI...,,2E
4036,342000 KGS OF PHTHALIC ANHYDRIDE OF TAIWAN ORI...,,27
4037,270000 KGS OF PHTHALIC ANHYDRIDE 85 - 44 - 49 ...,,27


(4059, 3)
(3914, 3)


In [219]:
str('abc')[str('abc').find('b')+1:]

'c'

In [220]:
def Collection_method(df,產品集合):
    labels = {}
    for i in tqdm(df.index):
        products = []
        for p in 產品集合:
            x = df.loc[i,'string_X_train']
            if p in x:
                if ' ' in p: # 非單詞直接append
                    products.append(p)
                if ' ' not in p: # 單詞要判斷一下
                    if (x[:x.find(p)] == ' ') and (x[x.find(p)+1:] == ' '):# 去除CHINA中找到INA這種情況
                        products.append(p)
        labels[i] = products
    predict = pd.DataFrame(index=labels.keys(),columns=['predict'])
    predict['predict'] = labels.values()
    return predict
predict = Collection_method(val_df,產品集合)
result = val_df.join(predict)
result['class'] = 'rule'

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3914.0), HTML(value='')))




In [221]:
result

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class
0,MASS PVC RESIN B-57 QUANTITY 175 MT AT 1300 US...,MASS PVC RESIN B-57,11,"[PVC RESIN B-57, RESIN , PVC RESIN]",rule
1,PHTHALIC ANHYDRIDE PA QUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,27,"[PHTHALIC ANHYDRIDE , PHTHALIC ANHYDRIDE PA, ...",rule
2,COMMODITY LLDPE TAISOX 3470 QUANTITY 320 MT 2 ...,LLDPE TAISOX,18,"[ TAISOX , LLDPE TAISOX]",rule
5,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18,"[ TAISOX , HDPE TAISOX, HDPE TAISOX 8010, HDPE ]",rule
6,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18,"[ TAISOX , HDPE TAISOX, HDPE TAISOX 8010, HDPE ]",rule
...,...,...,...,...,...
4204,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule
4205,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule
4206,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule
4207,187 MT PVC RESIN SUSPENSION GRADE S-65D OF FOR...,PVC RESIN SUSPENSION GRADE S-65D,11,"[ RESIN , PVC RESIN]",rule


In [222]:
not_find = []
for j,i in enumerate(result.iloc[:,3].values):
    if len(i) == 0:
        not_find.append(j)
len(not_find)

284

In [223]:
not_find_df = result.iloc[not_find]
not_find_df

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class
51,PVC SUSPENSION S65 - 112 MT AT USD 133000 PER ...,EA,11,[],rule
203,TERMS OF PRICE CFR AQABA PORT - J O R D A N CO...,DOP,27,[],rule
204,TERMS OF PRICE CFR AQABA PORT - J O R D A N CO...,DOP,27,[],rule
241,TOTAL 36000 KGS - 10 PCT POM FORMOCON FOR USD ...,PA,1P,[],rule
244,TOTM 32 MT AT USD 2420 PER MTC FR JAPAN PORT,TOTM,27,[],rule
...,...,...,...,...,...
4147,CI F SEMARANG INDONESIA 359340 KGS OF POLYETHY...,ETHYLENE,2E,[],rule
4158,LLDPE GRADE 38400 QUANTITY 300 MTU NIT PRICE U...,INA,18,[],rule
4159,LLDPE GRADE 38400 QUANTITY 300 MTU NIT PRICE U...,INA,18,[],rule
4187,HIPS GRADE NO HP8250 27000 MTS AT USD 131600 P...,HIPS,4A,[],rule


In [224]:
bert_predict = model_predict(nlp,not_find_df)
bert_predict

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=284.0), HTML(value='')))




Unnamed: 0,predict:
51,RMS 2010
203,AQABA
204,AQABA
241,ANTWERP BELGIUM IN
244,32 MT AT USD 2420 PER MTC FR JAPAN PORT
...,...
4147,SEMARANG INDONESIA 359340 KGS OF POLYETHYLENE
4158,TAIWAN UNDER SALES
4159,TAIWAN UNDER SALES
4187,27000 MTS AT USD 131600 PER MTC IF


In [225]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [226]:
result.iloc[not_find,3] = [ [str(i)][0] for i in bert_predict['predict:'].values]
idx = result.iloc[not_find].index
result.loc[idx,'class'] = 'bert'
result.loc[idx]

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class
51,PVC SUSPENSION S65 - 112 MT AT USD 133000 PER ...,EA,11,RMS 2010,bert
203,TERMS OF PRICE CFR AQABA PORT - J O R D A N CO...,DOP,27,AQABA,bert
204,TERMS OF PRICE CFR AQABA PORT - J O R D A N CO...,DOP,27,AQABA,bert
241,TOTAL 36000 KGS - 10 PCT POM FORMOCON FOR USD ...,PA,1P,ANTWERP BELGIUM IN,bert
244,TOTM 32 MT AT USD 2420 PER MTC FR JAPAN PORT,TOTM,27,32 MT AT USD 2420 PER MTC FR JAPAN PORT,bert
...,...,...,...,...,...
4147,CI F SEMARANG INDONESIA 359340 KGS OF POLYETHY...,ETHYLENE,2E,SEMARANG INDONESIA 359340 KGS OF POLYETHYLENE,bert
4158,LLDPE GRADE 38400 QUANTITY 300 MTU NIT PRICE U...,INA,18,TAIWAN UNDER SALES,bert
4159,LLDPE GRADE 38400 QUANTITY 300 MTU NIT PRICE U...,INA,18,TAIWAN UNDER SALES,bert
4187,HIPS GRADE NO HP8250 27000 MTS AT USD 131600 P...,HIPS,4A,27000 MTS AT USD 131600 PER MTC IF,bert


In [227]:
lst = []
for i in result.iloc[not_find].index:
    if result.loc[i,'Y_label'] != result.loc[i,'predict']:
        lst.append(i)

In [228]:
'PVC SUSPENSION S65' in 產品集合

False

In [229]:
' EA ' in 產品集合

True

In [230]:
'SAN' in 產品集合 

False

In [231]:
get_jaccard_sim('MASS PVC RESIN B-57','PVC RESIN B-57')

0.75

In [232]:
def get_acc(df,t=0.75):
    correct = []
    correct_label = []
    for i in df.index:
        jacs = []
        for j in df.loc[i,'predict']:
            jacs.append(get_jaccard_sim(df.loc[i,'Y_label'],j))
        if max(jacs) >= t:
            correct.append('yes')
        else:
            correct.append('no')
    result = pd.DataFrame({'correct':correct})
    return result['correct'].value_counts()['yes']/len(result)

In [233]:
def get_jac(df):
    all_jacs = []
    for i in df.index:
        jacs = []
        for j in df.loc[i,'predict']:
            jacs.append(get_jaccard_sim(df.loc[i,'Y_label'],j))
        all_jacs.append(max(jacs))
    return np.sum(all_jacs)/len(all_jacs)

# 表現

In [234]:
get_acc(result,t=1),get_acc(result,t=0.75),get_jac(result)

(0.8035258048032703, 0.8160449667858968, 0.8467972114752902)

In [235]:
部門_lst = []
for p_lst in tqdm(result['predict'].values):
    p = max(p_lst,key=len)
    jac_dict = {}
    for i in 品名2代號.keys():
        jac_dict[i] = get_jaccard_sim(i,p)
    部門_lst.append(品名2代號[max(jac_dict, key=jac_dict.get)])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3914.0), HTML(value='')))




In [236]:
result['預測部門代號'] = 部門_lst

In [237]:
result

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class,預測部門代號
0,MASS PVC RESIN B-57 QUANTITY 175 MT AT 1300 US...,MASS PVC RESIN B-57,11,"[PVC RESIN B-57, RESIN , PVC RESIN]",rule,11
1,PHTHALIC ANHYDRIDE PA QUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,27,"[PHTHALIC ANHYDRIDE , PHTHALIC ANHYDRIDE PA, ...",rule,27
2,COMMODITY LLDPE TAISOX 3470 QUANTITY 320 MT 2 ...,LLDPE TAISOX,18,"[ TAISOX , LLDPE TAISOX]",rule,18
5,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18,"[ TAISOX , HDPE TAISOX, HDPE TAISOX 8010, HDPE ]",rule,18
6,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18,"[ TAISOX , HDPE TAISOX, HDPE TAISOX 8010, HDPE ]",rule,18
...,...,...,...,...,...,...
4204,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule,41
4205,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule,41
4206,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule,41
4207,187 MT PVC RESIN SUSPENSION GRADE S-65D OF FOR...,PVC RESIN SUSPENSION GRADE S-65D,11,"[ RESIN , PVC RESIN]",rule,11


In [238]:
result['EXPNO'] = [ str(i).strip() for i in result['EXPNO']]
result['預測部門代號'] = [ str(i).strip() for i in result['預測部門代號']]

In [239]:
result[result['EXPNO']==result['預測部門代號']]

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class,預測部門代號
0,MASS PVC RESIN B-57 QUANTITY 175 MT AT 1300 US...,MASS PVC RESIN B-57,11,"[PVC RESIN B-57, RESIN , PVC RESIN]",rule,11
1,PHTHALIC ANHYDRIDE PA QUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,27,"[PHTHALIC ANHYDRIDE , PHTHALIC ANHYDRIDE PA, ...",rule,27
2,COMMODITY LLDPE TAISOX 3470 QUANTITY 320 MT 2 ...,LLDPE TAISOX,18,"[ TAISOX , LLDPE TAISOX]",rule,18
5,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18,"[ TAISOX , HDPE TAISOX, HDPE TAISOX 8010, HDPE ]",rule,18
6,ITEM 1 HDPE TAISOX 8010 200 MT USD 1100 MT CON...,HDPE TAISOX 8010,18,"[ TAISOX , HDPE TAISOX, HDPE TAISOX 8010, HDPE ]",rule,18
...,...,...,...,...,...,...
4204,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule,41
4205,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule,41
4206,CFR KOBE JAPAN VISCOSE RAYON STAPLE FIBER15DX4...,VISCOSE RAYON STAPLE FIBER,41,"[RAYON STAPLE FIBER, RAYON , VISCOSE RAYON ST...",rule,41
4207,187 MT PVC RESIN SUSPENSION GRADE S-65D OF FOR...,PVC RESIN SUSPENSION GRADE S-65D,11,"[ RESIN , PVC RESIN]",rule,11


In [240]:
result[result['EXPNO']!=result['預測部門代號']]

Unnamed: 0,string_X_train,Y_label,EXPNO,predict,class,預測部門代號
9,300000 BB LS - 10 P C T OF GASOIL 10 PPM UNIT ...,GASOIL,61,[ GASOIL ],rule,60
10,300000 BB LS - 10 P C T OF GASOIL 10 PPM UNIT ...,GASOIL,61,[ GASOIL ],rule,60
13,300000 BB LS - 10 P C T OF GASOIL 10 PPM UNIT ...,GASOIL,61,[ GASOIL ],rule,60
26,COMMODITY MONOETHYLENE GLYCOL MEG FIBER GRADE ...,MONOETHYLENE GLYCOL MEG FIBER GRADE,2A,[INA ],rule,27
27,POM FORMOCON FM 090 54000 KG NET UNIT PRICE 18...,PACK,1P,[ PE ],rule,21
...,...,...,...,...,...,...
4176,PLASTIC RESIN CI P DONG GUAN CHINA 1 2 X 20 FC...,PLASTIC RESIN,4A,"[AG 15A1, PLASTIC RESIN, INA , RESIN , ABS ]",rule,23
4186,TERMS OF PRICE FOB KAOHSIUNG INC OTE RMS 2010 ...,PP FILM GRADE,1P,"[ PP , PP FILM GRADE]",rule,4A
4187,HIPS GRADE NO HP8250 27000 MTS AT USD 131600 P...,HIPS,4A,27000 MTS AT USD 131600 PER MTC IF,bert,12
4195,PLASTIC RESIN CI F WU ZHOU CHINA 1 1 X 20 FC L...,PLASTIC RESIN,1P,"[PLASTIC RESIN, INA , RESIN ]",rule,23


In [241]:
a = len(result[result['EXPNO']==result['預測部門代號']])
b = len(result[result['EXPNO']!=result['預測部門代號']])
print(f'正確:{a} 錯誤:{b} 正確率:{a/(a+b)}')

正確:2795 錯誤:1119 正確率:0.7141032192130813


In [242]:
品名2代號['GASOIL'] # EXPNO是61 寶典是60

'60'

In [243]:
len(result[result['EXPNO']==result['預測部門代號']])/len(result)

0.7141032192130813

In [244]:
result.to_csv('submit_product_0906.csv')