In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
gc.collect()

def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity between the token sets of two strings.

    Both strings are split on whitespace and compared as sets, so token
    order and repeated tokens do not affect the score.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: ``|A & B| / |A | B|`` in the range [0.0, 1.0]. When neither
        string contains any token the two (empty) sets are identical, so
        1.0 is returned instead of dividing by zero.
    """
    tokens_a = set(str1.split())
    tokens_b = set(str2.split())
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        # Both inputs are empty / whitespace-only: avoid ZeroDivisionError.
        return 1.0
    return len(tokens_a & tokens_b) / union_size

# set_seed(42)

In [15]:
def set_seed(seed: int = 42):
    '''Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's global generator, PyTorch (CPU and
    all CUDA devices) and pytorch_lightning, and switches cuDNN to its
    deterministic mode so repeated runs give the same results.

    Args:
        seed: Integer seed shared by all generators.

    Returns:
        numpy.random.RandomState: a standalone generator seeded with
        ``seed`` that can be passed explicitly to code accepting a
        ``random_state`` argument.
    '''
    # NOTE: PYTHONHASHSEED set at runtime is only picked up by interpreters
    # started afterwards (e.g. worker subprocesses), not by this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; harmless without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    seed_everything(seed)
    return np.random.RandomState(seed)
random_state = set_seed(42)

Global seed set to 42


# load model

In [16]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

# Build the tokenizer and a DistilBERT QA model from the base checkpoint.
# The QA head created here is freshly initialized (hence the warning printed
# by from_pretrained); the fine-tuned weights are loaded from disk right after.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# map_location='cpu' lets a checkpoint that was saved from a GPU session be
# restored on a CPU-only machine; the pipeline below runs on CPU anyway.
model.load_state_dict(torch.load('Product_Data_SQuAD_model.pt', map_location='cpu'))
model.eval()
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
gc.collect()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

244

# load data

In [17]:
#train_df = pd.read_csv('Train_Product_Data_2021_0114.csv',index_col=0).drop(['Forward','Backward','label_for_train'],axis=1)
#train_df = train_df.dropna(axis=0)
#display(train_df.head(3))

# Read the preprocessed file, keep only the columns used for evaluation,
# discard rows with any missing value, and expose the '45A' text under the
# column name the rest of the notebook expects.
val_df = (
    pd.read_csv('preprocess_for_SQUAD.csv', index_col=0)
    .loc[:, ['45A', 'Y_label', 'string_Y_1', 'string_Y_2']]
    .dropna(axis=0)
    .rename(columns={'45A': 'string_X_train'})
)
display(val_df.head(3))

Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
3,MASS PVC RESIN B-57QUANTITY 175 MT AT 1300 USD...,MASS PVC RESIN B-57,0,19
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,0,18
28,TERMS OF SALE CIF - NHAVA SHEVA PORT INDIAPVC ...,PVC RESIN S-65D,42,57


# Find failing samples (answer text not found in the context) and drop them

In [18]:
def find_fail_sample(df):
    """Collect the index labels of rows whose answer is absent from the context.

    A row fails when the string in its 'Y_label' column is not a substring
    of the string in its 'string_X_train' column.

    Args:
        df: DataFrame with 'string_X_train' and 'Y_label' string columns.

    Returns:
        list: index labels of the failing rows, in the frame's row order.
    """
    rows = zip(df.index, df['string_X_train'], df['Y_label'])
    return [
        label
        for label, context, answer in rows
        if answer not in context
    ]
# Locate the rows whose answer is missing from the context, show them, then
# remove them; the shape is printed before and after to make the effect visible.
val_fails = find_fail_sample(val_df)
display(val_df.loc[val_fails])
print(val_df.shape)
val_df = val_df.drop(index=val_fails)
print(val_df.shape)

Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2


(609, 4)
(609, 4)


# test model

In [19]:
def test_model(nlp,df,test_n=30):
    """Run the QA pipeline over rows of ``df`` and score its predictions.

    For each selected row the pipeline is asked 'What is the product name?'
    with the row's 'string_X_train' text as context. The predicted span is
    compared with the reference span ``context[string_Y_1:string_Y_2]``.

    Args:
        nlp: Callable taking ``{'question': ..., 'context': ...}`` and
            returning a mapping with integer 'start' and 'end' offsets.
        df: DataFrame with 'string_X_train', 'string_Y_1' and 'string_Y_2'
            columns.
        test_n: Number of rows to sample at random; ``None`` evaluates every
            row in sorted index order.

    Returns:
        tuple: ``(table, jaccard_avg_score, acc)`` where ``table`` has one
        row per evaluated sample with columns 'label', 'predict:' and
        '是否全對' ('Yes' on an exact match, otherwise 'No'),
        ``jaccard_avg_score`` is the mean token-level Jaccard similarity and
        ``acc`` is the fraction of exact matches. Both scores are NaN when
        no row was evaluated.
    """
    if test_n is not None:
        idx_list = df.sample(test_n).index.tolist()
    else:
        idx_list = sorted(df.index.tolist())

    # Collect plain records and build the frame once: DataFrame.append in a
    # loop is quadratic and no longer exists in pandas >= 2.0.
    records = []
    jaccard_scores = []
    for i in tqdm(idx_list):
        sample = df.loc[[i]].iloc[0]
        context = sample['string_X_train']
        # Offsets may come back as numpy/float scalars; slicing needs ints.
        label = context[int(sample['string_Y_1']):int(sample['string_Y_2'])]
        res = nlp({
            'question': 'What is the product name?',
            'context': context
        })
        predict = context[res['start']:res['end']]
        records.append({
            'label': label,
            'predict:': predict,
            '是否全對': 'Yes' if label == predict else 'No',
        })
        jaccard_scores.append(get_jaccard_sim(label, predict))

    table = pd.DataFrame(records, index=idx_list, columns=['label', 'predict:', '是否全對'])
    if not records:
        return table, float('nan'), float('nan')

    jaccard_avg_score = np.mean(jaccard_scores)
    # Boolean mean instead of value_counts()['Yes'], which raises KeyError
    # when there is not a single exact match.
    acc = (table['是否全對'] == 'Yes').mean()

    return table ,jaccard_avg_score ,acc

In [20]:
# Evaluate on every validation row (test_n equals the frame length, so the
# sample covers the whole frame in shuffled order) and report both metrics.
table, jaccard_avg_score, acc = test_model(
    nlp,
    val_df,
    test_n=len(val_df),
)
print(f'jaccard_avg_score:{jaccard_avg_score}')
print(f'acc:{acc}')
display(table)

  0%|          | 0/609 [00:00<?, ?it/s]

jaccard_avg_score:0.5354933927594027
acc:0.2019704433497537


Unnamed: 0,label,predict:,是否全對
4394,NAN YA RELEASE FILM,YA RELEASE FILM,No
5346,COPPER FOIL,PORTCOPPER FOILQUANTITY44400KGSAMOUNTUSD66180600,No
6804,DI-2-ETHYLHEXYL PHTHALATE,DI-2-ETHYLHEXYL PHTHALATEQUANTITY,No
7886,MONOETHYLENE GLYCOLMEG,CHINACOMMODITYMONOETHYLENE GLYCOLMEGQUANTITY20...,No
2410,MONOETHYLENE GLYCOL MEG FIBER GRADE,MONOETHYLENE GLYCOL,No
4000,MALEIC ANHYDRIDE,TAIWANMALEIC ANHYDRIDE,No
1851,BPA BISPHENOL-A,BPA BISPHENOL-AQUANTITY1020MTCONTRACT,No
2708,RIGID PVC FILM,RIGID PVC FILM,Yes
7657,MONO ETHYLENE GLYCOLMEG,MONO ETHYLENE GLYCOLMEGPACKING,No
2731,MONOETHYLENE GLYCOLMEG,MONOETHYLENE GLYCOLMEGQUANTITY,No


# Post_processing

In [21]:
def Post_processing(predicts, noise_tokens=('QUANTITY', 'QTY', '\n', 'KOREA')):
    """Strip known noise substrings from each predicted answer.

    Args:
        predicts: Iterable of predicted answer strings.
        noise_tokens: Substrings deleted from every prediction, applied in
            the given order. The default removes quantity markers, newline
            characters and the country name 'KOREA'.

    Returns:
        list: the cleaned predictions, in the same order as ``predicts``.
    """
    def remove_noise(text):
        # The newline is matched as '\n'; matching a bare 'n' would delete
        # every lowercase "n" from the prediction.
        for token in noise_tokens:
            text = text.replace(token, '')
        return text
    return [remove_noise(p) for p in predicts]

# Slice first, then copy, so new_table is an independent frame that can be
# modified without touching (or warning about) the original table.
new_table = table[['label','predict:']].copy()
new_table['predict:'] = Post_processing(new_table['predict:'].values)
# Exact-match flag computed for the whole column at once.
new_table['是否全對'] = np.where(new_table['label'] == new_table['predict:'], 'Yes', 'No')
# Show the rows that were wrong BEFORE post-processing (mask taken from the
# original table) so the effect of the clean-up is visible.
display(new_table[table['是否全對']!='Yes'].head(30))

jaccard_avg_score = np.mean([
    get_jaccard_sim(label, predict)
    for label, predict in zip(new_table['label'], new_table['predict:'])
])
# Boolean mean instead of value_counts()['Yes'], which raises KeyError when
# no prediction matches exactly.
acc = (new_table['是否全對'] == 'Yes').mean()
print(f'jaccard_avg_score:{jaccard_avg_score}')
print(f'acc:{acc}')

Unnamed: 0,label,predict:,是否全對
4394,NAN YA RELEASE FILM,YA RELEASE FILM,No
5346,COPPER FOIL,PORTCOPPER FOIL44400KGSAMOUNTUSD66180600,No
6804,DI-2-ETHYLHEXYL PHTHALATE,DI-2-ETHYLHEXYL PHTHALATE,Yes
7886,MONOETHYLENE GLYCOLMEG,CHINACOMMODITYMONOETHYLENE GLYCOLMEG200000MT0,No
2410,MONOETHYLENE GLYCOL MEG FIBER GRADE,MONOETHYLENE GLYCOL,No
4000,MALEIC ANHYDRIDE,TAIWANMALEIC ANHYDRIDE,No
1851,BPA BISPHENOL-A,BPA BISPHENOL-A1020MTCONTRACT,No
7657,MONO ETHYLENE GLYCOLMEG,MONO ETHYLENE GLYCOLMEGPACKING,No
2731,MONOETHYLENE GLYCOLMEG,MONOETHYLENE GLYCOLMEG,Yes
3979,MALEIC ANHYDRIDE,TAIWANMALEIC ANHYDRIDE,No


jaccard_avg_score:0.6088376730002345
acc:0.31198686371100165


In [22]:
# Allow long tables to render in full, then attach the post-processed
# predictions to the validation rows (joined on the shared index).
pd.set_option('display.max_rows', 999)
submit = val_df.join(new_table).loc[:, ['string_X_train', 'label', 'predict:']]
submit.head(30)

Unnamed: 0,string_X_train,label,predict:
3,MASS PVC RESIN B-57QUANTITY 175 MT AT 1300 USD...,MASS PVC RESIN B-57,MASS PVC RESIN B-57
4,PHTHALIC ANHYDRIDE PAQUANTITY 306 MT UNIT PRI...,PHTHALIC ANHYDRIDE,PA
28,TERMS OF SALE CIF - NHAVA SHEVA PORT INDIAPVC ...,PVC RESIN S-65D,INDIAPVC RESIN
31,PRODUCT TETRAHYDROFURAN 998 PCT MINQUANTITY ...,TETRAHYDROFURAN,TETRAHYDROFURAN
41,875 MTS OF PVC RESIN B-57 AND 360 MTS OF PVC R...,PVC RESIN S-65D,PVC RESIN B-57
56,7200 MTS PVC SUSPENSION RESIN S-65D UNIT PRICE...,PVC SUSPENSION RESIN S-65D,PVC SUSPENSION RESIN
60,COMMODITY MONOETHYLENE GLYCOL MEG F...,MONOETHYLENE GLYCOL MEG FIBER GRADE,MONOETHYLENE GLYCOL
66,COMMODITY DIETHYLENE GLYCOL DEGQUANTITY 1000MT...,DIETHYLENE GLYCOL DEG,DIETHYLENE GLYCOL DEG
175,5400 MTS PVC SUSPENSION RESIN S-65DUNIT PRICE ...,PVC MASS RESIN B-57,PVC SUSPENSION RESIN
222,TRADE TERMS CIF CHANGPING CHINA ANDOR CIF HEYU...,PVC RESIN S-60,PVC RESIN S-60


In [27]:
# Write the result table to disk; index=False means the row index is not
# included in the file.
submit.to_csv(path_or_buf='submit.csv', index=False)