In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc
gc.collect()

def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    Each string is split on whitespace and turned into a set of tokens; the
    score is ``len(intersection) / len(union)``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token the
        two token sets are identical (both empty), so ``1.0`` is returned
        instead of dividing by zero.
    """
    a = set(str1.split())
    b = set(str2.split())
    union = a | b
    if not union:
        # Both inputs are empty or whitespace-only: the original formula would
        # divide by zero here. Two empty token sets are treated as a perfect match.
        return 1.0
    return float(len(a & b)) / len(union)

# set_seed(42)

In [2]:
def set_seed(seed: int = 42):
    '''Seed every random number generator used by the notebook, for reproducibility.

    Seeds Python's ``random``, NumPy's global generator, PyTorch (CPU and all
    CUDA devices), switches cuDNN to deterministic mode, and calls
    ``seed_everything``.

    Args:
        seed: Integer seed. Defaults to 42. (The previous default was the type
            object ``int`` itself, which made a bare ``set_seed()`` call fail.)

    Returns:
        A ``numpy.random.RandomState`` seeded with ``seed``. It is independent
        of NumPy's global generator.
    '''
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects child processes, not hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for repeatable kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    seed_everything(seed)
    return np.random.RandomState(seed)
random_state = set_seed(42)

Global seed set to 42


# load model

In [3]:
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

# Build the tokenizer and a DistilBERT question-answering model from the base
# checkpoint, then overwrite its weights with the fine-tuned state dict.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; inference below runs on the CPU anyway.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('Product_Data_SQuAD_model.pt', map_location='cpu'))
model.eval()
# Wrap the model in a question-answering pipeline that runs on the CPU.
nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
gc.collect()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

244

# load data

In [4]:
# Training split: read the CSV, discard the helper columns, then drop every
# row that still contains a missing value.
train_df = (
    pd.read_csv('Train_Product_Data_2021_0114.csv', index_col=0)
    .drop(['Forward', 'Backward', 'label_for_train'], axis=1)
    .dropna(axis=0)
)
display(train_df.head(3))

# Validation split: same treatment (this file has no 'label_for_train' column to drop).
val_df = (
    pd.read_csv('Val_Product_Data_2021_0114.csv', index_col=0)
    .drop(['Forward', 'Backward'], axis=1)
    .dropna(axis=0)
)
display(val_df.head(3))

Unnamed: 0,string_X_train,string_Y_1,string_Y_2,Y_label,row_id
586,"YUNGSOX 2100M 12MT USD1,015/MT USD12,180 YUNGS...",0,13,YUNGSOX 2100M,855
92,"TRIS 1,2-CYCLOHEXANE DICARBOXYLIC ACID,DI-ISON...",0,50,"TRIS 1,2-CYCLOHEXANE DICARBOXYLIC ACID,DI-ISON...",140
91,"TRIS . CIF PORT KLANG,MALAYSIA",0,4,TRIS,139


Unnamed: 0,string_X_train,string_Y_1,string_Y_2,Y_label,row_id
1,"COMMODITY: STYRENE MONOMER IN BULKQUANTITY: 3,...",11,34,STYRENE MONOMER IN BULK,1
2,"COMMODITY: STYRENE MONOMER IN BULKQUANTITY: 3,...",11,34,STYRENE MONOMER IN BULK,2
3,PP 3307UNC1 . TRADE TERMS: CFR ANY JAPANESE PORT,0,2,PP,10


# Find and drop samples whose answer text does not appear in the context

In [5]:
def find_fail_sample(df):
    """Return the index labels of rows whose ``Y_label`` is not a substring of ``string_X_train``.

    Args:
        df: DataFrame with the columns ``string_X_train`` (context) and
            ``Y_label`` (answer).

    Returns:
        A list of index labels, in the order they appear in ``df.index``.
    """
    def _answer_missing(row_label):
        # Look up the context first, then the answer, and test containment.
        text = df.loc[row_label, 'string_X_train']
        target = df.loc[row_label, 'Y_label']
        return target not in text

    return [row_label for row_label in df.index if _answer_missing(row_label)]
# For each split, collect the index labels of rows whose answer is absent from the context.
train_fails, val_fails = find_fail_sample(train_df), find_fail_sample(val_df)
print(train_fails, val_fails)
display(val_df.loc[val_fails])
# Print the validation shape before and after removing the failing rows.
print(val_df.shape)
val_df = val_df.drop(index=val_fails)
print(val_df.shape)

[] [342, 343, 344]


Unnamed: 0,string_X_train,string_Y_1,string_Y_2,Y_label,row_id
342,#NAME?,1,26,PURIFIED ISOPHTHALIC ACID,1238
343,#NAME?,1,26,PURIFIED ISOPHTHALIC ACID,1240
344,#NAME?,40,65,PURIFIED ISOPHTHALIC ACID,1241


(744, 5)
(741, 5)


# test model

In [6]:
def test_model(nlp,df,test_n=30):
    '''Run the question-answering pipeline over rows of ``df`` and score its answers.

    Args:
        nlp: Callable invoked as ``nlp({'question': ..., 'context': ...})``; its
            result is indexed with ``'start'`` and ``'end'`` to slice the context.
        df: DataFrame with the columns ``string_X_train`` (context),
            ``string_Y_1`` and ``string_Y_2`` (start/end offsets of the label
            inside the context).
        test_n: Number of rows to sample with ``df.sample``. ``None`` evaluates
            every row in sorted index order.

    Returns:
        A tuple ``(table, jaccard_avg_score, acc, persudo_val_df)``:
        ``table`` has one row per evaluated sample with the columns ``label``,
        ``predict:`` and ``是否全對`` ('Yes' when the prediction equals the label
        exactly, else 'No'); ``jaccard_avg_score`` is the mean token-level
        Jaccard similarity; ``acc`` is the share of exact matches;
        ``persudo_val_df`` is the validation CSV re-read from disk with the
        evaluated rows' ``string_Y_1``/``string_Y_2``/``Y_label`` replaced by
        the model output (pseudo labels). Both scores are NaN when no row was
        evaluated.
    '''
    persudo_val_df = pd.read_csv('Val_Product_Data_2021_0114.csv',index_col=0).drop(['Forward','Backward'],axis=1)
    if test_n is not None:
        idx_list = df.sample(test_n).index.tolist()
    else:
        idx_list = sorted(df.index.tolist())

    # Collect plain records and build the DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for i in tqdm(idx_list):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        string_Y_1 = sample['string_Y_1'].values[0]
        string_Y_2 = sample['string_Y_2'].values[0]
        label = string_X_train[string_Y_1:string_Y_2]
        QA_input = {
            'question': 'What is the product name?',
            'context': string_X_train
        }
        res = nlp(QA_input)
        predict = QA_input['context'][res['start']:res['end']]
        records.append({
            'label': label,
            'predict:': predict,
            '是否全對': 'Yes' if label == predict else 'No',
        })

        # Store the model output as a pseudo label. A single .loc[row, col]
        # call is required: the chained form .loc[i]['col'] = ... assigns to a
        # temporary copy and leaves the frame unchanged.
        persudo_val_df.loc[i, 'string_Y_1'] = res['start']
        persudo_val_df.loc[i, 'string_Y_2'] = res['end']
        persudo_val_df.loc[i, 'Y_label'] = predict

    table = pd.DataFrame(records, index=idx_list, columns=['label', 'predict:', '是否全對'])

    if len(table) == 0:
        # Nothing was evaluated, so there is nothing to average.
        return table, float('nan'), float('nan'), persudo_val_df

    jaccard_avg_score = np.mean([
        get_jaccard_sim(label, predict)
        for label, predict in zip(table['label'], table['predict:'])
    ])

    # The mean of the boolean mask is the exact-match rate; unlike
    # value_counts()['Yes'] it does not raise when there are no matches.
    acc = (table['是否全對'] == 'Yes').mean()

    return table ,jaccard_avg_score ,acc ,persudo_val_df

In [7]:
# Evaluate on the whole validation frame: test_n equals its length, so every
# row is sampled (in shuffled order).
table, jaccard_avg_score, acc, persudo_val_df = test_model(
    nlp,
    val_df,
    test_n=len(val_df),
)
print(f'jaccard_avg_score:{jaccard_avg_score}', f'acc:{acc}', sep='\n')
display(table)

  0%|          | 0/741 [00:00<?, ?it/s]

jaccard_avg_score:0.9360998650472334
acc:0.8785425101214575


Unnamed: 0,label,predict:,是否全對
121,POLYESTER STAPLE FIBER,POLYESTER STAPLE FIBER,Yes
197,MASS PVC RESIN,MASS PVC RESIN,Yes
336,EPOXY RESIN,EPOXY RESIN,Yes
585,BISPHENOL-A,BISPHENOL-A,Yes
398,ABS RESIN,ABS RESIN,Yes
...,...,...,...
72,PET RESIN,PET RESIN,Yes
107,PP FILM,PP FILM,Yes
271,POLYESTER STAPLE FIBER,POLYESTER STAPLE FIBER,Yes
439,GASOIL,GASOIL,Yes


# Post_processing

In [21]:
def Post_processing(predicts, remove_tokens=('QUANTITY', 'QTY', 'n', 'KOREA')):
    '''Strip unwanted substrings from each predicted answer.

    Args:
        predicts: Iterable of prediction strings.
        remove_tokens: Substrings deleted from every prediction, applied in
            order. The default reproduces the previously hard-coded list.

    Returns:
        A list with one cleaned string per prediction, in input order. Only the
        substrings are removed; surrounding whitespace is left as is.
    '''
    # NOTE(review): the default token 'n' deletes every lowercase "n" from a
    # prediction. It may have been meant as a newline ('\n') -- confirm before
    # relying on it for mixed-case text.
    def remove_QUANTITY(x):
        for token in remove_tokens:
            x = x.replace(token, '')
        return x
    return [remove_QUANTITY(i) for i in predicts]

# Select the columns first and copy afterwards, so new_table is an independent
# frame and the assignments below never target a slice of `table`.
new_table = table[['label','predict:']].copy()
new_table['predict:'] = Post_processing(new_table['predict:'].values)

# Exact-match flag for every row at once.
exact_match = new_table['label'] == new_table['predict:']
new_table['是否全對'] = np.where(exact_match, 'Yes', 'No')

# Show the rows the raw model got wrong (mask taken from `table`, not
# `new_table`) so the effect of post-processing is visible.
display(new_table[table['是否全對']!='Yes'].head(30))

jaccard_avg_score = np.mean([
    get_jaccard_sim(label, predict)
    for label, predict in zip(new_table['label'], new_table['predict:'])
])
# Mean of the boolean mask; unlike value_counts()['Yes'] it does not raise
# when no row matches.
acc = exact_match.mean()
print(f'jaccard_avg_score:{jaccard_avg_score}')
print(f'acc:{acc}')

Unnamed: 0,label,predict:,是否全對
530,PET RESIN,PET RESIN 3824,No
165,GLASS FIBER,GLASS FIBER,Yes
213,HDPE TAISOX 8010,PRICEHDPE TAISOX 9001,No
11,HIPS RESIN,ABS RESIN,No
134,GLASS FIBER YARN,GLASS FIBER YARNECG75,No
355,GENERAL PURPOSE POLYSTYRENE,GENERAL PURPOSE POLYSTYRENE,Yes
159,PURIFIED ISOPHTHALIC ACID,PURIFIED ISOPHTHALIC ACID20MT,No
637,LLDPE,LLDPE FILM,No
334,HIGH DENSITY POLYETHYLENE,HIGH DENSITY POLYETHYLENEGRADE : HDPE TAISOX 9001,No
642,MONO ETHYLENE GLYCOL,MONO ETHYLENE GLYCOLPACKING,No


jaccard_avg_score:0.9520242914979757
acc:0.9082321187584346


In [26]:
# Allow up to 999 rows to be rendered when a frame is displayed.
pd.set_option('display.max_rows', 999)
# Attach the post-processed predictions to the validation rows by index and
# keep only the context, the label and the prediction.
submit = val_df.join(new_table)[['string_X_train', 'label', 'predict:']]
submit.head(30)

Unnamed: 0,string_X_train,label,predict:
1,"COMMODITY: STYRENE MONOMER IN BULKQUANTITY: 3,...",STYRENE MONOMER IN BULK,STYRENE MONOMER IN BULK
2,"COMMODITY: STYRENE MONOMER IN BULKQUANTITY: 3,...",STYRENE MONOMER IN BULK,STYRENE MONOMER IN BULK
3,PP 3307UNC1 . TRADE TERMS: CFR ANY JAPANESE PORT,PP,PP
4,"CIF BELAWAN PORT, INDONESIA +20 MT +/- 5PCT OF...",2-ETHYL HEXANOL,2-ETHYL HEXANOL
5,"CIF BELAWAN PORT, INDONESIA +20 MTON +/-5PCT O...",2-ETHYL HEXANOL,2-ETHYL HEXANOL
6,TAISOX EVA 7470MQUANTITY:160.00MTUNIT PRICE:US...,TAISOX EVA 7470M,TAISOX EVA 7470M
7,TAISOX EVA 7350MQUANTITY:90.00MTUNIT PRICE:USD...,TAISOX EVA 7350M,TAISOX EVA 7350M
8,TAISOX EVA 7320MQUANTITY:30.00MTUNIT PRICE:USD...,TAISOX EVA 7320M,TAISOX EVA 7320M
9,COVERING SHIPMENT OF GOODS TAISOX EVA 7350MQUA...,TAISOX EVA 7350M,TAISOX EVA 7350M
10,TAISOX EVA 7350MQUANTITY:75.00MTUNIT PRICE:USD...,TAISOX EVA 7470M,TAISOX EVA 7350M


In [27]:
# Write the submission table to submit.csv; index=False leaves the row index out of the file.
submit.to_csv('submit.csv',index=False)