In [60]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
import os
import gc
from transformers import pipeline
import requests

# functions

In [61]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)
exec(requests.get('https://raw.githubusercontent.com/facebookresearch/madgrad/main/madgrad/madgrad.py').text)

Global seed set to 42


# load data and split data (train,val)

In [62]:

from sklearn.model_selection import train_test_split
df = pd.read_csv('../data/preprocess_for_SQUAD_產品.csv',index_col=0)[['45A','Y_label','string_Y_1','string_Y_2']].dropna(axis=0).head(300)
df.iloc[:,-2:] = df.iloc[:,-2:].astype(int)
df = df.dropna(axis=0).reset_index(drop=True).rename(columns={'45A':'string_X_train'})
train_df, val_df = train_test_split(df,test_size=0.2,random_state=42)
display(train_df.head(3))
display(val_df.head(3))
for i in tqdm(range(len(df))):
    assert df['Y_label'].values[i] in df['string_X_train'].values[i]

Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
232,PRODUCT ACRYLONITRILE QUANTITY 1500 MT PLUS MI...,ACRYLONITRILE,8,21
59,HDPE RESIN TAISOX 8040 C 1008 MT AT USD 1075 M...,HDPE RESIN,0,10
6,COMMODITY TAISOX 7470 M QUANTITY 100 MT UNIT P...,TAISOX,10,16


Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2
203,COMMODITY BASE OIL 500N QUANTITY 80000 MT - 5 ...,BASE OIL 500N,10,23
266,ITEM 1 EVA TAISOX 7340 M 25 MT USD 2505 MT C I...,EVA TAISOX 7350,175,190
152,PVC RESIN S-65 QUANTITY 64 MTS AT USD 134000 P...,PVC RESIN S-65,0,14


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




# Model and Tokenizer Initialization

In [63]:

from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering
import torch
from torch.utils.data import DataLoader
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

# Data preprocessing

In [64]:

def preprocessing(df):
    contexts = df['string_X_train'].values.tolist()
    questions = [ 'What is the bank name?' for i in range(len(df))]
    answers = []
    for idx in df.index:
        answers.append({
            'text':df.loc[idx,'Y_label'],
            'answer_start':df.loc[idx,'string_Y_1'],
            'answer_end':df.loc[idx,'string_Y_2'],
            })
    return contexts ,questions ,answers

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

def df2DataLoader(df):
    contexts ,questions ,answers = preprocessing(df)
    encodings = tokenizer(contexts, questions, truncation=True, padding=True)
    encodings = add_token_positions(encodings, answers)
    dataset = SquadDataset(encodings)
    loader = DataLoader(dataset, batch_size = 8, shuffle=True)
    return loader

train_loader = df2DataLoader(train_df)
val_loader = df2DataLoader(val_df)

# train loop

In [65]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optimizer = MADGRAD(model.parameters() ,lr = 5e-5)

def train_step(model,batch,optimizer):
    model = model.to(device)
    model.train()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    
    # forward
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    
    # backward
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    gc.collect()
    return loss.item()

def val_step(model,batch,optimizer):
    model = model.to(device)
    model.eval()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    
    # forward
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    
    gc.collect()
    return loss.item()

def train_loop(model,train_loader,val_loader,optimizer,max_epochs=5,patience=3):
    history = {'train_loss':[],'val_loss':[]}
    best_loss = np.inf
    best_model = None
    not_improve_count = 0
    for epoch in tqdm(range(max_epochs)):        
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0
        
        for i,batch in enumerate(tqdm(train_loader)):
            epoch_train_loss += train_step(model,batch,optimizer)
        
        for j,batch in enumerate(tqdm(val_loader)):
            epoch_val_loss += val_step(model,batch,optimizer)
        
        history['train_loss'].append(epoch_train_loss/i)
        history['val_loss'].append(epoch_val_loss/j)
        
        print(f'epoch:{epoch} train_loss:{epoch_train_loss/i} val_loss:{epoch_val_loss/j}')
        
        if history['val_loss'][-1] <= best_loss: 
            best_model = deepcopy(model.eval())
            best_loss = history['val_loss'][-1]
            print(f'save best_model now_val_best_loss is:{best_loss}')

        if history['val_loss'][-1] > best_loss:
            not_improve_count += 1
            print(f'not_improve_count:{not_improve_count}')
            if not_improve_count > patience:
                print('early_stoping')
                break

    # get best_model.eval()
    model = best_model.eval()
    return model,history

In [66]:
def test_model(nlp,df,test_n=30):
    table = pd.DataFrame()
    persudo_val_df = df
    
    # 如果是測試模式隨機sample n個點
    if test_n != None:
        idx_list = df.sample(test_n).index.tolist()
    
    # 否則使用全部
    else:
        idx_list = df.index.tolist()
    
    # 遍歷樣本
    for i in tqdm(idx_list):
        sample = df.loc[[i]]
        string_X_train = sample['string_X_train'].values[0]
        string_Y_1 = sample['string_Y_1'].values[0]
        string_Y_2 = sample['string_Y_2'].values[0]
        
        # 製作QA
        QA_input = {
            'question': 'What is the product name?',
            'context': string_X_train
        }
        
        # 得到nlp預測結果
        res = nlp(QA_input)
        
        predict = QA_input['context'][res['start']:res['end']]
        
        # 製作成一條dataframe的row
        row = pd.DataFrame({
            'label':string_X_train[string_Y_1:string_Y_2],
            'predict:':predict},index=[i])
        
        if string_X_train[string_Y_1:string_Y_2] == predict:
            row['是否全對'] = 'Yes'
        else:
            row['是否全對'] = 'No'
        
        table = table.append(row)
        
        # make persudo label by nlp output 
        persudo_val_df.loc[i]['string_Y_1'] = res['start']
        persudo_val_df.loc[i]['string_Y_2'] = res['end']
        persudo_val_df.loc[i]['Y_label'] = predict
    
    # 計算得分
    jaccard_avg_score = np.mean([get_jaccard_sim(table.label[i],table['predict:'][i]) for i in table.index])
    
    acc = table['是否全對'].value_counts()['Yes'] / len(table)
    
    # 返回預測結果表,jac分數,acc分數,persudo_val_df
    return table ,jaccard_avg_score ,acc ,persudo_val_df

# knowledge_distillation

In [67]:
gc.collect()
def knowledge_distillation(n=2,max_epochs=2,patience=1):
    # initialize tokenizer ,model and train_loader
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    train_df_extra_loader = train_loader
    
    history = {'train_loss':[],'val_loss':[]}
    best_jaccard = 0
    best_model = None
    
    #knowledge_distillation loop
    for i in range(n):
        # 1.training model by MADGRAD optimizer
        optimizer = MADGRAD(model.parameters(),lr=5e-5)
        model,history = train_loop(model,train_df_extra_loader,val_loader,optimizer,max_epochs=max_epochs,patience=patience)
        nlp = pipeline('question-answering', model=model.to('cpu'), tokenizer=tokenizer)
        # 2.get persudo_label(by trained_model)
        table,jaccard_avg_score,acc,persudo_val_df = test_model(nlp,val_df,test_n=33)
        
        # if model is good save checkpoint
        if jaccard_avg_score >= best_jaccard:
            best_jaccard = jaccard_avg_score
            best_model = deepcopy(model.eval())
        
        # 3.add persudo_label to trainset
        train_df_extra_loader = df2DataLoader(train_df.append(persudo_val_df).reset_index(drop=True))
        model = model.to(device)
        gc.collect()
    
    return best_model.eval()

In [68]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
knowledge_distillation_model = knowledge_distillation(n=4,max_epochs=4,patience=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:0 train_loss:2.681900534136542 val_loss:1.2326379077775138
save best_model now_val_best_loss is:1.2326379077775138


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:1 train_loss:0.6215758226041136 val_loss:0.7559536067502839
save best_model now_val_best_loss is:0.7559536067502839


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:2 train_loss:0.40435277734851016 val_loss:0.7197299131325313
save best_model now_val_best_loss is:0.7197299131325313


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:3 train_loss:0.2691328674554825 val_loss:1.0486538058945112
not_improve_count:1



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=33.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:0 train_loss:0.3728768484213868 val_loss:0.31723643626485554
save best_model now_val_best_loss is:0.31723643626485554


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:1 train_loss:0.2612915195625376 val_loss:0.29286858971629826
save best_model now_val_best_loss is:0.29286858971629826


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:2 train_loss:0.2698506599770406 val_loss:0.16777354844712786
save best_model now_val_best_loss is:0.16777354844712786


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:3 train_loss:0.2124235578938513 val_loss:0.10182124416210822
save best_model now_val_best_loss is:0.10182124416210822



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=33.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:0 train_loss:0.15554375256682318 val_loss:0.15182165801525116
save best_model now_val_best_loss is:0.15182165801525116


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:1 train_loss:0.1350681741747099 val_loss:0.14434838266710617
save best_model now_val_best_loss is:0.14434838266710617


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:2 train_loss:0.14648977054811613 val_loss:0.10361379289367635
save best_model now_val_best_loss is:0.10361379289367635


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:3 train_loss:0.13665464466337013 val_loss:0.21535741165280342
not_improve_count:1



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=33.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:0 train_loss:0.11391850938152119 val_loss:0.12458645246390786
save best_model now_val_best_loss is:0.12458645246390786


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:1 train_loss:0.18059806373738055 val_loss:0.10405523083837968
save best_model now_val_best_loss is:0.10405523083837968


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:2 train_loss:0.1736292191214759 val_loss:0.13516531299267495
not_improve_count:1


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


epoch:3 train_loss:0.18091239762214026 val_loss:0.13901750024940288
not_improve_count:2



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=33.0), HTML(value='')))




# question-answering pipeline

In [69]:

nlp = pipeline('question-answering', model=knowledge_distillation_model.to('cpu'), tokenizer=tokenizer)

# test

In [70]:
table,jaccard_avg_score,acc,persudo_val_df = test_model(nlp,val_df,test_n=len(val_df))
print(f'jaccard_avg_score:{jaccard_avg_score}')
print(f'acc:{acc}')
display(table)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60.0), HTML(value='')))


jaccard_avg_score:0.8777777777777779
acc:0.8166666666666667


Unnamed: 0,label,predict:,是否全對
73,VISCOSE STAPLE FIBER,VISCOSE STAPLE FIBER15D,No
25,PACK,PACKAGING,No
45,BASE OIL 150N,BASE OIL 150N,Yes
77,EVA TAISOX 7350,EVA TAISOX 7350,Yes
109,DOP,DOP,Yes
78,EVA TAISOX 7350,EVA TAISOX 7350,Yes
75,TAISOX,TAISOX 7350,No
9,EVA TAISOX,EVA TAISOX,Yes
286,HDPE TAISOX,HDPE TAISOX,Yes
182,GASOIL,GASOIL,Yes


# claculate acc and jaccard

In [71]:
def get_acc(df,t=1):
    correct = []
    correct_label = []
    for i in df.index:
        jac = get_jaccard_sim(df.loc[i,'label'],df.loc[i,'predict:'])
        if jac >= t:
            correct.append('yes')
        else:
            correct.append('no')
    a = pd.Series(correct)
    return a.value_counts()['yes']/len(a)

In [72]:
print('jaccard_avg_score:',np.mean([ get_jaccard_sim(table['label'].loc[i],table['predict:'].loc[i]) for i in table.index]))
print('acc:',get_acc(table,1))
print('放水acc:',get_acc(table,0.75))

jaccard_avg_score: 0.8777777777777779
acc: 0.8166666666666667
放水acc: 0.8166666666666667


In [73]:
torch.save(model.state_dict(),'../models/Product_Data_SQuAD_model_產品.pt')