In [146]:
import pandas as pd
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from torch.optim.optimizer import Optimizer
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import random
import torch
from transformers import pipeline
import warnings 
warnings.filterwarnings('ignore')
from pytorch_lightning import seed_everything
from torch.utils.data import DataLoader
import os
import gc

In [147]:
def set_seed(seed: int = 42):
    '''Seed every random number generator used by the notebook for REPRODUCIBILITY.

    Seeds Python's ``random``, NumPy's global generator, PyTorch (CPU and all
    CUDA devices), sets ``PYTHONHASHSEED``, switches cuDNN to deterministic
    mode, and finally calls ``seed_everything``.

    Args:
        seed: integer seed applied to every generator.

    Returns:
        A ``np.random.RandomState`` built from ``seed`` that callers can pass
        around explicitly.
    '''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    seed_everything(seed)
    return random_state
random_state = set_seed(42)

Global seed set to 42


In [148]:
import math
from typing import TYPE_CHECKING, Any, Callable, Optional

if TYPE_CHECKING:
    from torch.optim.optimizer import _params_t
else:
    _params_t = Any

class MADGRAD(Optimizer):
    """MADGRAD optimizer implemented on top of ``torch.optim.Optimizer``.

    Per parameter the optimizer keeps:

    * ``grad_sum_sq`` -- running sum of ``lamb * grad * grad``
    * ``s``           -- running sum of ``lamb * grad``
    * ``x0``          -- a clone of the parameter taken at the first step
      (only stored when ``momentum != 0``)

    where ``lamb = (lr + eps) * sqrt(k + 1)`` and ``k`` is the number of
    completed steps. Each step forms ``z = x0 - s / (grad_sum_sq ** (1/3) + eps)``
    and then sets ``p = momentum * p + (1 - momentum) * z`` (or ``p = z`` when
    ``momentum == 0``).

    Args:
        params: iterable of parameters or parameter-group dicts.
        lr: learning rate, must be > 0.
        momentum: momentum value, must satisfy ``0 <= momentum < 1``.
        weight_decay: L2 penalty coefficient added to the gradient, must be >= 0.
        eps: small constant added to the denominator (and to ``lr``), must be >= 0.

    Raises:
        ValueError: if any hyper-parameter is outside its valid range.
    """

    def __init__(
        self, params: _params_t, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, eps: float = 1e-6,
    ):
        if momentum < 0 or momentum >= 1:
            # The check rejects 1, so the reported interval is half-open.
            raise ValueError(f"Momentum {momentum} must be in the range [0,1)")
        if lr <= 0:
            raise ValueError(f"Learning rate {lr} must be positive")
        if weight_decay < 0:
            raise ValueError(f"Weight decay {weight_decay} must be non-negative")
        if eps < 0:
            raise ValueError("Eps must be non-negative")

        defaults = dict(lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay)
        super().__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self) -> bool:
        # NOTE(review): looks like a capability flag read by an external
        # training framework; nothing in this file consumes it -- confirm.
        return False

    @property
    def supports_flat_params(self) -> bool:
        # NOTE(review): same as above -- capability flag for an external caller.
        return True

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss; it is called once before the update.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was given.

        Raises:
            RuntimeError: if a gradient is sparse while ``momentum != 0`` or
                ``weight_decay != 0``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # Global step counter, kept in the optimizer state under the key 'k'.
        if 'k' not in self.state:
            self.state['k'] = torch.tensor([0], dtype=torch.long)
        k = self.state['k'].item()

        for group in self.param_groups:
            eps = group["eps"]
            # eps is folded into the step size as well as into the denominators below.
            lr = group["lr"] + eps
            decay = group["weight_decay"]
            momentum = group["momentum"]

            ck = 1 - momentum
            # Weight given to this step's gradient; grows with sqrt(k + 1).
            lamb = lr * math.pow(k + 1, 0.5)

            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # Lazily create the per-parameter accumulators on first use.
                if "grad_sum_sq" not in state:
                    state["grad_sum_sq"] = torch.zeros_like(p.data).detach()
                    state["s"] = torch.zeros_like(p.data).detach()
                    if momentum != 0:
                        state["x0"] = torch.clone(p.data).detach()

                if momentum != 0.0 and grad.is_sparse:
                    raise RuntimeError("momentum != 0 is not compatible with sparse gradients")

                grad_sum_sq = state["grad_sum_sq"]
                s = state["s"]

                # Apply weight decay
                if decay != 0:
                    if grad.is_sparse:
                        raise RuntimeError("weight_decay option is not compatible with sparse gradients")

                    # Out-of-place add: p.grad itself is left untouched, so the
                    # caller still sees the raw gradient and decay is not
                    # compounded if the same gradient is stepped on again.
                    grad = grad.add(p.data, alpha=decay)

                if grad.is_sparse:
                    grad = grad.coalesce()
                    grad_val = grad._values()

                    # Restrict p and the accumulators to the gradient's non-zero pattern.
                    p_masked = p.sparse_mask(grad)
                    grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad)
                    s_masked = s.sparse_mask(grad)

                    # Compute x_0 from other known quantities
                    rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
                    x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1)

                    # Dense + sparse op
                    grad_sq = grad * grad
                    grad_sum_sq.add_(grad_sq, alpha=lamb)
                    grad_sum_sq_masked.add_(grad_sq, alpha=lamb)

                    rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps)

                    s.add_(grad, alpha=lamb)
                    s_masked._values().add_(grad_val, alpha=lamb)

                    # update masked copy of p
                    p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1)
                    # Copy updated masked p to dense p using an add operation
                    p_masked._values().add_(p_kp1_masked_vals, alpha=-1)
                    p.data.add_(p_masked, alpha=-1)
                else:
                    if momentum == 0:
                        # Compute x_0 from other known quantities
                        rms = grad_sum_sq.pow(1 / 3).add_(eps)
                        x0 = p.data.addcdiv(s, rms, value=1)
                    else:
                        x0 = state["x0"]

                    # Accumulate second moments
                    grad_sum_sq.addcmul_(grad, grad, value=lamb)
                    rms = grad_sum_sq.pow(1 / 3).add_(eps)

                    # Update s
                    s.data.add_(grad, alpha=lamb)

                    # Step
                    if momentum == 0:
                        p.data.copy_(x0.addcdiv(s, rms, value=-1))
                    else:
                        z = x0.addcdiv(s, rms, value=-1)

                        # p is a moving average of z
                        p.data.mul_(1 - ck).add_(z, alpha=ck)


        self.state['k'] += 1
        return loss

In [149]:
# Product task: keep the raw '45A' text and its label, strip the 'x 000 D'
# marker (presumably leftover carriage-return escapes -- confirm), and rename
# the two columns to the product-specific names.
df1 = (
    pd.read_csv('preprocess_for_SQUAD_product.csv', index_col=0)[['45A', 'Y_label']]
    .assign(**{'45A': lambda frame: frame['45A'].map(lambda text: text.replace('x 000 D', ''))})
    .rename(columns={'45A': '產品X', 'Y_label': '產品Y'})
)
df1.sample(10)

Unnamed: 0,產品X,產品Y
890,PROPYLENE COPOLYMER RESIN 1 GRADE NO K8003 50...,PROPYLENE
2667,COMMODITY HDPE 8010 QUANTITY 175 MTS UNI T PR...,PACK
1559,PVC TRANSPARENT SHEET TTL 19185 M 03 MMX 1890 ...,PVC TRANSPARENT SHEET
331,PVC SUSPENSION RESIN GRADE S65 S QT Y 150 MT P...,PVC SUSPENSION RESIN
3164,COVERING 100 PERCENT OF INVOICE VALUE OF GOODS...,PROPYLENE
2455,TAIRILAC ABS RESIN S GRADE NO AG 15A1 - H NATU...,ABS RESIN
3347,TRADE TERMS INC O TERMS 2010 CFR HAI PHONG POR...,CHEMICALS
1403,TRADE TERMS INC O TERMS 2010 CFR HAI PHONG POR...,CHEMICALS
2128,COMMODITY GENERAL PURPOSE POLYSTYRENE GPPS GRA...,GENERAL PURPOSE POLYSTYRENE GPPS
1436,TRADE TERM FOB ANY PORT IN TAIWAN COMMODITY 2...,EVA


In [150]:
# Applicant task: keep the raw text and its label, strip the 'x000D' marker
# (presumably leftover carriage-return escapes -- confirm), and rename the two
# columns to the applicant-specific names.
df2 = (
    pd.read_csv('preprocess_for_SQUAD_開狀人.csv', index_col=0)[['string_X', 'Y_label']]
    .assign(string_X=lambda frame: frame['string_X'].map(lambda text: text.replace('x000D', '')))
    .rename(columns={'string_X': '開狀人X', 'Y_label': '開狀人Y'})
)
df2.sample(10)

Unnamed: 0,開狀人X,開狀人Y
7255,BRENNTAG CANADA INC43 JUTLAND ROADTORONTO ON M...,BRENNTAG CANADA INC
2857,GICAR SPAVIA FERDINANDO DI SAVOIA 220124 MILAN...,GICAR SPA
1313,POLYGROUP TRADING LIMITED606 FAIRMONT HOUSE8 C...,POLYGROUP TRADING LIMITED
1125,HARIS AL AFAQ LTDP O BOX 61394 DUBAIJEBEL ALIU...,HARIS AL AFAQ LTD
4046,KENKO CORPORATION4F 3-1-2 IWAMOTO-CHO CHIYODA-...,KENKO CORPORATION
1254,SUMI TECHNOLOGYGUANGDONGCO LTDROOM 3103RD FLOO...,SUMI TECHNOLOGYGUANGDONGCO LTD
4700,SUPREME PETROCHEM LTDREFER FIELD 47A POINT NO 8,SUPREME PETROCHEM LTD
4928,VINMAR INTERNATIONAL LLC16825 NORTHCASE DRIVE ...,VINMAR INTERNATIONAL LLC
6861,HAI PHONG PAINT JOINT STOCK CO12 LACH TRAY STR...,HAI PHONG PAINT JOINT STOCK CO
6774,ORIENT INTERNATIONAL HOLDINGSHANGHAI FOREIGN T...,ORIENT INTERNATIONAL HOLDING


In [151]:
# Bank task: keep the raw text and its label, strip the 'x000D' marker
# (presumably leftover carriage-return escapes -- confirm), and rename the two
# columns to the bank-specific names.
df3 = (
    pd.read_csv('preprocess_for_SQUAD_bank.csv', index_col=0)[['string_X_train', 'Y_label']]
    .assign(string_X_train=lambda frame: frame['string_X_train'].map(lambda text: text.replace('x000D', '')))
    .rename(columns={'string_X_train': '銀行X', 'Y_label': '銀行Y'})
)
df3

Unnamed: 0,銀行X,銀行Y
16,APPLICANTS ADDRESS19F HAICANG BUSINESS BUILDIN...,CTBC BANK CO LTD
22,THIS LC IS PAYABLE 30 DAYS AFTER BL DATE BL DA...,CTBC BANK CO LTD
23,THIS LC IS PAYABLE 30 DAYS AFTER BL DATE BL DA...,CTBC BANK CO LTD
30,1INSURANCE TO BE COVERED BY APPLICANT2CHARTER ...,CHANG HWA COMMERCIAL BANK LTD
41,ALL DOCUMENTS TO INDICATE THE NUMBER AND DATE ...,STANDARD CHARTERED BANK
...,...,...
8264,APPLICANTS ADDRESS NO33 KEFENG ROAD SCIENCE CI...,CTBC BANK CO LTD
8266,APPLICANTS ADDRESS NO33 KEFENG ROAD SCIENCE CI...,CTBC BANK CO LTD
8274,APPLICANTS ADDRESS NO33 KEFENG ROAD SCIENCE CI...,CTBC BANK CO LTD
8275,APPLICANTS ADDRESS NO33 KEFENG ROAD SCIENCE CI...,CTBC BANK CO LTD


In [152]:
# Line the three tasks up on their shared row index (left joins starting from
# the product table) and keep only rows where every task has both text and label.
df_m = (
    df1
    .join(df2)
    .join(df3)
    .dropna(axis='index')
)
df_m

Unnamed: 0,產品X,產品Y,開狀人X,開狀人Y,銀行X,銀行Y
286,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001,KINGFA SCI AND TECH CO LTDADDRESS NO33 KEFENG ...,KINGFA SCI AND TECH CO LTD,UPON OUR RECEIPT OF THE DOCUMENTS IN ORDER WE ...,BANK OF CHINA LTD
287,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001,LANDMARK INTL CORPSUITE 310GRIFFITH CORPORATE ...,LANDMARK INTL CORP,1 SIGNED COMMERCIAL INVOICE IN 3 COPIES INDICA...,TAICHUNG COMMERCIAL BANK
310,EVA TAISOX 7360 M 12 MT USD 2625 MT EVA TAISOX...,EVA TAISOX,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
311,300000 BB LS - 10 P C T OF GASOIL 005 PCT S TE...,GASOIL,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
312,CONTRACT NO H 0 L X 75 COMMODITY EVA TAISOX 7...,EVA TAISOX 7350,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
...,...,...,...,...,...,...
3576,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN,PTINAWAN CHEMTEX SUKSES ABADIPLS SEE FIELD 47A,PTINAWAN CHEMTEX SUKSES ABADI,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA
3579,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,RESIN,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK
3581,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK
3583,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK


In [153]:
# Product columns of the merged table. Take an explicit copy: columns are added
# to this frame later, and doing that on a bare iloc slice of df_m is chained
# assignment (SettingWithCopyWarning, behaviour depends on pandas version).
dfm1 = df_m.iloc[:, :2].copy()
dfm1

Unnamed: 0,產品X,產品Y
286,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001
287,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001
310,EVA TAISOX 7360 M 12 MT USD 2625 MT EVA TAISOX...,EVA TAISOX
311,300000 BB LS - 10 P C T OF GASOIL 005 PCT S TE...,GASOIL
312,CONTRACT NO H 0 L X 75 COMMODITY EVA TAISOX 7...,EVA TAISOX 7350
...,...,...
3576,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN
3579,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,RESIN
3581,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN
3583,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN


In [154]:
# Applicant columns of the merged table. Take an explicit copy: columns are
# added to this frame later, and doing that on a bare iloc slice of df_m is
# chained assignment (SettingWithCopyWarning, behaviour depends on pandas version).
dfm2 = df_m.iloc[:, 2:4].copy()
dfm2

Unnamed: 0,開狀人X,開狀人Y
286,KINGFA SCI AND TECH CO LTDADDRESS NO33 KEFENG ...,KINGFA SCI AND TECH CO LTD
287,LANDMARK INTL CORPSUITE 310GRIFFITH CORPORATE ...,LANDMARK INTL CORP
310,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD
311,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD
312,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD
...,...,...
3576,PTINAWAN CHEMTEX SUKSES ABADIPLS SEE FIELD 47A,PTINAWAN CHEMTEX SUKSES ABADI
3579,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP
3581,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP
3583,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP


In [155]:
# Bank columns of the merged table. Take an explicit copy: columns are added
# to this frame later, and doing that on a bare iloc slice of df_m is chained
# assignment (SettingWithCopyWarning, behaviour depends on pandas version).
dfm3 = df_m.iloc[:, 4:].copy()
dfm3

Unnamed: 0,銀行X,銀行Y
286,UPON OUR RECEIPT OF THE DOCUMENTS IN ORDER WE ...,BANK OF CHINA LTD
287,1 SIGNED COMMERCIAL INVOICE IN 3 COPIES INDICA...,TAICHUNG COMMERCIAL BANK
310,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
311,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
312,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD
...,...,...
3576,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA
3579,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK
3581,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK
3583,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK


In [156]:
def str2index(context,string):
    '''Locate the first occurrence of ``string`` inside ``context``.

    Args:
        context: text to search in.
        string: substring to look for.

    Returns:
        ``(start, end)`` character offsets such that
        ``context[start:end] == string``. When ``string`` does not occur in
        ``context`` the sentinel ``(-1, -1)`` is returned.
    '''
    ys = context.find(string)
    if ys < 0:
        # str.find signals "not found" with -1; adding len(string) to it would
        # produce a meaningless end offset, so keep the sentinel on both sides.
        return -1, -1
    return ys, ys + len(string)

In [157]:
def addysye(df):
    '''Return a copy of ``df`` with answer-span offsets appended.

    The first column is treated as the context text and the second column as
    the answer string. For every row, ``str2index`` is used to find the answer
    inside the context.

    Args:
        df: DataFrame whose column 0 holds the context and column 1 the answer.

    Returns:
        A new DataFrame with two extra columns, ``string_Y_1`` (start offset)
        and ``string_Y_2`` (end offset). The input frame is not modified.
    '''
    # Work on a copy so a slice/view passed in by the caller is never written to.
    df = df.copy()
    spans = [
        str2index(context, answer)
        for context, answer in zip(df.iloc[:, 0], df.iloc[:, 1])
    ]
    df['string_Y_1'] = [start for start, _ in spans]
    df['string_Y_2'] = [end for _, end in spans]
    return df

In [158]:
# Append the answer start/end offsets (string_Y_1 / string_Y_2) to the product table.
dfm1 = addysye(dfm1)
dfm1

Unnamed: 0,產品X,產品Y,string_Y_1,string_Y_2
286,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001,50,59
287,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001,50,59
310,EVA TAISOX 7360 M 12 MT USD 2625 MT EVA TAISOX...,EVA TAISOX,0,10
311,300000 BB LS - 10 P C T OF GASOIL 005 PCT S TE...,GASOIL,27,33
312,CONTRACT NO H 0 L X 75 COMMODITY EVA TAISOX 7...,EVA TAISOX 7350,34,49
...,...,...,...,...
3576,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN,62,70
3579,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,RESIN,69,74
3581,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN,62,70
3583,DESCRIPTION OF GOODS QUANTITY MT UNIT PRICE US...,PC RESIN,62,70


In [159]:
# Append the answer start/end offsets (string_Y_1 / string_Y_2) to the applicant table.
dfm2 = addysye(dfm2)
dfm2

Unnamed: 0,開狀人X,開狀人Y,string_Y_1,string_Y_2
286,KINGFA SCI AND TECH CO LTDADDRESS NO33 KEFENG ...,KINGFA SCI AND TECH CO LTD,0,26
287,LANDMARK INTL CORPSUITE 310GRIFFITH CORPORATE ...,LANDMARK INTL CORP,0,18
310,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD,0,11
311,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD,0,11
312,IMANAKA LTD12-1 GOBAN-CHOCHIYODA-KU TOKYO JAPAN,IMANAKA LTD,0,11
...,...,...,...,...
3576,PTINAWAN CHEMTEX SUKSES ABADIPLS SEE FIELD 47A,PTINAWAN CHEMTEX SUKSES ABADI,0,29
3579,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP,0,8
3581,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP,0,8
3583,DAP CORP404-1 BOCHE-RI MIYANG-MYEONANSEONG-CIT...,DAP CORP,0,8


In [160]:
# Append the answer start/end offsets (string_Y_1 / string_Y_2) to the bank table.
dfm3 = addysye(dfm3)
dfm3

Unnamed: 0,銀行X,銀行Y,string_Y_1,string_Y_2
286,UPON OUR RECEIPT OF THE DOCUMENTS IN ORDER WE ...,BANK OF CHINA LTD,143,160
287,1 SIGNED COMMERCIAL INVOICE IN 3 COPIES INDICA...,TAICHUNG COMMERCIAL BANK,126,150
310,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,123,136
311,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,123,136
312,REIMBURSEMENT BY TELECOMMUNICATION IS PROHIBIT...,MUFG BANK LTD,123,136
...,...,...,...,...
3576,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA,199,216
3579,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK,115,137
3581,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK,115,137
3583,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK,115,137


In [161]:
# Tag every row with the task it belongs to, so the source task stays
# identifiable once the three tables share one schema.
for _frame, _label in ((dfm1, '產品'), (dfm2, '開狀人'), (dfm3, '銀行')):
    _frame['type'] = _label

In [162]:
# Give the three per-task tables one shared schema so they can be stacked.
dfm1.columns = ['string_X_train','Y_label','string_Y_1','string_Y_2','type']
dfm2.columns = ['string_X_train','Y_label','string_Y_1','string_Y_2','type']
dfm3.columns = ['string_X_train','Y_label','string_Y_1','string_Y_2','type']
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows in the same order and keeps the original index.
df = pd.concat([dfm1, dfm2, dfm3])
df.shape

(1539, 5)

In [163]:
# Preview the stacked table (columns: text, label, start/end offsets, task type).
df

Unnamed: 0,string_X_train,Y_label,string_Y_1,string_Y_2,type
286,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001,50,59,產品
287,HDPE 8010 QT Y 900 MT AT USD 127000 MT H 0 FX ...,HDPE 8001,50,59,產品
310,EVA TAISOX 7360 M 12 MT USD 2625 MT EVA TAISOX...,EVA TAISOX,0,10,產品
311,300000 BB LS - 10 P C T OF GASOIL 005 PCT S TE...,GASOIL,27,33,產品
312,CONTRACT NO H 0 L X 75 COMMODITY EVA TAISOX 7...,EVA TAISOX 7350,34,49,產品
...,...,...,...,...,...
3576,1 SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS AND...,BANK CENTRAL ASIA,199,216,銀行
3579,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK,115,137,銀行
3581,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK,115,137,銀行
3583,SIGNED COMMERCIAL INVOICE IN 3 COPIESFULL SET ...,KOREA DEVELOPMENT BANK,115,137,銀行
