In [1]:
import os
import sys
from time import strftime, localtime
from collections import Counter
from config import opt
from pytorch_transformers import BertTokenizer
import random
import models
from utils import get_dataloader
from seqeval.metrics import f1_score, accuracy_score, classification_report
from tqdm import tqdm
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import numpy as np 
import torch 
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))



ModuleNotFoundError: No module named 'pytorch_transformers'

In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.is_available())

cpu
False


In [4]:
def train(**kwargs):
    torch.cuda.empty_cache()
    log_file = '{}-{}.log'.format(opt.model, strftime("%y%m%d-%H%M", localtime()))
    logger.addHandler(logging.FileHandler(log_file))

    att_list = ["brand"]

    tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
    tags2id = {'':0,'B':1,'I':2,'O':3}
    id2tags = {v:k for k,v in tags2id.items()}

    opt._parse(kwargs)

    if opt.seed is not None:
        random.seed(opt.seed)
        np.random.seed(opt.seed)
        torch.manual_seed(opt.seed)
        torch.cuda.manual_seed(opt.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
   
    # step1: configure model
    model = getattr(models, opt.model)(opt)
    if opt.load_model_path:
        the_model = torch.load(PATH)
    model.to(opt.device)

    # step2: data
    train_dataloader,valid_dataloader,test_dataloader = get_dataloader(opt)
    
    # step3: criterion and optimizer
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
    lr = opt.lr
    optimizer = model.get_optimizer(lr, opt.weight_decay)
    

    # step4 train
    for epoch in range(opt.max_epoch):
        model.train()
        for ii,batch in tqdm(enumerate(train_dataloader)):
            
            # train model
            optimizer.zero_grad()
            x = batch['x'].to(opt.device)
            y = batch['y'].to(opt.device)
            att = batch['att'].to(opt.device)
            inputs = [x, att, y]
            loss = model.log_likelihood(inputs)
            loss.backward()
            #CRF
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=3)
            optimizer.step()
            if ii % opt.print_freq == 0:
                print('epoch:%04d,------------loss:%f'%(epoch,loss.item()))

    torch.save(model.state_dict(), f'data/finetuned_BERT_epoch_model.pth')
    preds, labels = [], []
    model.eval()
    with torch.no_grad():
        for index, batch in enumerate(valid_dataloader):

            x = batch['x'].to(opt.device)
            y = batch['y'].to(opt.device)
            att = batch['att'].to(opt.device)
            inputs = [x, att, y]
            predict = model(inputs)
            
            predict_list = predict.tolist()[0] 

            # 统计非0的，也就是真实标签的长度
            leng = []
            for i in y.cpu():
                # Check the device of the tensor
                tmp = []
                for j in i:
                    if j.item()>0:
                        tmp.append(j.item())
                leng.append(tmp)


            for index, i in enumerate(predict_list):
                preds.append([id2tags[k] if k>0 else id2tags[3] for k in i[:len(leng[index])]])
                # preds += i[:len(leng[index])]

            for index, i in enumerate(y.tolist()):
                labels.append([id2tags[k] if k>0 else id2tags[3] for k in i[:len(leng[index])]])
                #labels += i[:len(leng[index])]
        #precision = precision_score(labels, preds, average='macro')
        #recall = recall_score(labels, preds, average='macro')
        #f1 = f1_score(labels, preds, average='macro')
        report = classification_report(labels, preds)
        print(report)
        logger.info(report)

In [5]:
train()

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at C:\Users\madri\.cache\torch\pytorch_transformers\5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at C:\Users\madri\.cache\torch\pytorch_transformers\b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_label

  mask = torch.tensor(context != 0, dtype=torch.uint8)
  score = torch.where(mask[i].unsqueeze(1), next_score, score)
1it [00:03,  3.55s/it]

epoch:0000,------------loss:51.560589


36it [00:18,  1.99it/s]


KeyboardInterrupt: 

In [None]:
# step5: evaluation on test data
model = getattr(models, opt.model)(opt)

model.load_state_dict(torch.load(f'data/finetuned_BERT_epoch_model.pth'))
model.to(opt.device)

# step2: data
train_dataloader,valid_dataloader,test_dataloader = get_dataloader(opt)
tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)

preds, labels = [], []
model.eval()
with torch.no_grad():
    for index, batch in enumerate(test_dataloader):
            x = batch['x'].to(opt.device)
            y = batch['y'].to(opt.device)
            att = batch['att'].to(opt.device)
            
            inputs = [x, att]
            predict = model(inputs)
            
            print(inputs)
            

            
            predict_list = predict.tolist()[0] 

            
            
            for i in range(len(batch)):
                start_p, end_p, start_y, end_y = 0,0,0,0
                for index,value in enumerate(predict_list[i]):
                    if value == 1:
                        start_p = index
                        j = index
                        while(predict_list[i][j]!=3):
                            j = j + 1
                            end_p = j

                for index,value in enumerate(y[i]):
                    if value == 1:
                        start_y = index
                        j = index
                        while(y[i][j]!=3):
                            j = j + 1
                            end_y = j
                
                print(start_p)
                preds = (x[i][start_p : end_p])
                words_p = tokenizer.convert_ids_to_tokens([i.item() for i in preds.cpu() if i.item()>0])
                labels = (x[i][start_y : end_y])
                words_l = tokenizer.convert_ids_to_tokens([i.item() for i in labels.cpu() if i.item()>0])
            
                words = tokenizer.convert_ids_to_tokens([i.item() for i in x[i].cpu() if i.item()>0])
                print('\nTitle: ',' '.join(words))
                print('\nAttribute: ',tokenizer.convert_ids_to_tokens([i.item() for i in att[i].cpu() if i.item()>0]))
                print('\nPred label: ',' '.join(words_p), ' ')
                print('\nActual label: ', ' '.join(words_l))
    

            
           


In [5]:
import json

from os import path

filename = 'sam.json'

json_data = []

    

# Read JSON file
with open(filename) as fp:
  json_data = json.load(fp)

In [6]:
title = json_data["input_data"]["title"]
att = json_data["input_data"]["attributes"]

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def is_english_char(cp):
    """Checks whether CP is the codepoint of an English character."""
    if (
        (cp >= 0x0041 and cp <= 0x005A)
        or (cp >= 0x0061 and cp <= 0x007A)  # uppercase A-Z
        or (cp >= 0x00C0 and cp <= 0x00FF)  # lowercase a-z
        or (cp >= 0x0100 and cp <= 0x017F)  # Latin-1 Supplement
        or (cp >= 0x0180 and cp <= 0x024F)  # Latin Extended-A
        or (cp >= 0x1E00 and cp <= 0x1EFF)  # Latin Extended-B
        or (cp >= 0x2C60 and cp <= 0x2C7F)  # Latin Extended Additional
        or (cp >= 0xA720 and cp <= 0xA7FF)  # Latin Extended-C
        or (cp >= 0xAB30 and cp <= 0xAB6F)  # Latin Extended-D
        or (cp >= 0xFB00 and cp <= 0xFB06)  # Latin Extended-E
    ):  # Alphabetic Presentation Forms
        return True

    return False

max_len = 40

def X_padding(ids):
    if len(ids) >= max_len:
        return ids[:max_len]
    ids.extend([0] * (max_len - len(ids)))
    return ids

tag_max_len = 6

def tag_padding(ids):
    if len(ids) >= tag_max_len:
        return ids[:tag_max_len]
    ids.extend([0] * (tag_max_len - len(ids)))
    return ids

def nobert4token(tokenizer, title, attribute):
    def get_char(sent):
        tmp = []
        s = ""
        for char in sent.strip():
            if char.strip():
                cp = ord(char)
                if is_english_char(cp):
                    if s:
                        tmp.append(s)
                    tmp.append(char)
                    s = ""
                else:
                    s += char
            elif s:
                tmp.append(s)
                s = ""
        if s:
            tmp.append(s)
        return tmp

    title_list = get_char(title)
    attribute_list = get_char(attribute)

    title_list = tokenizer.convert_tokens_to_ids(title_list)
    attribute_list = tokenizer.convert_tokens_to_ids(attribute_list)


    return title_list, attribute_list

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\madri\.cache\torch\pytorch_transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [9]:
result = {}

for i in range(len(title)):

    my_dict = {}
    for j in range(len(att)):
        
        attr = att[i][j]

        t, a = nobert4token(tokenizer, title[i], attr)

        x = X_padding(t)
        y = tag_padding(a)

        tensor_a = torch.tensor(y, dtype=torch.int32)
        tensor_a = torch.unsqueeze(tensor_a, dim=0).to('cuda')

        tensor_t = torch.tensor(x, dtype=torch.int32)
        tensor_t = torch.unsqueeze(tensor_t, dim=0).to('cuda')

        output = model([tensor_t, tensor_a])

        predict_list = output.tolist()[0]

        for k in range(len(predict_list)):
            start_p, end_p = 0, 0
            for index, value in enumerate(predict_list[k]):
                if value == 1:
                    start_p = index
                    ind = index
                    while predict_list[k][ind] != 3:
                        ind = ind + 1
                        end_p = ind
            preds = tensor_t[k][start_p:end_p]
            words_p = tokenizer.convert_ids_to_tokens(
                [k.item() for k in preds.cpu() if k.item() > 0])
        
        my_dict[attr] = " ".join(words_p)
        
        str =  " ".join(words_p)
        print(type(str))

    result[title[i]] = my_dict



  score = torch.where(mask[i].unsqueeze(-1), next_score, score)


<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [10]:
result

{' Lee posh Lactic Acid 60% Anti ageing Pigmentation Removing Glow Peel ': {'brand': '[UNK]',
  'hello': '[UNK]'},
 ' Generic Anti Snoring Snore Stopper Sleep Apnea Solution Lips Plasters Soft Space Cotton ': {'brand': '[UNK]',
  'hello': '[UNK]'}}

In [None]:
title_input = tensor_t.to('cuda')

In [None]:
model([title_input,tensor_a])

In [None]:
output = model([title_input,tensor_a.to('cuda')])

predict_list = output.tolist()[0]



In [11]:
predict_list

[[1,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3]]

In [None]:
att = ["brand"]

In [4]:
model = getattr(models, opt.model)(opt)
model.load_state_dict(torch.load(f'data/finetuned_BERT_epoch_model.pth'))
model.to(opt.device)

loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at C:\Users\madri\.cache\torch\pytorch_transformers\b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_m

OpenTag2019(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
for i in range(len(predict_list)):
                start_p, end_p= 0,0
                for index,value in enumerate(predict_list[i]):
                    if value == 1:
                        start_p = index
                        j = index
                        while(predict_list[i][j]!=3):
                            j = j + 1
                            end_p = j
                print(start_p)
                preds = (title_input.to('cuda')[i][start_p : end_p])
                print(preds)
                words_p = tokenizer.convert_ids_to_tokens([i.item() for i in preds.cpu() if i.item()>0])
                print(words_p)
            

                print('\nTitle: ',title)
                print('\nAttribute: ',att[i])
                print('\nPred label: ',' '.join(words_p), ' ')

