In [1]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import numpy as np
from dataset_extractor import *
import torch
model_trained=AutoModelForSequenceClassification.from_pretrained('/data/chenhaohua/PRC_legal/bert-base-legal-chinese-epoch-8')
tokenizer=AutoTokenizer.from_pretrained('/data/chenhaohua/PRC_legal/bert-base-chinese')
model_trained.eval()

  from .autonotebook import tqdm as notebook_tqdm


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [3]:
def bert_inference(batchsize=30):
    train_pair=extract("/data/chenhaohua/PRC_legal_dataset/data_valid.json")
    device=torch.device("cuda",4)
    sum_of_correct=0
    model_trained.to(device) # model to device once
    for i in range(0,len(train_pair['content']),batchsize):
        end_index = min(i + batchsize, len(train_pair['content'])-1)
        infer_batch_tokens=tokenizer(train_pair['content'][i:end_index],padding=True,return_tensors='pt').to(device)
        output=model_trained(**infer_batch_tokens)
        output_logits = output.logits.to("cpu")#must return specific value
        prediction=np.array(torch.argmax(output_logits,dim=1))
        labels=np.array(train_pair["label"][i:end_index])
        sum_of_correct += int(sum(prediction==labels))
        if i%batchsize==0 and i!=0:
            print(f"finished {i} data prediction")
    return sum_of_correct/len(train_pair['content'])

In [4]:
result=bert_inference()
print(result)

14501 lines have been extracted
finished 30 data prediction


OutOfMemoryError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 4; 31.75 GiB total capacity; 29.93 GiB already allocated; 25.50 MiB free; 30.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [4]:
from sklearn.metrics import f1_score, precision_recall_fscore_support
def bert_inference_f1score(batchsize=30):
    train_pair=extract("/data/chenhaohua/PRC_legal_dataset/data_valid.json")
    device=torch.device("cuda",4)
    model_trained.to(device) # model to device once
    all_predictions=[]
    all_labels=[]
    for i in range(0,len(train_pair['content']),batchsize):
        end_index = min(i + batchsize, len(train_pair['content'])-1)
        infer_batch_tokens=tokenizer(train_pair['content'][i:end_index],padding=True,return_tensors='pt').to(device)
        output=model_trained(**infer_batch_tokens)
        output_logits = output.logits.to("cpu")#must return specific value
        prediction=np.array(torch.argmax(output_logits,dim=1))
        labels=np.array(train_pair["label"][i:end_index])
        all_predictions.extend(prediction)
        all_labels.extend(labels)
        if i%batchsize==0 and i!=0:
            print(f"finished {i} data prediction")
    macro_f1=f1_score(all_labels,all_predictions,average="macro")
    micro_f1=f1_score(all_labels,all_predictions,average="micro")
    return macro_f1,micro_f1


In [5]:
micro,macro=bert_inference_f1score()
print(micro,macro)

14501 lines have been extracted
finished 30 data prediction
finished 60 data prediction
finished 90 data prediction
finished 120 data prediction
finished 150 data prediction
finished 180 data prediction
finished 210 data prediction
finished 240 data prediction
finished 270 data prediction
finished 300 data prediction
finished 330 data prediction
finished 360 data prediction
finished 390 data prediction
finished 420 data prediction
finished 450 data prediction
finished 480 data prediction
finished 510 data prediction
finished 540 data prediction
finished 570 data prediction
finished 600 data prediction
finished 630 data prediction
finished 660 data prediction
finished 690 data prediction
finished 720 data prediction
finished 750 data prediction
finished 780 data prediction
finished 810 data prediction
finished 840 data prediction
finished 870 data prediction
finished 900 data prediction
finished 930 data prediction
finished 960 data prediction
finished 990 data prediction
finished 1020 

In [3]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import numpy as np
from dataset_extractor import *
import torch
def bert_inference_regression(batchsize=120):
    train_pair=extract("/data/chenhaohua/PRC_legal_dataset/data_valid.json")
    device=torch.device("cuda",4)
    model_trained.to(device) # model to device once
    all_predictions=[]
    all_ground_truth=[]
    for i in range(0,len(train_pair['content']),batchsize):
        end_index = min(i + batchsize, len(train_pair['content'])-1)
        infer_batch_tokens=tokenizer(train_pair['content'][i:end_index],padding=True,return_tensors='pt').to(device)
        output=model_trained(**infer_batch_tokens)
        output_logits = output.logits.to("cpu")#must return specific value
        prediction=output_logits.detach().numpy().reshape(-1)
        ground_truth=np.array(train_pair["imprisonment"][i:end_index])
        all_predictions.extend(prediction)
        all_ground_truth.extend(ground_truth)
        if i%batchsize==0 and i!=0:
            print(f"finished {i} data prediction")
    errors=np.absolute(np.log(np.array(all_predictions)+1)-np.log(np.array(all_ground_truth)+1))
    score=0
    for error in errors:
        if error <= 0.2:
            score += 1.0
        elif error <= 0.4:
            score += 0.8
        elif error <= 0.6:
            score += 0.6
        elif error <= 0.8:
            score += 0.4
        elif error <= 1.0:
            score += 0.2
    return score*100/len(train_pair['content'])

In [4]:
model_trained=AutoModelForSequenceClassification.from_pretrained('/data/chenhaohua/PRC_legal/bert-base-legal-chinese-regression-epoch-24')
model_trained.eval()
tokenizer=AutoTokenizer.from_pretrained('/data/chenhaohua/PRC_legal/bert-base-chinese')
with torch.no_grad():
    result=bert_inference_regression()
print(result)

14501 lines have been extracted
finished 120 data prediction
finished 240 data prediction
finished 360 data prediction
finished 480 data prediction
finished 600 data prediction
finished 720 data prediction
finished 840 data prediction
finished 960 data prediction
finished 1080 data prediction
finished 1200 data prediction
finished 1320 data prediction
finished 1440 data prediction
finished 1560 data prediction
finished 1680 data prediction
finished 1800 data prediction
finished 1920 data prediction
finished 2040 data prediction
finished 2160 data prediction
finished 2280 data prediction
finished 2400 data prediction
finished 2520 data prediction
finished 2640 data prediction
finished 2760 data prediction
finished 2880 data prediction
finished 3000 data prediction
finished 3120 data prediction
finished 3240 data prediction
finished 3360 data prediction
finished 3480 data prediction
finished 3600 data prediction
finished 3720 data prediction
finished 3840 data prediction
finished 3960 da