In [None]:
# Print the status of the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
from google.colab import drive
import sys
# Mount Google Drive at /content/drive so the dataset and model files are reachable.
drive.mount('/content/drive')
# Add the notebook folder to the module search path.
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

In [None]:
# Install the transformers library, pinned to version 4.0.1.
! pip install transformers==4.0.1

In [None]:
# Install PyTorch, pinned to version 1.6.0.
! pip install  torch==1.6.0

In [None]:
# Install torchvision, pinned to version 0.7.0.
! pip install torchvision==0.7.0

In [None]:
# Install NumPy, pinned to version 1.17.0.
! pip install numpy==1.17.0

## logging
Transformers 有一个集中的日志记录系统，因此您可以轻松设置库的详细程度。
目前该库的默认详细程度是 `WARNING`。
* `transformers.logging.CRITICAL` 或 `transformers.logging.FATAL`（整数值 50）：只报告最严重的错误。
* `transformers.logging.ERROR`（整数值 40）：只报告错误。
* `transformers.logging.WARNING` 或 `transformers.logging.WARN`（整数值 30）：只报告错误和警告。这是库使用的默认级别。
* `transformers.logging.INFO`（整数值 20）：报告错误、警告和基本信息。
* `transformers.logging.DEBUG`（整数值 10）：报告所有信息。

In [None]:
import random
import json
import pandas as pd
import numpy as np
import os
import torch
# The logging helpers used below (set_verbosity_info) are provided by the
# `transformers` package; the singular `transformer` is not the installed library.
from transformers import logging
from transformers import BertTokenizer
from collections import defaultdict
from tqdm import tqdm
def seed_everything(seed):
  """Seed every random number generator this notebook relies on.

  The seed is applied, in order, to Python's ``random`` module, NumPy's
  global generator, PyTorch's default generator and all CUDA devices.

  Args:
    seed: Integer seed handed to each generator.

  Returns:
    The ``seed`` value that was passed in, unchanged.
  """
  seeders = (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)
  return seed

# Set the logging verbosity to INFO, then seed all random number generators
# with a fixed value so that runs are repeatable.
logging.set_verbosity_info()
seed_everything(2022)

In [None]:
# Directory that read_data joins with train.json / dev.json / test.json.
corpus_path = '/content/drive/MyDrive/Colab Notebooks/dataset/ESIM'
# Directory passed to BertTokenizer.from_pretrained.
model_path = '/content/drive/MyDrive/Colab Notebooks/dataset/BERT_model'
# Output location (the current working directory).
output_dir = '.'

In [None]:
# Load the BERT tokenizer from the files stored under model_path.
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
def parse_data(path, data_type='train'):
  """Load a JSON-lines sentence-pair file into a DataFrame.

  Every non-blank line must be a JSON object with ``sentence1`` and
  ``sentence2`` keys. Unless ``data_type`` is ``'test'`` it must also carry a
  ``label`` value that ``int()`` accepts.

  Args:
    path: Path of the UTF-8 encoded file holding one JSON object per line.
    data_type: Name of the split. It appears in the progress-bar text, and
      the value ``'test'`` additionally means the file has no labels, so 0 is
      stored as a placeholder label for every row.

  Returns:
    pandas.DataFrame with the columns ``text_a``, ``text_b`` and ``labels``,
    one row per record, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    KeyError: If a record lacks one of the required keys.
  """
  has_labels = data_type != 'test'
  columns = {'text_a': [], 'text_b': [], 'labels': []}

  with open(path, 'r', encoding='utf8') as f:
    # Drop blank lines (e.g. a trailing empty line) up front: json.loads would
    # raise on them, and filtering first keeps the progress-bar total equal to
    # the number of real records.
    raw_lines = [raw for raw in f if raw.strip()]

  for raw in tqdm(raw_lines, desc=f'Reading {data_type} data'):
    record = json.loads(raw)
    columns['text_a'].append(record['sentence1'])
    columns['text_b'].append(record['sentence2'])
    columns['labels'].append(int(record['label']) if has_labels else 0)

  return pd.DataFrame(columns)

In [None]:
def read_data(config, tokenizer):
  """Tokenize every sentence pair from the train, dev and test splits.

  The split files ``train.json``, ``dev.json`` and ``test.json`` are read from
  the module-level ``corpus_path`` directory with ``parse_data``, stacked into
  one frame, and each row is encoded with ``tokenizer.encode_plus``.

  Args:
    config: Currently unused; kept so that existing callers keep working.
    tokenizer: Object whose ``encode_plus`` returns a mapping containing
      ``input_ids``, ``token_type_ids`` and ``attention_mask``.

  Returns:
    defaultdict(list) with the keys ``input_ids``, ``token_type_ids`` and
    ``attention_mask``. Each value holds one entry per row, ordered train
    rows first, then dev, then test.
  """
  split_frames = [
    parse_data(os.path.join(corpus_path, f'{split}.json'), data_type=split)
    for split in ('train', 'dev', 'test')
  ]
  # The combined frame must be captured: DataFrame.append returned a new frame
  # instead of modifying in place (and no longer exists in pandas 2.x), so
  # without this assignment the dev and test rows would never be tokenized.
  all_df = pd.concat(split_frames, ignore_index=True)

  inputs = defaultdict(list)
  sentence_pairs = zip(all_df['text_a'], all_df['text_b'])
  for text_a, text_b in tqdm(sentence_pairs, desc='Preprocessing train data',
                             total=len(all_df)):
    # NOTE(review): the two sentences are joined into a single string, so the
    # tokenizer sees one segment. If a sentence pair was intended, pass them as
    # separate arguments: encode_plus(text_a, text_b, ...) - confirm.
    encoded = tokenizer.encode_plus(text_a + text_b, add_special_tokens=True,
                                    return_token_type_ids=True,
                                    return_attention_mask=True)
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
      inputs[key].append(encoded[key])

  return inputs

In [None]:
# Build the tokenized inputs. corpus_path is passed here as read_data's
# `config` argument.
data = read_data(corpus_path, tokenizer)