In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.utils.data as Data
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertAdam
from sklearn.metrics import recall_score

#### 数据集读取

In [2]:
data = pd.read_csv('C:/Users/25529/Downloads/jigsaw/train_data.csv', encoding='ANSI')
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,168ac3d396c7d588,if there is a chromosone then e=what is it?Sma...,0,0,0,0,0,0
1,168bc99fa2cfd9aa,Hollywood Undead \n\nI have collected articles...,0,0,0,0,0,0
2,168cd51c24508159,"""\n\n Rollback \n\nI've enabled rollback on yo...",0,0,0,0,0,0
3,168d515e2e99f78d,Another backlog. Thanks. (Trouble?/My Work),0,0,0,0,0,0
4,168d5a1c66f5e8bf,""" - unsigned\n\nWe do include it. This article...",0,0,0,0,0,0


In [3]:
data = data[['comment_text', 'toxic']]
data.head()

Unnamed: 0,comment_text,toxic
0,if there is a chromosone then e=what is it?Sma...,0
1,Hollywood Undead \n\nI have collected articles...,0
2,"""\n\n Rollback \n\nI've enabled rollback on yo...",0
3,Another backlog. Thanks. (Trouble?/My Work),0
4,""" - unsigned\n\nWe do include it. This article...",0


In [4]:
data['toxic'].sum()/len(data['toxic']) #样本存在较为严重的样本失衡问题

0.08838388572285542

## BERT-base pretrained

In [5]:
bert_pre_model='C:/Users/25529/Downloads/bert-base-uncased/pytorch_model.bin'#预训练模型文件
bert_config='C:/Users/25529/Downloads/bert-base-uncased/bert_config.json'#配置文件
bert_pre_tokenizer='C:/Users/25529/Downloads/bert-base-uncased/bert-base-uncased-vocab.txt'#词表

#### trainloader

In [6]:
#提取语句并处理
sentencses=['[CLS] ' + sent + ' [SEP]' for sent in data.comment_text.values]
labels=data.toxic.values
print("第一句话:",sentencses[0])
tokenizer=BertTokenizer.from_pretrained(bert_pre_tokenizer,do_lower_case=True)
tokenized_sents=[tokenizer.tokenize(sent) for sent in sentencses]
print("tokenized的第一句话:",tokenized_sents[0])

第一句话: [CLS] if there is a chromosone then e=what is it?Small Text [SEP]




tokenized的第一句话: ['[CLS]', 'if', 'there', 'is', 'a', 'ch', '##rom', '##oso', '##ne', 'then', 'e', '=', 'what', 'is', 'it', '?', 'small', 'text', '[SEP]']


In [7]:
#定义句子最大长度
MAX_LEN=96
#将分割后的句子转化成数字  word-->idx
input_ids=[tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_sents]
print("转化后的第一个句子:",input_ids[0])

转化后的第一个句子: [101, 2065, 2045, 2003, 1037, 10381, 21716, 19137, 2638, 2059, 1041, 1027, 2054, 2003, 2009, 1029, 2235, 3793, 102]


In [8]:
#PADDING
import keras
input_ids=keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print("Padding 第一个句子:",input_ids[0])

Padding 第一个句子: [  101  2065  2045  2003  1037 10381 21716 19137  2638  2059  1041  1027
  2054  2003  2009  1029  2235  3793   102     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0]


In [9]:
#mask
attention_masks = []
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)
print("第一个attention mask:",attention_masks[0])

第一个attention mask: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [10]:
#划分训练集、验证集
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)
print("训练集的一个inputs",train_inputs[0])
print("训练集的一个mask",train_masks[0])

训练集的一个inputs [  101  2624  9072  6657  5372  7543 10876  2098  2007  1996 14684 28483
  2078  1997  3963  3401  1010  2065  2025  3041  1012  2035  1996  3822
  2301  4933  2001  2053  2936  1996  2613  2518  1012  1011  2831  1013
 10373   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0]
训练集的一个mask [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [11]:
#将训练集、验证集转化成tensor
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

#生成dataloader
batch_size = 16
train_data = Data.TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = Data.RandomSampler(train_data)
train_dataloader = Data.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = Data.TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = Data.SequentialSampler(validation_data)
validation_dataloader = Data.DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [20]:
train_data[0]

(tensor([  101,  2624,  9072,  6657,  5372,  7543, 10876,  2098,  2007,  1996,
         14684, 28483,  2078,  1997,  3963,  3401,  1010,  2065,  2025,  3041,
          1012,  2035,  1996,  3822,  2301,  4933,  2001,  2053,  2936,  1996,
          2613,  2518,  1012,  1011,  2831,  1013, 10373,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0], dtype=torch.int32),
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

### model

In [12]:
modelConfig = BertConfig.from_pretrained(bert_config)
model = BertForSequenceClassification.from_pretrained(bert_pre_model, config=modelConfig)
print(model.cuda())

Some weights of the model checkpoint at C:/Users/25529/Downloads/bert-base-uncased/pytorch_model.bin were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
import torchviz
def modeltorchviz(model,b_input_ids,b_input_mask):    
    y = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]    # 获取网络的预测值
    print(y)
    MyConvNetVis = torchviz.make_dot(y, params=dict(list(model.named_parameters())))
    MyConvNetVis.format = "png"
    # 指定文件生成的文件夹
    MyConvNetVis.directory = "C:/Users/25529/Downloads/"
    # 生成文件
    MyConvNetVis.view() 
# 定义一个网络的输入值
for batch in validation_dataloader:
        batch = tuple(t.to('cuda') for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
b_input_ids = b_input_ids[0:1]
b_labels = b_labels[0:1]
modeltorchviz(model,b_input_ids,b_input_mask)

tensor([[-0.1953,  0.2689],
        [-0.3697,  0.2933],
        [-0.3447,  0.2462]], device='cuda:0', grad_fn=<AddmmBackward0>)


dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.953277 to fit


In [14]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [15]:
#定义一个计算准确率的函数
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

### train

In [16]:
#训练开始
train_loss_set = []#可以将loss加入到列表中，后期画图使用
epochs = 10
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")#判断CUDA是否能使用，不可以就使用CPU
for _ in range(epochs):
    #训练开始
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        #取第一个位置，BertForSequenceClassification第一个位置是Loss，第二个位置是[CLS]的logits
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))
    #模型评估
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Train loss: 0.11156245103163301
Validation Accuracy: 0.9562321173104434
Train loss: 0.09089311386840165
Validation Accuracy: 0.9612839771101573
Train loss: 0.07850029753244975
Validation Accuracy: 0.9603004291845494
Train loss: 0.07947985485582737
Validation Accuracy: 0.9598086552217453
Train loss: 0.07768619991557933
Validation Accuracy: 0.9574839055793991
Train loss: 0.0703096574658591
Validation Accuracy: 0.9584674535050072
Train loss: 0.07421665355146337
Validation Accuracy: 0.9574839055793991
Train loss: 0.07177635951135812
Validation Accuracy: 0.9572156652360515
Train loss: 0.07483614263222664
Validation Accuracy: 0.9575286123032904
Train loss: 0.0667991025641714
Validation Accuracy: 0.9553826895565093


In [51]:
# 保存模型参数
def save_model(the_model, PATH):
    torch.save(the_model, PATH)
# 加载模型参数
def load_model(PATH):
    the_model = torch.load(PATH)

In [53]:
save_model(model, 'C:/Users/25529/Downloads/jigsaw_BERT.h5')

### test

In [35]:
test_data = pd.read_csv('C:/Users/25529/Downloads/jigsaw/test_data.csv', encoding='ISO-8859-1')
test_data = test_data[['comment_text', 'toxic']]
test_data.head()

Unnamed: 0,comment_text,toxic
0,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
1,Hey... what is it..\n@ | talk .\nWhat is it......,1
2,"Bye! \n\nDon't look, come or think of comming ...",1
3,You are gay or antisemmitian? \n\nArchangel WH...,1
4,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1


In [36]:
#提取语句并处理
sentencses=['[CLS] ' + sent + ' [SEP]' for sent in test_data.comment_text.values]
labels=test_data.toxic.values
print("第一句话:",sentencses[0])
tokenizer=BertTokenizer.from_pretrained(bert_pre_tokenizer,do_lower_case=True)
tokenized_sents=[tokenizer.tokenize(sent) for sent in sentencses]
print("tokenized的第一句话:",tokenized_sents[0])
#将分割后的句子转化成数字  word-->idx
input_ids=[tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_sents]
print("转化后的第一个句子:",input_ids[0])
#PADDING
import keras
input_ids=keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print("Padding 第一个句子:",input_ids[0])
#mask
attention_masks = []
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)
print("第一个attention mask:",attention_masks[0])
#转化成tensor
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)
#生成dataloader
batch_size = 64
test_data = Data.TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = Data.RandomSampler(test_data)
test_dataloader = Data.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

第一句话: [CLS] COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK [SEP]




tokenized的第一句话: ['[CLS]', 'cock', '##su', '##cker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work', '[SEP]']
转化后的第一个句子: [101, 10338, 6342, 9102, 2077, 2017, 18138, 2105, 2006, 2026, 2147, 102]
Padding 第一个句子: [  101 10338  6342  9102  2077  2017 18138  2105  2006  2026  2147   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0]
第一个attention mask: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [37]:
#计算recall
def flat_recall(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return recall_score(labels_flat, pred_flat)

In [38]:
model.eval()
test_loss, test_accuracy, test_recall = 0, 0, 0
nb_test_steps, nb_test_examples = 0, 0
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    tmp_test_recall = flat_recall(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    test_recall += tmp_test_recall
    nb_test_steps += 1
print("Test Accuracy: {}".format(test_accuracy / nb_test_steps))
print("Test Recall: {}".format(test_recall / nb_test_steps))

Test Accuracy: 0.9286024976801732
Test Recall: 0.7146251919136295
