In [1]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers

from tokenizers import BertWordPieceTokenizer  #Bert的分词器
from tqdm import tqdm
import numpy as np

!pip install wandb #可视化模型损失的包

import os, time
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from kaggle_datasets import KaggleDatasets  #在kaggle使用tpu的接口

# We'll use a tokenizer for the BERT model from the modelling demo notebook.
!pip install bert-tensorflow 
import bert.tokenization

print(tf.version.VERSION)

Collecting bert-tensorflow
  Downloading bert_tensorflow-1.0.4-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.1 MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.4
2.4.1


In [2]:
# Detect the available hardware and pick the matching distribution strategy.
try:
    # TPUClusterResolver() needs no arguments when the TPU_NAME environment
    # variable is set (always the case on Kaggle). A ValueError from the
    # resolver is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Use the stable endpoint: tf.distribute.experimental.TPUStrategy is a
    # deprecated alias of tf.distribute.TPUStrategy in TF 2.3+.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [3]:
# Output directory (the Kaggle working dir).
OUTPUT_PATH = "/kaggle/working"

# GCS locations of the attached Kaggle datasets, resolved via KaggleDatasets.
DATA_PATH = KaggleDatasets().get_gcs_path(
    'jigsaw-multilingual-toxic-comment-classification'  # competition data
)
BERT_PATH = KaggleDatasets().get_gcs_path('bert-multi')  # multilingual BERT files
BERT_PATH_SAVEDMODEL = f"{BERT_PATH}/bert_multi_from_tfhub"

In [4]:
# Read the input CSVs: training data, validation data, test data, the sample
# submission, and a previously produced ensemble submission.
train1, valid, test, sub, sub2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv",
        '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv',
        '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv',
        '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv',
        '../input/ensemble/submission.csv',
    )
)

# BERT Tokenizer

# Preprocessing

In [5]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode texts into a 2-D array of token ids for BERT input.

    Args:
        texts: Sequence of strings. Anything exposing ``tolist()`` (pandas
            Series, numpy array) or any plain iterable (list, tuple) works.
        tokenizer: Object providing ``enable_truncation(max_length=...)``,
            ``enable_padding(length=...)`` and ``encode_batch(list_of_str)``
            whose results carry an ``ids`` attribute. Its truncation and
            padding settings are changed by this call.
        chunk_size: Number of texts passed to ``encode_batch`` per call.
        maxlen: Length every sequence is truncated / padded to.

    Returns:
        ``np.ndarray`` with one row of token ids per input text. An empty
        input yields an integer array of shape ``(0, maxlen)``.
    """
    tokenizer.enable_truncation(max_length=maxlen)  # truncate long inputs
    tokenizer.enable_padding(length=maxlen)         # pad short inputs

    # Materialise the input once as a plain Python list so chunking is purely
    # positional and does not depend on the container type or its index.
    texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    if not texts:
        # np.array([]) would be 1-D float; keep the 2-D integer layout instead.
        return np.zeros((0, maxlen), dtype=int)

    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        encs = tokenizer.encode_batch(texts[start:start + chunk_size])
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [6]:
# Important configuration values shared by the data pipeline and the model.

# Training hyper-parameters.
MAX_LEN = 192  # maximum sequence length
EPOCHS = 5
# Global batch size: 16 examples for each replica reported by the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

# tf.data autotune sentinel, passed to prefetch() so the buffer size is
# chosen dynamically at runtime.
AUTO = tf.data.experimental.AUTOTUNE

In [7]:
# First load the real tokenizer for the 'distilbert-base-multilingual-cased'
# checkpoint via the transformers library.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')  # cased, distilled BERT, multilingual model
# Save the loaded tokenizer into the current directory; this presumably
# produces the 'vocab.txt' file that is read back below.
tokenizer.save_pretrained('.')   # save locally
# Reload it with the huggingface tokenizers library; lowercase=False keeps the
# input casing, consistent with the cased checkpoint name.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Bare expression: shows the tokenizer repr as the cell output.
fast_tokenizer

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

In [8]:
# Encode the text column of each split into token-id arrays of width MAX_LEN.
# Values are cast to str first so every entry handed to the tokenizer is a string.
x_train, x_valid, x_test = (
    fast_encode(text_col.astype(str), fast_tokenizer, maxlen=MAX_LEN)
    for text_col in (train1['comment_text'], valid['comment_text'], test['content'])
)

# Toxicity labels for the two labelled splits.
y_train, y_valid = train1['toxic'].values, valid['toxic'].values

100%|██████████| 874/874 [00:36<00:00, 23.92it/s]
100%|██████████| 32/32 [00:01<00:00, 26.90it/s]
100%|██████████| 250/250 [00:10<00:00, 23.49it/s]


In [9]:
# Training pipeline. shuffle() is placed BEFORE repeat() so that every example
# of one pass is emitted before the next pass starts; with repeat() first, the
# shuffle buffer mixes examples across epoch boundaries. The stream is still
# infinite, so batches are always full-sized and steps_per_epoch must be
# passed to fit().
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(2048)      # shuffle buffer of 2048 examples
    .repeat()           # repeat the (reshuffled) sequence indefinitely
    .batch(BATCH_SIZE)  # group consecutive elements into batches
    .prefetch(AUTO)     # prepare upcoming batches while the accelerator trains
)

# Validation pipeline: fixed order, batched once and cached.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: inputs only (no labels), kept in file order for prediction.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)     # overlap input preparation with inference
)

In [10]:
def build_model(transformer, max_len=512, learning_rate=1e-5):
    """
    Build and compile a binary classifier on top of a transformer encoder.

    Args:
        transformer: Callable Keras-compatible model; element ``[0]`` of its
            output is indexed as ``[:, 0, :]``, i.e. it is treated as a
            rank-3 tensor of per-position hidden states.
        max_len: Fixed length of the integer token-id input.
        learning_rate: Learning rate for the Adam optimizer. Defaults to the
            previously hard-coded ``1e-5``.

    Returns:
        A compiled ``Model`` mapping ``input_word_ids`` of shape
        ``(batch, max_len)`` to a single sigmoid output, trained with binary
        cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    # Use the hidden state at position 0 as the summary of the whole sequence.
    cls_token = sequence_output[:, 0, :]

    # Small fully-connected head: 192 -> 64 -> 64 -> 1.
    out = Dense(192, activation='relu')(cls_token)
    out = Dense(64, activation='relu')(out)
    out = Dense(64, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [11]:
%%time
# Create the pretrained encoder and the classifier inside the strategy scope
# so their variables are created under the selected distribution strategy.
with strategy.scope():
    # Distilled, cased, multilingual BERT encoder.
    transformer_layer = transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

model.summary()

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/911M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).


Cause: while/else statement not yet supported


The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist TFBaseModelOutput(last_hi 134734080 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 192)               147648    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                12352     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65    

In [12]:
# Train on the training set and validate on the validation set.
# The number of steps per epoch is given explicitly: one pass over x_train
# in whole batches.
n_steps = len(x_train) // BATCH_SIZE
train_history = model.fit(
    x=train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

Epoch 1/5


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# Continue training on the validation set.
# The dataset is repeated here, so the epoch length is again given explicitly
# as the number of whole batches in x_valid.
n_steps = len(x_valid) // BATCH_SIZE
train_history_2 = model.fit(
    x=valid_dataset.repeat(),
    epochs=2 * EPOCHS,
    steps_per_epoch=n_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Predict on the test pipeline and store the result as the 'toxic' column of
# the sample-submission frame. The model ends in a single sigmoid unit, so
# predict() yields one value per test row.
sub['toxic'] = model.predict(test_dataset, verbose=1)
# (disabled) write this model's predictions directly as the submission:
#sub.to_csv('submission.csv', index=False)

# Keep only the id and prediction columns.
sub1 = sub[['id', 'toxic']]

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




In [15]:
# Blend this model's predictions (sub1) with the external ensemble (sub2).
# Rebind the renamed frames instead of using inplace=True: sub1 is a column
# subset of `sub`, and renaming such a slice in place can raise
# SettingWithCopyWarning and mutates frames that earlier cells produced.
sub1 = sub1.rename(columns={'toxic': 'toxic1'})
sub2 = sub2.rename(columns={'toxic': 'toxic2'})

# Left join on id keeps every row of sub1; ids missing from sub2 would give
# NaN in 'toxic2'.
sub3 = pd.merge(sub1, sub2, how='left', on='id')

# Two-stage weighted average; the net weights are 0.065 * toxic1 + 0.935 * toxic2.
sub3['toxic'] = (sub3['toxic1'] * 0.1) + (sub3['toxic2'] * 0.9) #blend 1
sub3['toxic'] = (sub3['toxic2'] * 0.35) + (sub3['toxic'] * 0.65) #blend 2

# Write the final submission with only the id and blended score.
sub3[['id', 'toxic']].to_csv('submission.csv', index=False)