In [None]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers

from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
import numpy as np

!pip install wandb

import os, time
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from kaggle_datasets import KaggleDatasets

# We'll use a tokenizer for the BERT model from the modelling demo notebook.
!pip install bert-tensorflow
import bert.tokenization

print(tf.version.VERSION)

In [None]:
# Choose the tf.distribute strategy for this session: a TPU strategy when a
# TPU cluster can be resolved, otherwise TensorFlow's default strategy.
try:
    # Called without arguments, the resolver relies on the TPU_NAME environment
    # variable (always set on Kaggle, per the original note).
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved; fall through to the default strategy below.
    tpu = None

if not tpu:
    # Default strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # The TPU system has to be connected and initialised before the strategy
    # object is created.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Local directory for notebook outputs.
OUTPUT_PATH = "/kaggle/working"

# Sequence length constant (the encoding cells below use MAX_LEN instead).
SEQUENCE_LENGTH = 128

# GCS locations of the attached Kaggle datasets.
DATA_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')
BERT_PATH = KaggleDatasets().get_gcs_path('bert-multi')
# SavedModel directory inside the BERT dataset; default input of get_tokenizer().
BERT_PATH_SAVEDMODEL = f"{BERT_PATH}/bert_multi_from_tfhub"

In [None]:
# Load the competition CSV files from the Kaggle input directory:
# training comments, validation set, test set and the sample submission.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
# A submission file from the separate "ensemble" input; it is blended with
# this notebook's own predictions in the final cell.
sub2 = pd.read_csv('../input/ensemble/submission.csv')

# BERT Tokenizer

In [None]:
def get_tokenizer(bert_path=BERT_PATH_SAVEDMODEL):
    """Build a ``bert.tokenization.FullTokenizer`` from a BERT SavedModel.

    The tokenizer splits text into word pieces and maps them to the integer
    ids BERT expects. Both the vocabulary file and the lower-casing flag are
    read from the loaded model rather than being hard-coded.

    Args:
        bert_path: Location of the SavedModel to load.

    Returns:
        The configured ``FullTokenizer``.
    """
    saved_model = tf.saved_model.load(bert_path)
    keras_layer = hub.KerasLayer(saved_model, trainable=False)

    # Vocabulary asset path and casing flag, as stored with the model.
    vocab_path = keras_layer.resolved_object.vocab_file.asset_path.numpy()
    lower_case = keras_layer.resolved_object.do_lower_case.numpy()

    # Alias the TF2 file API under the old name; presumably required because
    # bert.tokenization's vocabulary loader still refers to tf.gfile.
    tf.gfile = tf.io.gfile

    return bert.tokenization.FullTokenizer(vocab_path, lower_case)


# Note: `tokenizer` is rebound to a DistilBERT tokenizer in a later cell.
tokenizer = get_tokenizer()

# Preprocessing

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode texts into fixed-length sequences of integer ids for BERT input.

    Parameters
    ----------
    texts : sliceable sequence of str
        A pandas Series or NumPy array (as before), or a plain list/tuple.
    tokenizer : object
        Must provide ``enable_truncation(max_length=...)``,
        ``enable_padding(length=...)`` and ``encode_batch(list_of_str)``
        returning items with an ``ids`` attribute (this notebook passes a
        ``BertWordPieceTokenizer``).
    chunk_size : int
        Number of texts handed to ``encode_batch`` per call.
    maxlen : int
        Length that truncation and padding are configured to.

    Returns
    -------
    numpy.ndarray
        One row of ids per input text. For empty input the result has shape
        ``(0, maxlen)`` so downstream code always sees a 2-D array.

    Notes
    -----
    The tokenizer is modified in place: truncation and padding stay enabled
    on it after this function returns.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Series / ndarray slices expose .tolist(); plain lists and tuples do not.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    if not all_ids:
        # np.array([]) would be a 1-D float array; keep the (rows, maxlen) layout.
        return np.empty((0, maxlen), dtype=int)

    return np.array(all_ids)

In [None]:
# Key settings for the input pipeline and for training

# Training configuration
MAX_LEN = 192  # token length used for encoding and for the model input
EPOCHS = 5  # number of passes over the training data
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 per replica, e.g. 16 * 8 with 8 replicas

# Experimental tf.data constant, passed to prefetch() in the input pipelines.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# First load the real tokenizer. This rebinds `tokenizer`, replacing the
# value assigned from get_tokenizer() in an earlier cell.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally; the next step reads 'vocab.txt' from the
# current directory.
tokenizer.save_pretrained('.')
# Reload it with the Hugging Face `tokenizers` library; this is the tokenizer
# object handed to fast_encode().
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Last expression of the cell: displays the tokenizer's repr.
fast_tokenizer

In [None]:
# Targets: the `toxic` column of the two labelled splits.
y_train = train1['toxic'].values
y_valid = valid['toxic'].values

# Inputs: token ids of length MAX_LEN. The train/validation text lives in
# `comment_text`, while the test split keeps it in `content`.
x_train = fast_encode(train1['comment_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_valid = fast_encode(valid['comment_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(test['content'].astype(str), fast_tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: repeat indefinitely -> shuffle (2048-element buffer)
# -> batch -> prefetch. Because it never ends, fit() needs steps_per_epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batch once, cache the batches, prefetch.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in file order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
def build_model(transformer, max_len=512):
    """
    Build and compile a classifier head on top of a transformer encoder.

    Parameters
    ----------
    transformer : callable Keras layer/model
        Called on the int32 token-id input; element ``[0]`` of its output is
        used as the per-token sequence output.
    max_len : int
        Length of the token-id input sequence.

    Returns
    -------
    tensorflow.keras.Model
        Maps ``input_word_ids`` to a single sigmoid output, compiled with
        Adam (learning rate 1e-5), binary cross-entropy loss and accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]

    # Keep only the vector at sequence position 0 as the summary of the input.
    cls_token = sequence_output[:, 0, :]
    out = Dense(192, activation='relu')(cls_token)
    out = Dense(64, activation='relu')(out)
    out = Dense(64, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
with strategy.scope(): #表明了將要分散式執行的程式碼塊
    transformer_layer = (
        transformers.TFDistilBertModel #Bert的簡化版本
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# The training dataset repeats forever, so an "epoch" is defined here as the
# number of full batches that fit in the training rows.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

In [None]:
# Second stage: keep training the same model on the validation data for twice
# as many epochs. `n_steps` is rebound to the number of full validation batches.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    epochs=EPOCHS * 2,
    steps_per_epoch=n_steps,
)

In [None]:
# Score the test set and store the predictions in the sample-submission frame.
test_predictions = model.predict(test_dataset, verbose=1)
sub['toxic'] = test_predictions

# Keep just the two columns needed for blending in the next cell.
sub1 = sub[['id', 'toxic']]

In [None]:
# Give each prediction column its own name so both survive the merge.
sub1 = sub1.rename(columns={'toxic': 'toxic1'})
sub2 = sub2.rename(columns={'toxic': 'toxic2'})
sub3 = sub1.merge(sub2, how='left', on='id')  # left join on id: keeps every row of sub1

# Blend 1: 10% this model, 90% the external submission.
blend_1 = (sub3['toxic1'] * 0.1) + (sub3['toxic2'] * 0.9)
# Blend 2: mix the external submission back in with the first blend.
sub3['toxic'] = (sub3['toxic2'] * 0.39) + (blend_1 * 0.61)

sub3[['id', 'toxic']].to_csv('submission.csv', index=False)