In [None]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint #導入tensorflow
from tensorflow import keras

from sklearn.model_selection import StratifiedKFold, KFold

from kaggle_datasets import KaggleDatasets #採用Kaggle資料集
import transformers

from tokenizers import BertWordPieceTokenizer #分詞器
from tqdm import tqdm #進度條顯示
import numpy as np

#!pip install wandb

#基本模型導入
import os, time
import gc
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from kaggle_datasets import KaggleDatasets

!pip install bert-tensorflow
import bert.tokenization

print(tf.version.VERSION)  # Print the installed TensorFlow version

In [None]:
# Cross-validation setup. The same seed is also passed to train1.sample later on.
seed = 7
n_splits = 5

#kkfold = KFold(n_splits).split(x_train)
# Shuffled stratified splitter with n_splits folds, seeded for repeatable splits.
kfold = StratifiedKFold(n_splits, shuffle=True, random_state=seed)
cvscores = []  # Created empty here; nothing in the visible cells appends to it.

In [None]:
print(transformers.__version__)  # Print the installed transformers version

In [None]:
# Detect a TPU; fall back to TensorFlow's current default strategy when none is found.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    # No TPU resolver: use whatever strategy is currently active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

#在TPU上針對Kaggle用戶運行Bert模型

In [None]:
SEQUENCE_LENGTH = 128  # Not referenced in the visible cells; MAX_LEN is what gets passed to the encoder and model.

# GCS path of the competition dataset, as returned by KaggleDatasets.
DATA_PATH =  KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')
#BERT_PATH = KaggleDatasets().get_gcs_path('bert-multi')
#BERT_PATH_SAVEDMODEL = BERT_PATH + "/bert_multi_from_tfhub"
WEIGHTS_PATH = '../input/jigsaw-weights'  # Only used by a commented-out load_weights call in the visible cells.


OUTPUT_PATH = "/kaggle/working"

In [None]:
# Load the training, validation, test and sample-submission CSV files.
# train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train1 = pd.read_csv("/kaggle/input/jigsawch/666666.csv")  # Training data comes from a separate dataset, not the competition's own train file.
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
# sub2 = pd.read_csv('../input/ensemble/submission.csv')

In [None]:
# Drop every row that has a missing value, then draw a seeded random sample of 100000 rows.
train1 = (
    train1
    .dropna(how='any')
    .sample(n=100000, random_state=seed)
)

In [None]:
print(train1.head())

# One hand-written Chinese example that is added to the end of the test set.
new = pd.DataFrame({'id': ['63812'],
                    'content': ['你是白癡'],
                    'lang': ['zh']})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same re-indexed frame.
test = pd.concat([test, new], ignore_index=True)
print(test.tail())

# BERT Tokenizer

In [None]:
#把文字切割並轉成BERT所需要的編碼

# def get_tokenizer(bert_path=BERT_PATH_SAVEDMODEL):
#     bert_layer = tf.saved_model.load(bert_path)
#     bert_layer = hub.KerasLayer(bert_layer, trainable=False)
#     vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() 
#     cased = bert_layer.resolved_object.do_lower_case.numpy()
#     tf.gfile = tf.io.gfile  
#     tokenizer = bert.tokenization.FullTokenizer(vocab_file, cased)
  
#     return tokenizer

# tokenizer = get_tokenizer()

# Preprocessing

In [None]:
#編碼器，用於將文本編碼為整數序列，以進行BERT輸入

def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode texts into fixed-length rows of token ids, chunk by chunk.

    Args:
        texts: Sliceable sequence of strings. A pandas Series or NumPy array
            (anything whose slices offer ``.tolist()``) as well as a plain
            list or tuple is accepted.
        tokenizer: Object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch`` (a ``BertWordPieceTokenizer`` is what the
            notebook passes in).
        chunk_size: Number of texts handed to each ``encode_batch`` call.
        maxlen: Length passed to both truncation and padding, so every
            encoded row is expected to have this many ids.

    Returns:
        ``np.ndarray`` built from the ``ids`` of every encoding, one row per
        input text, in input order.

    Side effects:
        Truncation and padding are enabled on the tokenizer that is passed in.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Series / ndarray slices convert via .tolist(); plain sequences have
        # no such method and are simply copied into a list.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
# Training configuration constants

AUTO = tf.data.experimental.AUTOTUNE  # Passed to .prefetch() in the dataset pipelines


# Settings
EPOCHS = 5  # Number of epochs passed to model.fit
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # Batch size: 16 times the number of replicas in sync
MAX_LEN = 192  # Passed as maxlen to fast_encode and as max_len to build_model

In [None]:
# Load the pretrained multilingual cased DistilBERT tokenizer.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

tokenizer.save_pretrained('.')  # Save the tokenizer files into the current directory

# Build a `tokenizers` BertWordPieceTokenizer from vocab.txt (presumably the file
# written by save_pretrained above); lowercase=False means the input is not lowercased.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

In [None]:
# Encode the text column of each split into arrays of token ids of length MAX_LEN.
x_train = fast_encode(train1.comment_text.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_valid = fast_encode(valid.comment_text.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(test.content.astype(str), fast_tokenizer, maxlen=MAX_LEN)  # The test frame stores its text in `content`, not `comment_text`.

# Labels are taken from the `toxic` column.
y_train = train1.toxic.values
y_valid = valid.toxic.values

In [None]:
#訓練BERT模型

def build_model(transformer, max_len=512):
    """Build and compile a single-output classifier on top of a transformer.

    Args:
        transformer: Callable model applied to the token-id input. Its first
            output is indexed as ``[:, 0, :]``, so it is assumed to be shaped
            (batch, sequence, hidden) — TODO confirm for other encoders.
        max_len: Length of the int32 token-id input.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 1e-5), binary
        cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the vector at sequence position 0 as the feature for the classifier head.
    cls_token = sequence_output[:, 0, :]

    # Head: three ReLU dense layers followed by one sigmoid unit.
    out = Dense(300, activation='relu')(cls_token)
    out = Dense(128, activation='relu')(out)
    out = Dense(128, activation='relu')(out)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    # `lr` is a deprecated alias of `learning_rate` and is rejected by newer Keras releases.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Build the tf.data pipelines for the encoded arrays.

# Training: repeat indefinitely, shuffle with a 2048-element buffer, batch, prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation: batch, cache, prefetch.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test: inputs only, batched.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

In [None]:
# Row positions 0..99999 as a list; 100000 is the same n that train1.sample is called with.
alldatalen = list(range(100000))
# print(alldatalen)

In [None]:
foldlist = []
newfoldtrain = []
newfoldval = []
# kfold.split yields a pair of index arrays per split (train indices, test
# indices); only the second array of each split is kept as that fold.
for rest_idx, fold_idx in kfold.split(x_train, y_train):
    foldlist.append(fold_idx)
    print(len(rest_idx))
# Iterate over the folds actually produced instead of a hard-coded 5, so this
# stays correct when n_splits is changed.
for i in range(len(foldlist)):
    # Fitting indices: this fold plus the previous one (i - 1 == -1 wraps to the last fold).
    newfoldtrain.append(np.concatenate([foldlist[i], foldlist[i - 1]]))
    # Validation indices: every row position that is not in the fitting set.
    newfoldval.append(np.setdiff1d(alldatalen, newfoldtrain[i]))
    print(len(newfoldtrain[i]))
    print(len(newfoldval[i]))

In [None]:
%%time
# Create the pretrained encoder and the classifier inside the distribution strategy's scope.
with strategy.scope():
    transformer_layer = transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

In [None]:
train_history_list = []  # Collects the object returned by each model.fit call in the fold loop

In [None]:
# Debug output: for every split, show both index arrays, their lengths and their types.
for a,b in kfold.split(x_train, y_train):
    print(a,b)
    print(len(a),len(b))
    print(type(a),type(b))

In [None]:
# %%time
# for i in range(5):
#     x_s_train = x_train[newfoldtrain[i]]
#     y_s_train = y_train[newfoldtrain[i]]
#     train_dataset = (
#         tf.data.Dataset
#         .from_tensor_slices((x_s_train, y_s_train))
#         .repeat() #重複數據集count次數
#         .shuffle(2048) #隨機混洗數據集多元素
#         .batch(BATCH_SIZE) #將數據集多連續元素合成批次
#         .prefetch(AUTO)#將一部分內存加載到cache裡面
#     )
#     x_s_valid = x_train[newfoldval[i]]
#     y_s_valid = y_train[newfoldval[i]]
#     valid_dataset =(
#         tf.data.Dataset
#         .from_tensor_slices((x_s_valid, y_s_valid))
#         .batch(BATCH_SIZE)
#         .cache()
#         .prefetch(AUTO)
#     )
#     n_steps = x_s_train.shape[0] // BATCH_SIZE #讀取矩陣第一維度的長度
#     train_history = model.fit(
#         train_dataset,
#         steps_per_epoch=n_steps,
#         validation_data=valid_dataset,
#         epochs=EPOCHS,
#     ) # 使用model.fit()執行訓練過程
#     train_history_list.append(train_history)
#     print("-----------------------------------------------------------")

In [None]:
%%time
for rest_idx, fold_idx in kfold.split(x_train, y_train):
    # kfold.split yields (train indices, test indices). This loop fits on the
    # second array and validates on the first.
    # NOTE(review): that is the reverse of the usual K-fold roles — confirm it is intended.
    # NOTE(review): the same `model` object is fitted in every iteration, so its
    # weights carry over from one fold to the next.
    x_s_train = x_train[fold_idx]
    y_s_train = y_train[fold_idx]
    # Fold-local dataset names: `train_dataset` / `valid_dataset` are the
    # full-data pipelines that the final training cell fits on, so they must
    # not be rebound inside this loop.
    fold_train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_s_train, y_s_train))
        .repeat()       # repeat indefinitely; steps_per_epoch bounds each epoch
        .shuffle(2048)  # shuffle with a 2048-element buffer
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    x_s_valid = x_train[rest_idx]
    y_s_valid = y_train[rest_idx]
    fold_valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_s_valid, y_s_valid))
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )
    # Number of full batches in this fold's fitting subset.
    n_steps = x_s_train.shape[0] // BATCH_SIZE
    train_history = model.fit(
        fold_train_dataset,
        steps_per_epoch=n_steps,
        validation_data=fold_valid_dataset,
        epochs=EPOCHS,
    )
    train_history_list.append(train_history)
    print("-----------------------------------------------------------")

In [None]:
model.summary()  # Print the layer-by-layer summary of the model

In [None]:
# Print the raw metrics dictionary recorded by every fit call in the fold loop.
for history in train_history_list:
    print(history.history)

In [None]:
import statistics

# One value per recorded fit: the mean of that run's per-epoch validation metric.
his_val_loss = [
    statistics.mean(run.history['val_loss']) for run in train_history_list
]
his_val_accuracy = [
    statistics.mean(run.history['val_accuracy']) for run in train_history_list
]

# The mean of those per-run means is stored as the last element of the same list.
his_val_loss.append(statistics.mean(his_val_loss))
his_val_accuracy.append(statistics.mean(his_val_accuracy))

print(his_val_loss[-1])
print(his_val_accuracy[-1])

In [None]:
# model.load_weights(WEIGHTS_PATH+"/weights.h5")

# Stage 1: fit on `train_dataset` while validating on `valid_dataset`.
# NOTE(review): the step count is derived from the full x_train, so both names
# are presumably expected to refer to the full-data pipelines — verify that no
# earlier cell has rebound them.
n_steps = x_train.shape[0] // BATCH_SIZE  # Number of full batches in x_train
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS,
)

# Stage 2: continue fitting on the repeated validation pipeline, without
# validation data, for twice as many epochs.
n_steps = x_valid.shape[0] // BATCH_SIZE  # Number of full batches in x_valid
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2,
)

# Save the weights to weights.h5 (relative path).
model.save_weights("weights.h5")

In [None]:
# print(train_history)
# print(train_history_2)

In [None]:
# model.predict() returns numeric values — presumably the predicted probability
# of the toxic class, given the sigmoid output. Prediction is currently disabled:
# the dataset construction is wrapped in a string and the calls are commented out.

'''test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)'''

# sub['toxic'] = model.predict(test_dataset, verbose=1)

# sub1 = sub[['id', 'toxic']]

In [None]:
# model.load_weights("weights.h5")