## About this notebook
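This notebook fine-tunes a Chinese RoBERTa model for 3-class stance classification (Against / Support / Neutral), mainly using 5-fold cross-validation (主要采用五折交叉验证) and ensembling the per-fold predictions.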

In [None]:
!pip install xlrd==1.2.0

In [None]:
!pip install transformers==3.5.0

In [None]:
import os
import jieba
import numpy as np 
import tensorflow_addons as tfa
from tensorflow_addons.optimizers import AdamW
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input,Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import copy
import seaborn as sn
import random
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
from string import digits, punctuation
import re
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.models import Model
from sklearn.metrics import confusion_matrix,accuracy_score
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import AutoModel
import matplotlib.pyplot as plt
from kaggle_datasets import KaggleDatasets
import transformers
from sklearn.metrics import classification_report
import logging
import csv
from transformers import BertTokenizer,BertModel,BertConfig,BertForPreTraining,TFAutoModelWithLMHead
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import f1_score,confusion_matrix,precision_score,recall_score
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import pandas as pd
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
transformers.__version__

## Helper Functions

In [None]:
def seed_everything(seed=0):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and TensorFlow, and exports the
    ``PYTHONHASHSEED`` / ``TF_DETERMINISTIC_OPS`` environment variables.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE(review): hash randomisation is normally fixed at interpreter start-up,
    # so this presumably only reaches subprocesses - confirm if it matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'


# Global seed; also passed to the KFold splitter and used by get_train_dataset.
seed = 1024
seed_everything(seed)

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode ``texts`` chunk by chunk and return the token ids as one array.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: Sliceable collection whose slices provide ``.tolist()``.
        tokenizer: Object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; truncation and padding are both set to
            ``maxlen`` on it.
        chunk_size: Number of texts handed to ``encode_batch`` per call.
        maxlen: Length used for truncation and padding.

    Returns:
        ``np.ndarray`` built from the ``ids`` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    encoded_rows = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        encoded_rows += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_rows)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize ``texts`` into a fixed-width array of input ids.

    Args:
        texts: Iterable of strings (for example a pandas Series).
        tokenizer: Hugging Face tokenizer providing ``batch_encode_plus``.
        maxlen: Every sequence is truncated or padded to exactly this length.

    Returns:
        ``np.ndarray`` of the ``input_ids`` returned by the tokenizer.
    """
    enc_di = tokenizer.batch_encode_plus(
        # A plain list is accepted by both slow and fast tokenizers.
        list(texts),
        return_token_type_ids=False,
        # Explicit replacements for the deprecated ``pad_to_max_length=True``,
        # which relied on truncation being switched on implicitly.
        padding='max_length',
        truncation=True,
        max_length=maxlen
    )

    return np.array(enc_di['input_ids'])

In [None]:

# Classifier head on the first ([CLS]) position of the transformer output.
def build_model(transformer, max_len=512):
    """Wrap ``transformer`` in a compiled 3-class Keras classifier.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: Callable model; element ``[0]`` of its output is indexed as
            a rank-3 tensor (presumably the last hidden states - confirm).
        max_len: Length of the integer token-id input.

    Returns:
        A compiled ``Model`` mapping token ids to a 3-way softmax, trained with
        label-smoothed categorical cross-entropy and reporting accuracy and
        weighted F1.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only position 0 of the sequence axis.
    cls_token = sequence_output[:, 0, :]
    out = Dense(3, activation='softmax')(cls_token)

    # Built once (the model used to be constructed twice in a row).
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(
        Adam(learning_rate=5e-6),
        loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
        metrics=['accuracy', tfa.metrics.F1Score(num_classes=3, average='weighted')],
    )

    return model

In [None]:
#只使用最后一层的cls_token
# def build_model(transformer, max_len=512):
#     """
#     https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
#     """
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     res = transformer(input_word_ids,output_hidden_states = True)
#     #hideden_state一共12个，每一层的
#     sequence_output, hidden_state = res[0],res[2]
#     cls_token = sequence_output[:, 0, :]#最后一层的cls token
#     out = Dense(2, activation='softmax')(cls_token)
    
#     model = Model(inputs=input_word_ids, outputs=out)
#     model.compile(Adam(lr=5e-6), loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing = 0.01), metrics=['accuracy',tfa.metrics.F1Score(num_classes=2,average='weighted')])
    
#     return model

## TPU Configs

In [None]:
# Detect hardware and pick the matching tf.distribute strategy.
# Boilerplate needed to use the TPU on Kaggle.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: continue without a TPU.
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# BATCH_SIZE is later scaled by this replica count.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Prefetch buffer size placeholder used by the tf.data pipelines.
AUTO = tf.data.experimental.AUTOTUNE

# 5-fold cross-validation
kfold = KFold(n_splits=5, random_state=seed, shuffle=True)  # shuffled, seeded splitter
# Configuration
EPOCHS = 1
# Global batch size: 32 samples per replica.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
# Token length every text is padded/truncated to.
MAX_LEN = 140
# Flags for optional extra training data (read by the training loop).
use_external1 = False
use_external2 = False
use_pseudo = False
# When True, df_test is appended to df_train before the folds are built.
use_valid = True
DISPLAY_PLOT = True
# MODEL = 'roberta-base'
# # "roberta-base","roberta-large","bert-base-uncased","ernie-2.0-en"

In [None]:
# Tab-separated files without a header row; QUOTE_NONE keeps quote characters
# inside the texts as literal characters.
df_train = pd.read_csv('/kaggle/input/nlpcc-track1-dataset/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
df_train.columns = ['text_a', 'text_b', 'labels']
# The test file is read with the same three-column layout, labels included.
df_test = pd.read_csv('/kaggle/input/nlpcc-track1-dataset/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
df_test.columns = ['text_a', 'text_b', 'labels']

In [None]:
# Data preprocessing
def fake_data_process(data):
    """Encode the stance labels and build the model input text, in place.

    ``labels`` values "Against", "Support" and "Neutral" become 0, 1 and 2;
    any other value is left unchanged. A ``content`` column is set to
    ``text_a + '。' + text_b`` (appended as the last column when missing).

    Args:
        data: DataFrame with ``text_a``, ``text_b`` and ``labels`` columns.
            It is modified in place; nothing is returned.
    """
    label_to_id = {"Against": 0, "Support": 1, "Neutral": 2}
    # Whole-column assignment replaces the per-row chained writes
    # (``data[col][i] = ...``), which depended on a 0..n-1 index and are not
    # guaranteed by pandas to modify the frame.
    data['labels'] = data['labels'].map(lambda label: label_to_id.get(label, label))
    # Plain assignment (instead of ``insert``) keeps the function re-runnable.
    data['content'] = data['text_a'] + '。' + data['text_b']

In [None]:
# Both frames are modified in place: labels are encoded and `content` is added.
fake_data_process(df_train)
fake_data_process(df_test)

In [None]:
df_train

In [None]:
# Per-class loss weights passed to model.fit (0=Against, 1=Support, 2=Neutral).
# NOTE(review): how these values were derived is not shown here - confirm.
class_weight = {0: 3.1,1: 3.9,2:3.0}

In [None]:
# When use_valid is set, the labelled test split is appended to the training data.
# NOTE(review): df_test is later also used to score each fold model and the
# ensembles, so those scores are then measured on data seen during training.
if (use_valid):
    df_train = pd.concat([df_train,df_test],ignore_index=True)

In [None]:
df_train

In [None]:
# fake_train1.label.value_counts()

## Build datasets objects

In [None]:
def get_train_dataset(x_data,y_data):
    """Build the endlessly repeating, shuffled training pipeline.

    The order is shuffle -> repeat -> batch -> prefetch:
    * samples (not whole batches) are shuffled, with a buffer covering the full
      data set and the notebook-level ``seed`` passed as the shuffle *seed*
      (it used to be passed as the buffer size);
    * no ``cache()`` is applied, because caching after an infinite ``repeat()``
      can never finish and only accumulates elements in memory.

    Args:
        x_data: Array-like of encoded inputs (must support ``len``).
        y_data: Array-like of targets aligned with ``x_data``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` indefinitely;
        callers must bound an epoch with ``steps_per_epoch``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((x_data, y_data))
    dataset = dataset.shuffle(buffer_size=len(x_data), seed=seed)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

def get_valid_dataset(x_data,y_data):
    """Build the single-pass evaluation pipeline: batch, cache, prefetch.

    Args:
        x_data: Encoded inputs.
        y_data: Targets aligned with ``x_data``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` in input order.
    """
    return (
        tf.data.Dataset
        .from_tensor_slices((x_data, y_data))
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

In [None]:
# fake_train_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices((x_fake_train, y_fake_train))
#     .repeat()
#     .shuffle(2048)
#     .batch(BATCH_SIZE)
#     .prefetch(AUTO)
# )

# fake_valid_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices((x_fake_valid, y_fake_valid))
#     .batch(BATCH_SIZE)
#     .cache()
#     .prefetch(AUTO)
# )

In [None]:
def lrfn1(epoch):
    """Learning rate for ``epoch``: linear warm-up, plateau, exponential decay.

    Epochs 0-5 ramp linearly from 1e-5 towards 5e-5, epochs 6-8 stay at 5e-5,
    and from epoch 9 on the rate decays by a factor of 0.4 per epoch towards
    1e-6.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate as a float.
    """
    lr_start, lr_max, lr_min = 0.00001, 0.00005, 0.000001
    rampup_epochs, sustain_epochs = 6, 3
    exp_decay = .4

    if epoch >= rampup_epochs + sustain_epochs:
        # Decay phase.
        return (lr_max - lr_min) * exp_decay**(epoch - rampup_epochs - sustain_epochs) + lr_min
    if epoch >= rampup_epochs:
        # Plateau phase.
        return lr_max
    # Warm-up phase.
    return (lr_max - lr_start) / rampup_epochs * epoch + lr_start

In [None]:
# Preview the lrfn1 schedule over the configured number of epochs.
rng = [i for i in range(EPOCHS)]
y_s = [lrfn1(x) for x in rng]
print(y_s)
plt.plot(rng, y_s)
# Summary line: first value, maximum, last value.
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y_s[0], max(y_s), y_s[-1]))

In [None]:
def lrfn2(epoch):
    """Learning rate for ``epoch``; like ``lrfn1`` but starting from 5e-6.

    Epochs 0-5 ramp linearly from 5e-6 towards 5e-5, epochs 6-8 stay at 5e-5,
    and from epoch 9 on the rate decays by a factor of 0.4 per epoch towards
    1e-6.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate as a float.
    """
    LR_START = 0.000005
    LR_MAX = 0.00005
    LR_MIN = 0.000001  # defined once; it used to be assigned twice
    LR_RAMPUP_EPOCHS = 6
    LR_SUSTAIN_EPOCHS = 3
    LR_EXP_DECAY = .4
    if epoch < LR_RAMPUP_EPOCHS:
        lr = (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    elif epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        lr = LR_MAX
    else:
        lr = (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
    return lr

In [None]:
# Preview the lrfn2 schedule over the configured number of epochs.
rng = [i for i in range(EPOCHS)]
y_s = [lrfn2(x) for x in rng]
print(y_s)
plt.plot(rng, y_s)
# Summary line: first value, maximum, last value.
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y_s[0], max(y_s), y_s[-1]))
# NOTE(review): the training loop assigns lr_warm_up again (with lrfn1) for
# every fold, so this lrfn2-based callback is not the one passed to fit.
lr_warm_up = tf.keras.callbacks.LearningRateScheduler(lrfn2, verbose=1)

## Load model into the TPU

## Train Model

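For each of the five folds we fine-tune `hfl/chinese-roberta-wwm-ext-large` on the training split, keep the weights with the best validation F1 score, and collect that model's predictions on the test set for ensembling.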

In [None]:
preds = []    # per-fold class probabilities predicted for df_test
weights = []  # per-fold validation accuracy, used later as ensemble weights
# Pretrained checkpoint used for each fold (currently the same one five times).
models = ["hfl/chinese-roberta-wwm-ext-large"] * 5
for fold,(train,valid) in enumerate(kfold.split(df_train.content,df_train.labels)):
    print('#### FOLD',fold+1)
    x_train, x_valid = df_train.content[train], df_train.content[valid]
    y_train, y_valid = df_train.labels[train], df_train.labels[valid]
    # Optional extra training data.
    # NOTE(review): df_pseudo_cleaned and fake_external2 are not defined in the
    # cells visible here; these branches need them loaded before the flags are
    # switched on.
    if (use_pseudo):
        print("use pseudo")
        x_train = pd.concat([x_train, df_pseudo_cleaned.tweet]).reset_index(drop=True)
        y_train = pd.concat([y_train, df_pseudo_cleaned.label]).reset_index(drop=True)
    if (use_external2):
        x_train = pd.concat([x_train, fake_external2.tweet]).reset_index(drop=True)
        y_train = pd.concat([y_train, fake_external2.label]).reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(models[fold])

    x_fake_train = regular_encode(x_train,tokenizer, maxlen=MAX_LEN)
    x_fake_valid = regular_encode(x_valid,tokenizer,maxlen=MAX_LEN)

    # One-hot targets for the 3-way softmax.
    y_fake_train = to_categorical(y_train,3,dtype='int32')
    y_fake_valid = to_categorical(y_valid,3,dtype='int32')

    train_dataset = get_train_dataset(x_fake_train,y_fake_train )
    valid_dataset = get_valid_dataset(x_fake_valid,y_fake_valid)

    # The training dataset repeats forever, so an epoch is bounded by n_steps;
    # keep at least one step even when the fold is smaller than a batch.
    n_steps = max(1, x_fake_train.shape[0] // BATCH_SIZE)

    # BUILD MODEL
    K.clear_session()
    if tpu:
        # Re-initialise the TPU so every fold starts from a clean device state.
        tf.tpu.experimental.initialize_tpu_system(tpu)
    with strategy.scope():
        transformer_layer = TFAutoModel.from_pretrained(models[fold])
        model = build_model(transformer_layer, max_len=MAX_LEN)

    # Keep only the weights of the epoch with the best validation F1 score.
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_f1_score', verbose=0, save_best_only=True,save_weights_only=True,
        mode='max', save_freq= 'epoch' )
    # NOTE(review): lr_warm_up below sets the learning rate at the start of every
    # epoch, which presumably overrides any reduction made here - confirm.
    cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
        monitor = 'val_accuracy', factor = 0.1, patience = 2, verbose = 2, min_delta = 0.0001, mode = 'max')
    # Every fold uses the lrfn1 schedule (both branches of the former per-fold
    # if/else selected it).
    lr_warm_up = tf.keras.callbacks.LearningRateScheduler(lrfn1, verbose=1)

    train_history = model.fit(
        train_dataset,
        steps_per_epoch=n_steps,
        callbacks = [cb_lr_schedule,sv,lr_warm_up],
        validation_data= valid_dataset,
        class_weight = class_weight,
        epochs=EPOCHS)

    print('Loading best model...')
    model.load_weights('fold-%i.h5'%fold)

    # The test set is encoded once per fold, with this fold's tokenizer.
    x_fake_test = regular_encode(df_test.content,tokenizer,maxlen=MAX_LEN)
    y_fake_test = to_categorical(df_test.labels,3,dtype='int32')
    preds.append(model.predict(x_fake_test))
    # evaluate() returns [loss, accuracy, f1_score]; index 1 is the accuracy.
    weights.append(model.evaluate(valid_dataset)[1])
    model.evaluate(x=x_fake_test,y=y_fake_test,verbose=1)
#     if DISPLAY_PLOT:
#         plt.figure(figsize=(15,5))
#         plt.plot(np.arange(EPOCHS),train_history.history['f1_score'],'-o',label='Train F1 Score',color='#ff7f0e')
#         plt.plot(np.arange(EPOCHS),train_history.history['val_f1_score'],'-o',label='Val F1 Score',color='#1f77b4')
#         x = np.argmax(train_history.history['val_f1_score']); y = np.max( train_history.history['val_f1_score'] )
#         xdist = plt.xlim()[1] - plt.xlim()[0]; ydist = plt.ylim()[1] - plt.ylim()[0]
#         plt.scatter(x,y,s=200,color='#1f77b4'); plt.text(x-0.03*xdist,y-0.13*ydist,'max score\n%.2f'%y,size=14)
#         plt.ylabel('F1 Score',size=14); plt.xlabel('Epoch',size=14)
#         plt.legend(loc=2)
#         plt2 = plt.gca().twinx()
#         plt2.plot(np.arange(EPOCHS),train_history.history['loss'],'-o',label='Train Loss',color='#2ca02c')
#         plt2.plot(np.arange(EPOCHS),train_history.history['val_loss'],'-o',label='Val Loss',color='#d62728')
#         x = np.argmin( train_history.history['val_loss'] ); y = np.min( train_history.history['val_loss'] )
#         ydist = plt.ylim()[1] - plt.ylim()[0]
#         plt.scatter(x,y,s=200,color='#d62728'); plt.text(x-0.03*xdist,y+0.05*ydist,'min loss',size=14)
#         plt.ylabel('Loss',size=14)
#         plt.title('FOLD %i'%(fold+1),size=18)
#         plt.legend(loc=3)
#         plt.show()
    # NOTE(review): this writes the full model over the weights-only checkpoint
    # saved by `sv` under the same file name - confirm that is intended.
    model.save('fold-%i.h5'%fold)
    del model

In [None]:
def weight_ensemble(weights,predictions):
    """Blend per-model class probabilities with normalised weights.

    The weighted sum is printed, stored in ``model_predict_weight.npz`` and
    reduced to the arg-max class per sample.

    Args:
        weights: One non-negative score per model (e.g. validation accuracy).
        predictions: One ``(n_samples, n_classes)`` array per model, in the
            same order as ``weights``.

    Returns:
        1-D array with the winning class index of every sample.

    Raises:
        ValueError: If the two sequences differ in length, or the weights are
            empty or sum to zero (they could not be normalised).
    """
    if len(weights) != len(predictions):
        raise ValueError(
            "got %d weights for %d prediction arrays" % (len(weights), len(predictions)))
    weight_sum = np.sum(weights)
    if weight_sum == 0:
        raise ValueError("weights must be non-empty and must not sum to zero")

    prediction_sum = 0
    for weight, prediction in zip(weights, predictions):
        prediction_sum += (weight/weight_sum)*prediction
    print(prediction_sum)
    np.savez('model_predict_weight',prediction_sum)
    result = np.argmax(prediction_sum,axis=1)
    return result

In [None]:
def mean_ensemble(predictions):
    """Average the per-model class probabilities and pick the best class.

    The averaged probabilities are also stored in ``model_predict_mean.npz``.

    Args:
        predictions: Sequence of equally shaped ``(n_samples, n_classes)``
            arrays, one per model.

    Returns:
        1-D array with the arg-max class index of every sample.
    """
    averaged = np.mean(predictions, axis=0)
    winners = np.argmax(averaged, axis=1)
    np.savez('model_predict_mean', averaged)
    return winners

In [None]:
# Accuracy on df_test of the ensemble weighted by each fold's validation accuracy.
weight_result= weight_ensemble(weights,preds)
accuracy_score(np.array(df_test.labels,dtype='int32'),weight_result)

In [None]:
weight_result

In [None]:
# Accuracy on df_test of the unweighted (mean) ensemble, for comparison.
mean_result = mean_ensemble(preds)
accuracy_score(np.array(df_test.labels,dtype='int32'),mean_result)

In [None]:
# Inverse of the label encoding applied in fake_data_process.
label_map = {0: 'Against', 1: 'Support', 2: 'Neutral'}
pred = [label_map[x] for x in weight_result]

# One label name per line, without a header or id column.
with open('./submission.csv', 'w') as f:
    for x in pred:
        f.write(x+'\n')

In [None]:
# result = np.argmax(preds[4],axis=1)
# f1_score(np.array(fake_valid1.label,dtype='int32'),result,average='weighted')

In [None]:
def make_submission(result):
    """Write ``result`` to ``answer.txt`` as a CSV with ``id`` and ``label``.

    Args:
        result: 1-D NumPy array of predicted class indices; row ``k``
            (zero-based) is written with id ``k + 1``.
    """
    n_rows = result.shape[0]
    submission = pd.DataFrame({'id': range(1, n_rows + 1), 'label': result})
    submission.to_csv('answer.txt', index=False)

In [None]:
make_submission(weight_result)
# make_submission(mean_result)

In [None]:
# submission = pd.DataFrame(columns=['id','label'])
# submission.label = weight_result
# nlist = range(1,weight_result.shape[0]+1)
# submission.id = nlist
# submission.to_csv('answer.txt', index=False)

In [None]:

# MODEL = "lordtt13/COVID-SciBERT"
# tokenizer = AutoTokenizer.from_pretrained(MODEL)


# x_fake_valid = regular_encode(fake_valid1.tweet,tokenizer,maxlen=MAX_LEN)
# x_fake_train = regular_encode(fake_train1.tweet,tokenizer,maxlen=MAX_LEN)



# y_fake_train = to_categorical(fake_train1.label,2,dtype='int32')
# y_fake_valid = to_categorical(fake_valid1.label,2,dtype='int32')

# with strategy.scope():
#     transformer_layer = TFAutoModel.from_pretrained(MODEL)
#     model = build_model(transformer_layer, max_len=MAX_LEN)
# model.summary()
# n_steps = x_fake_train.shape[0] // BATCH_SIZE

# train_dataset = get_train_dataset(x_fake_train,y_fake_train)
# valid_dataset = get_valid_dataset(x_fake_valid,y_fake_valid)
# cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
#         monitor = 'val_f1_score', factor = 0.5, patience = 3, verbose = 1, min_delta = 0.0001, mode = 'max')
# sv = tf.keras.callbacks.ModelCheckpoint(
#         'best_model.h5', monitor='val_f1_score', verbose=0, save_best_only=True,
#         save_weights_only=True, mode='max', save_freq='epoch')
# train_history = model.fit(
#     train_dataset,
#     steps_per_epoch=n_steps,
#     callbacks = [cb_lr_schedule,lr_warm_up,sv],
#     validation_data= valid_dataset,
#     epochs=EPOCHS
#     )
# print('Loading best model...')
# model.load_weights('best_model.h5')
# x_fake_test = regular_encode(fake_valid1.tweet,tokenizer,maxlen=MAX_LEN)
# score = model.evaluate(valid_dataset)[2]
# pred = model.predict(x_fake_test)
# np.savez('single-model',pred)

In [None]:
# f1_score(np.array(fake_valid1.label,dtype='int32'),np.argmax(pred,axis=1),average='weighted')

In [None]:
# submission = pd.DataFrame(columns=['id','label'])
# submission.label = result
# nlist = range(1,result.shape[0]+1)
# submission.id = nlist
# submission.to_csv('answer.txt', sep='\t', index=False)