In [56]:
# 导入需要的模块
import tensorflow as tf # tensorflow模块，深度学习框架
import keras # keras模块，深度学习框架
import re # 正则表达式
import numpy as np # numpy数组
import pandas as pd # pandas表格
import matplotlib.pyplot as plt # matplotlib 数据可视化
from sklearn.model_selection import train_test_split # 训练集、验证集、测试集的划分
from tensorflow.keras.models import load_model # 用于加载模型
from sklearn.utils import class_weight # 用于计算样本的权重
from sklearn.metrics import roc_curve, auc # 用于绘画ROC曲线，计算AUC值
from keras.preprocessing.image import ImageDataGenerator # keras模块的图片预处理模块，可用于数据增强
from keras.optimizers import adam_v2 # Adam优化器
from collections import Counter # 词频统计包
import time
from keras.callbacks import TensorBoard
from sklearn.metrics import confusion_matrix
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.models import Model

In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [9]:
import pandas as pd
train_data = pd.read_csv('train_data.csv',encoding='ISO-8859-1')
test_data = pd.read_csv('test_data.csv',encoding='ISO-8859-1')
train_data['non_toxic'] = train_data['toxic'].apply(lambda x:1 if x==0 else 0)
test_data['non_toxic'] = test_data['toxic'].apply(lambda x:1 if x==0 else 0)
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,non_toxic
0,168ac3d396c7d588,if there is a chromosone then e=what is it?Sma...,0,0,0,0,0,0,1
1,168bc99fa2cfd9aa,Hollywood Undead \n\nI have collected articles...,0,0,0,0,0,0,1
2,168cd51c24508159,"""\n\n Rollback \n\nI've enabled rollback on yo...",0,0,0,0,0,0,1
3,168d515e2e99f78d,Another backlog. Thanks. (Trouble?/My Work),0,0,0,0,0,0,1
4,168d5a1c66f5e8bf,""" - unsigned\n\nWe do include it. This article...",0,0,0,0,0,0,1


In [10]:
form_dict={'severe_toxic':1,'obscene':2,'threat':3,'insult':4,'identity_hate':5,'non_toxic':0}

tmp_traindata = train_data[['severe_toxic','obscene','threat','insult','identity_hate','non_toxic']]
train_data['Form'] = tmp_traindata.apply(lambda row:form_dict[tmp_traindata.columns[row.tolist().index(max(row.tolist()))]] , axis = 1 )

tmp_testdata = test_data[['severe_toxic','obscene','threat','insult','identity_hate','non_toxic']]
test_data['Form'] = tmp_testdata.apply(lambda row:form_dict[tmp_testdata.columns[row.tolist().index(max(row.tolist()))]] , axis = 1 )

In [11]:
train_data['Form'].value_counts()

0    193488
2      9212
1      8116
4      2106
5       249
3       205
Name: Form, dtype: int64

In [66]:
# 处理训练集
# 将对应的标签拿出来
#train_labels = np.array(train_data['Form'], dtype=np.int32)
train_labels = pd.get_dummies(train_data['Form'])
t0 = time.time()
# 分词处理
train_intro_texts = []
for intro in train_data['comment_text']:
    intro = intro.replace(',',' ').replace('.',' ').replace('\n',' ').replace('?',' ').replace('!',' ').replace(';',' ').replace(':',' ')
    intro = intro.replace('(',' ').replace(')',' ').replace('[',' ').replace(']',' ').replace('{',' ').replace('}',' ')
    intro = intro.replace('"',' ').replace('."',' ').replace('?"',' ').replace('!"',' ').replace('-','')
    intro = intro.lower()
    intro = [i for i in intro.split()]
    train_intro_texts.append(intro)
print("分词时间：%s"%(time.time()-t0))

分词时间：4.611855506896973


In [68]:
# 处理测试集
# 将对应的标签拿出来
#test_labels = np.array(test_data['Form'], dtype=np.int32)
test_labels = pd.get_dummies(test_data['Form'])
t0 = time.time()
# 分词处理
test_intro_texts = []
for intro in test_data['comment_text']:
    intro = intro.replace(',',' ').replace('.',' ').replace('\n',' ').replace('?',' ').replace('!',' ').replace(';',' ').replace(':',' ')
    intro = intro.replace('(',' ').replace(')',' ').replace('[',' ').replace(']',' ').replace('{',' ').replace('}',' ')
    intro = intro.replace('"',' ').replace('."',' ').replace('?"',' ').replace('!"',' ').replace('-','')
    intro = intro.lower()
    intro = [i for i in intro.split()]
    test_intro_texts.append(intro)
print("分词时间：%s"%(time.time()-t0))

分词时间：0.19749951362609863


In [69]:
# 训练集，验证集，测试集的划分
# text_s,text_test_s,\
# label_s,label_test_s = train_test_split(
#     intro_texts,labels,
#     test_size=1.0, 
#     stratify=labels,
#     random_state=233)
text_train,text_val,\
label_train,label_val = train_test_split(
    train_intro_texts,train_labels,
    test_size=0.2, 
    stratify=train_labels,
    random_state=233)  # 训练集和验证集的划分
text_test = test_intro_texts
label_test = test_labels
print(text_train[0])
print(label_train[0])
print(len(text_train))
print(len(label_train))

['statto', 'com', 'hello', 'yes', "it's", 'been', 'back', 'for', 'a', 'couple', 'of', 'weeks', 'now', 'hopefully', 'it', 'intends', 'to', 'stay', 'up', 'this', 'time', 'cheers']
134115    1
3236      1
125329    1
18471     1
167131    1
         ..
206152    1
22791     1
6923      1
190231    1
80092     1
Name: 0, Length: 170700, dtype: uint8
170700
170700


In [71]:
intro_lengths = [len(introduction) for introduction in text_train]#统计评论的长度

SENLEN = 400

# VOCAB_SIZE 的决定:训练集上该词出现的次数>=2
train_word_list = [word for sequence in text_train for word in sequence]
word_counter_introduction = Counter(train_word_list)#统计每个词出现的次数
most_common_word_in_train_introduction = word_counter_introduction.most_common()#词频降序排序
VOCAB_SIZE = len(most_common_word_in_train_introduction) + 1 # 设置一个初值，大概率结果不是这个
for i in range(len(most_common_word_in_train_introduction)):
    if most_common_word_in_train_introduction[i][1] <= 1:
        VOCAB_SIZE = i + 1 #VOCAB_SIZE设为词频>=2的词的数量
        break
        
# 对训练集建表
tokenizer = keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(text_train)

# 将训练集的文本数字化
text_train = tokenizer.texts_to_sequences(text_train)
text_train = keras.preprocessing.sequence.pad_sequences(
    text_train, maxlen=SENLEN, padding='post')

# 将验证集的文本数字化
text_val = tokenizer.texts_to_sequences(text_val)
text_val = keras.preprocessing.sequence.pad_sequences(
    text_val, maxlen=SENLEN, padding='post')

# 将测试集的文本数字化
text_test = tokenizer.texts_to_sequences(text_test)
text_test = keras.preprocessing.sequence.pad_sequences(
    text_test, maxlen=SENLEN, padding='post')

In [72]:
#! -*- coding: utf-8 -*-
#%%
from __future__ import print_function
from tensorflow.keras import backend as K
from keras.layers import Layer
 
class Position_Embedding(Layer):
 
    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size #必须为偶数
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)
 
    def call(self, x):
        if (self.size == None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])
        batch_size,seq_len = K.shape(x)[0],K.shape(x)[1]
        position_j = 1. / K.pow(10000., \
                                 2 * K.arange(self.size / 2, dtype='float32' \
                               ) / self.size)
        position_j = K.expand_dims(position_j, 0)
        position_i = K.cumsum(K.ones_like(x[:,:,0]), 1)-1 #K.arange不支持变长，只好用这种方法生成
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)
 
    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2]+self.size)

In [76]:
 
class Attention(Layer):
 
    def __init__(self, nb_head, size_per_head, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.nb_head = nb_head
        self.size_per_head = 2*size_per_head
        self.output_dim = nb_head*size_per_head
        
    def build(self, input_shape):
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)
 
    def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len == None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape)-2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12
 
    def call(self,x):
        #如果只传入Q_seq,K_seq,V_seq，那么就不做Mask
        #如果同时传入Q_seq,K_seq,V_seq,Q_len,V_len，那么对多余部分做Mask
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        #对Q、K、V做线性变换
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
        #计算内积，然后mask，然后softmax
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1))
        A = K.softmax(A)
        #输出并mask
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq
 
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

In [82]:
def TransformerBlock(x,nb_head,ff_dim):
    O_seq = Attention(nb_head)([x,x,x])
    O_seq = keras.layers.add([O_seq,x])
    O_seq = keras.layers.LayerNormalization(epsilon=1e-6)(O_seq)
    x_res = O_seq
    O_seq = keras.layers.Dense(ff_dim,activation='relu')(O_seq)
    O_seq = keras.layers.Dense(ff_dim,activation='relu')(O_seq)
    O_seq = keras.layers.add([O_seq,x_res])
    O_seq = keras.layers.LayerNormalization(epsilon=1e-6)(O_seq)
    return O_seq

### 尝试随机过采样

In [10]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(text_train,label_train)
print(len(X_resampled))
#print(sorted(Counter(y_resampled).items()))
print(len(text_train))

928740
170700


## Transformer 测试

In [87]:
'''
model : Transformer
'''
batch_size = 256
EMB_SIZE = 128
nb_head = 8
ff_dim = 128
from keras.layers import *
S_inputs = Input(shape=(SENLEN,), dtype='int32') 
embeddings = Embedding(VOCAB_SIZE, EMB_SIZE)(S_inputs)
embeddings = Position_Embedding()(embeddings) #增加Position_Embedding能轻微提高准确率
O_seq = TransformerBlock(embeddings,nb_head,ff_dim)
for i in range(3):
    O_seq = TransformerBlock(O_seq,nb_head,ff_dim)
O_seq = GlobalAveragePooling1D()(O_seq)
O_seq = Dropout(0.5)(O_seq)
outputs = Dense(6, activation='softmax')(O_seq)
model = Model(inputs=S_inputs, outputs=outputs)
# try using different optimizers and different optimizer configs
opt = adam_v2.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
loss = 'categorical_crossentropy'
#model.compile(loss=loss,optimizer=opt,metrics=['accuracy'])
print(model.summary())

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           [(None, 400)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 400, 128)     14036608    input_14[0][0]                   
__________________________________________________________________________________________________
position__embedding_12 (Positio (None, 400, 128)     0           embedding_12[0][0]               
__________________________________________________________________________________________________
attention_8 (Attention)         (None, 400, 128)     1           position__embedding_12[0][0]     
                                                                 position__embedding_12[0][0



In [88]:
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])

In [90]:
early2_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5, mode='min') # 采用早停的策略
history = model.fit(
    text_train,label_train, batch_size=64, epochs=5, validation_data=(text_val,label_val)
)
#model2.save('model2.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [91]:
encoder = LabelEncoder()
# make predictions
pred = model.predict(text_test)
#pred = encoder.fit_transform(pred)
# inverse numeric variables to initial categorical labels
pred_labels = [np.argmax(x) for x in pred]

In [92]:
cf = confusion_matrix(test_data['Form'],pred_labels)
cf

array([[7412,  168,    9,    0,    4,    0],
       [ 517,  339,  309,    1,   54,    2],
       [ 130,  192,  691,    1,   34,    0],
       [  11,    9,    1,    1,    5,    1],
       [  84,   98,   27,    1,   50,    0],
       [   4,    9,    4,    0,    5,    0]])

In [94]:
print("Accuracy: %3f"%((TP+TN)/(TP+TN+FP+FN)))
print("Precision: %3f"%(TP/(TP+FP)))
print("Recall: %3f"%((TP)/(TP+FN)))
print("Specificity: %3f"%((TN)/(TN+FP)))
precision = TP/(TP+FP)
recall = (TP)/(TP+FN)
F1Score = 2*recall*precision/(recall+precision)
print("F1-Score: %3f"%F1Score)

Accuracy: 0.908876
Precision: 0.908556
Recall: 0.976162
Specificity: 0.710853
F1-Score: 0.941147
