In [None]:
import pandas as pd # 导入Pandas库
import numpy as np # 导入Numpy库
# Paths to the training and test CSV files.
train_csv_path = '../input/quora-insincere-questions-classification/train.csv'
test_csv_path = '../input/quora-insincere-questions-classification/test.csv'
# Load both splits into DataFrames.
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)
# Preview the first rows of the training data (shown as the cell output).
df_train.head()

In [None]:
from keras.preprocessing.text import Tokenizer # 导入分词工具
# Number of training rows to use. A single constant guarantees that the texts
# and the labels are sliced identically; set it to None to use every row.
n_train_samples = 10000
X_train_lst = df_train["question_text"].values[:n_train_samples]  # question texts (training subset)
X_test_lst = df_test["question_text"].values  # question texts (test set)
X_test_ids = df_test["qid"].values  # question IDs (test set)
y_train = df_train["target"].values[:n_train_samples]  # labels, aligned row-for-row with X_train_lst
dictionary_size = 20000  # vocabulary size limit handed to the tokenizer
tokenizer = Tokenizer(num_words=dictionary_size)  # create the tokenizer
tokenizer.fit_on_texts(X_train_lst)  # build the word index from the training texts only
# Convert every training and test text into a sequence of word indices.
X_train_tokenized_lst = tokenizer.texts_to_sequences(X_train_lst)
X_test_tokenized_lst = tokenizer.texts_to_sequences(X_test_lst)

In [None]:
import matplotlib.pyplot as plt # 导入matplotlib
# Number of tokens in each tokenized training text.
word_per_comment = [len(comment) for comment in X_train_tokenized_lst]
# Histogram of text lengths, with bin edges every 10 tokens from 0 up to 790.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(word_per_comment, bins=np.arange(0, 800, 10))
ax.set_title("Distribution of question lengths (training set)")
ax.set_xlabel("Tokens per question")
ax.set_ylabel("Number of questions")
plt.show()

In [None]:
from keras.preprocessing.sequence import pad_sequences 
# Fixed input length (in tokens) for every text; texts with fewer than 100
# tokens are filled up with pad_sequences' default padding value.
max_comment_length = 100
# Apply the same padding settings to the training and the test sequences.
X_train, X_test = (
    pad_sequences(token_sequences, maxlen=max_comment_length)
    for token_sequences in (X_train_tokenized_lst, X_test_tokenized_lst)
)

In [None]:
from keras.models import Sequential  # sequential (layer-by-layer) model
# Layers are imported from the public keras.layers namespace; the internal
# keras.layers.embeddings module path is not available in every Keras release.
from keras.layers import Dense, Embedding, LSTM
# Note: CuDNNLSTM (also from keras.layers) was tried as an alternative to LSTM
# and is currently disabled.
embedding_vecor_length = 60  # length of each word-embedding vector
model = Sequential()  # start with an empty sequential model
# Embedding layer: vocabulary of dictionary_size entries, vectors of length
# embedding_vecor_length, inputs of max_comment_length tokens.
model.add(Embedding(dictionary_size, embedding_vecor_length, input_length=max_comment_length))
model.add(LSTM(100))  # LSTM layer with 100 units
model.add(Dense(10, activation='relu'))  # fully connected layer
model.add(Dense(1, activation='sigmoid'))  # single-unit sigmoid output for binary classification
model.compile(loss='binary_crossentropy',  # loss function
              optimizer='adam',  # optimizer
              metrics=['accuracy'])  # evaluation metric
# summary() prints the architecture itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

In [None]:
# Train the model and keep fit()'s return value in `history`.
history = model.fit(
    x=X_train,  # padded training sequences
    y=y_train,  # training labels
    batch_size=64,  # batch size
    epochs=10,  # number of training epochs
    validation_split=0.3,  # fraction of the training data split off for validation
)