In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the read-only Kaggle input
# directory, then print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split

#### 解压数据

In [None]:
# Open the embeddings archive and get a binary stream for the GloVe text file.
# Nothing is unpacked to disk; the member is read straight from the zip.
import zipfile
zip_ref = zipfile.ZipFile(
    '/kaggle/input/quora-insincere-questions-classification/embeddings.zip',
    'r',
)
print(zip_ref.namelist())
embeddings = zip_ref.open(
    'glove.840B.300d/glove.840B.300d.txt',
    'r',
)

#### 读取预训练词向量（GloVe）

In [None]:
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: The token the vector belongs to.
        *arr: The vector components, one value per positional argument.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse every line of the embeddings stream into a {word: vector} lookup.
# Each line is bytes, so it is decoded before being split on single spaces.
embeddings_index = {}
for raw_line in embeddings:
    word, vector = get_coefs(*raw_line.decode().split(" "))
    embeddings_index[word] = vector

#### 读取数据

In [None]:
# Load the competition's training and test tables.
train_data = pd.read_csv(
    '/kaggle/input/quora-insincere-questions-classification/train.csv'
)
test_data = pd.read_csv(
    '/kaggle/input/quora-insincere-questions-classification/test.csv'
)

# Report the (rows, columns) shape of each table, then preview five random
# training rows as the cell output.
print('训练集维度：\n',train_data.shape)
print('测试集维度：\n',test_data.shape)
train_data.sample(5)

In [None]:
# Turn the text and label columns into plain Python lists
train_input = [*train_data['question_text']]
train_label = [*train_data['target']]

test_input = [*test_data['question_text']]

#### 数据预处理-去除停用词（用空格替换）

In [None]:
from nltk.corpus import stopwords
stop=stopwords.words('english') # NLTK's English stop-word list

def remove_stop_words(x, stop_words=None):
    """Return ``x`` with every stop word removed.

    The text is split on single spaces and each piece is checked against the
    stop-word set. Compared with repeatedly replacing ``" word "`` by ``" "``,
    this also removes stop words at the very start or end of the text and
    back-to-back repeats of the same stop word (where the two occurrences
    share one delimiting space). Runs of spaces between the words that are
    kept are not collapsed.

    Matching ignores case, so "The" is removed when "the" is a stop word.
    Pieces with attached punctuation (e.g. "the,") are left untouched.

    Args:
        x: The text to clean.
        stop_words: Optional iterable of lower-case stop words. When omitted,
            the notebook-level ``stop`` list is used.

    Returns:
        The cleaned text as a single string.
    """
    vocabulary = stop if stop_words is None else stop_words
    blocked = frozenset(vocabulary)
    kept = [piece for piece in x.split(" ") if piece.lower() not in blocked]
    return " ".join(kept)

# Run the stop-word filter over every training and test question
train_input_rsw = [remove_stop_words(question) for question in train_input]
test_input_rsw = [remove_stop_words(question) for question in test_input]

In [None]:
max_features=100000 # vocabulary cap handed to the Tokenizer as num_words
embed_size = 300 # dimensionality of each word vector
max_length = 60 # number of tokens every question is padded/truncated to

#### 分词，并通过语料库提取嵌入矩阵

In [None]:
# Fit the vocabulary on the cleaned training questions. num_words caps how
# many of the most frequent words are used when texts are later converted to
# index sequences; the full vocabulary is still recorded in word_index.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_input_rsw)
word_index = tokenizer.word_index
n_words = min(max_features, len(word_index))

# Row i holds the pretrained vector of the word whose index is i. Row 0 and
# the rows of words missing from embeddings_index stay all-zero. The width
# comes from embed_size instead of a repeated literal, and float32 matches the
# dtype of the loaded vectors (half the memory of the float64 default).
embedding_matrix = np.zeros((n_words + 1, embed_size), dtype='float32')

for word, i in word_index.items():
    # Indices at or above the cap are skipped, so they are never looked up.
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

#### 将文本转换为数字，并对其进行填充处理

In [None]:
# Padding settings shared by both splits: every question is padded or cut at
# the end so that it is exactly max_length indices long.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')

# Training questions -> lists of word indices -> fixed-width array
sequences = tokenizer.texts_to_sequences(train_input_rsw)
train_input_padded = pad_sequences(sequences, **pad_options)
print(train_input_padded.shape)

# Same conversion for the test questions
sequences = tokenizer.texts_to_sequences(test_input_rsw)
test_input_padded = pad_sequences(sequences, **pad_options)
print(test_input_padded.shape)

In [None]:
# Show the first encoded and padded training question as a sanity check
train_input_padded[0]

In [None]:
 #shuffling the data
#np.random.seed(2)
#trn_idx = np.random.permutation(len(train_data))

#train_X = train_input_padded[trn_idx]
#train_y = train_data['target'][trn_idx] 

train_text, cv_text, train_target, cv_target = train_test_split(train_input_padded, train_label, test_size = 0.1, random_state=2)

####  Keras 建模

In [None]:
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,Dropout,Conv1D,MaxPooling1D,Dense

In [None]:
# Frozen pretrained embeddings -> BiLSTM -> Conv1D + max-pooling -> LSTM ->
# sigmoid output for binary classification.
lstm = Sequential([
    # The embedding's (input_dim, output_dim) are taken from the weight matrix
    # itself so the layer can never disagree with the matrix it is loaded with.
    Embedding(*embedding_matrix.shape, input_length=max_length,
              weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.2),
    Conv1D(100, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(128),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
])

lstm.summary()

lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])



In [None]:
import gc

# Drop the word -> vector lookup to free memory before training.
# Guarded so that re-running this cell does not fail once the name is gone.
try:
    del embeddings_index
except NameError:
    pass
gc.collect()

In [None]:
# Train for five epochs, evaluating on the held-out split after each one
history = lstm.fit(
    np.array(train_text),
    np.array(train_target),
    validation_data=(np.array(cv_text), np.array(cv_target)),
    epochs=5,
    batch_size=1024,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt


# Per-epoch metrics recorded during fit()
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Take the x-axis from the number of recorded epochs rather than a fixed
# count, so the plots still work when the number of training epochs changes.
epochs = range(len(acc))

# (title, metric label, training values, validation values) for each figure
curves = [
    ('Training and validation accuracy', 'Accuracy', acc, val_acc),
    ('Training and validation loss', 'Loss', loss, val_loss),
]
for title, metric_name, train_values, val_values in curves:
    plt.figure()
    plt.plot(epochs, train_values, 'r')
    plt.plot(epochs, val_values, 'b')
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(metric_name)
    plt.legend([metric_name, "Validation " + metric_name])
    plt.show()

#### 预测

In [None]:
from sklearn.metrics import f1_score

# Predicted probabilities for the held-out split
cv_predictions = lstm.predict(cv_text, batch_size=512)

# Score every cut-off from 0.10 to 0.50 in steps of 0.01 by its F1 on the
# held-out split; each entry is a [threshold, f1] pair.
thresholds = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    result = f1_score(cv_target, (cv_predictions.ravel() > thresh).astype(int))
    thresholds.append([thresh, result])
    print("F1 score at threshold {} is {}".format(thresh, result))

# Best pair first
thresholds.sort(key=lambda x: x[1], reverse=True)
print("Best value {}".format(thresholds[0]))
# Keep only the threshold itself, not the whole [threshold, f1] pair
best_thresh = thresholds[0][0]

In [None]:
# Confusion matrix on the held-out split, with probabilities rounded to the
# nearest label (i.e. a 0.5 cut-off). A larger batch size only speeds up
# inference. The crosstab is the cell's displayed output.
predictions = lstm.predict(cv_text, batch_size=512)
predictions = np.around(predictions).astype(int)
df = pd.DataFrame({'pred': predictions.flatten(), 'actual': cv_target})
pd.crosstab(df['pred'], df['actual'], margins=True)

In [None]:
# Predicted probabilities for the test questions; the batch size matches the
# one used for the held-out split and only affects inference speed.
predictions = lstm.predict(test_input_padded, batch_size=512)

In [None]:
# Binarise the test probabilities at the tuned cut-off. best_thresh may hold
# either the bare threshold or a [threshold, score] pair; np.ravel(...)[0]
# picks the threshold in both cases, so the comparison is always against a
# single number and the result keeps the shape of predictions.
predictions1 = (predictions > np.ravel(best_thresh)[0]).astype(int)

In [None]:
# Write the submission file: one row per test question, using column 0 of the
# thresholded predictions as the label. The raw probabilities in predictions
# are left untouched so this cell can be re-run safely.
submission = pd.DataFrame({'qid': test_data.qid, 'prediction': predictions1[:,0]})
submission.to_csv('submission.csv', index=False)