In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter("ignore")

# Load the full Quora training set.
train_df = pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
# Keep only the first 500,000 rows. `.iloc` is positional and end-exclusive;
# the label-based `.loc[:500000]` is end-inclusive and would keep 500,001 rows.
df_subset = train_df.iloc[:500000]
# Hold out 10% of the subset for validation. A fixed random_state makes the
# split (and everything derived from it) reproducible across kernel restarts.
df_train, df_valid = train_test_split(df_subset, test_size=0.1, random_state=42)


In [None]:
# Peek at the first ten labels and the first two rows of the training split.
print(df_train["target"].values[:10])
print(df_train.head(2))

# 数据分析部分

1. 首先构建一个词典，这个词典会把单词对应成索引
2. 通过使用keras内置的分析工具取出训练集中最为常用的20000个单词作为词典


In [None]:
# List the column names available in the training split.
print(df_train.columns)

# create a Vocabulary using the question_text

from keras.preprocessing.text import Tokenizer
def get_vocab(df, num_words=20000):
    """
    Fit a Keras ``Tokenizer`` on the ``question_text`` column of ``df``.

    Args:
        df: DataFrame exposing a ``question_text`` column whose entries
            support ``.lower()``.
        num_words: forwarded unchanged to ``Tokenizer(num_words=...)``.

    Returns:
        The ``Tokenizer`` after ``fit_on_texts`` has been run on the
        lower-cased questions.
    """
    fitted = Tokenizer(num_words=num_words)
    lowered_questions = [question.lower() for question in df["question_text"].tolist()]
    fitted.fit_on_texts(lowered_questions)
    return fitted
# Build the tokenizer from the training split only.
tokenizer = get_vocab(df_train)

In [None]:
# Sanity check: print the index sequence produced for two sample sentences.
for sample in ("I Love you", "To be a better man."):
    print(tokenizer.texts_to_sequences([sample]))

使用序列化方法sequence来实现对句子的padding

In [None]:
%%time
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length handed to pad_sequences (and later to the Embedding layer).
MAX_LENGTH = 40

# Step 1: turn each question into a list of word indices.
train_sequences = tokenizer.texts_to_sequences(df_train.question_text.tolist())
valid_sequences = tokenizer.texts_to_sequences(df_valid.question_text.tolist())

# Step 2: bring every sequence to MAX_LENGTH, padding at the end ('post').
train_X = np.array(pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post'))
valid_X = np.array(pad_sequences(valid_sequences, maxlen=MAX_LENGTH, padding='post'))
print(train_X.shape)

# Labels for both splits as plain numpy arrays.
train_y = np.array(df_train.target.values)
valid_y = np.array(df_valid.target.values)

In [None]:
# Show the first training question next to its padded index sequence.
# `.iloc[0]` is positional. After the shuffled train/validation split, the label
# 0 may not exist in df_train at all (KeyError), and when it does it is not the
# row that train_X[0] was built from.
print(df_train.question_text.iloc[0], train_X[0])

# 使用预训练的词向量，并构建出它与之前词典之间的映射关系。

* 首先建立一个关于所有单词的embedding矩阵，把这个矩阵保存起来，之后使用这个矩阵来初始化模型后面的embedding-layer就行了。

In [None]:
# Parse the GloVe text file into a {word: vector} lookup for later use.
# Each line is read as "<word> <v1> <v2> ...", separated by single spaces.
embeddings_index = {}
glove_path = '../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
# `with` closes the handle even if a line fails to parse. The encoding is given
# explicitly so decoding does not depend on the platform default
# (assumes the file is UTF-8, as GloVe releases are distributed).
with open(glove_path, encoding='utf-8') as glove_file:
    for line in tqdm(glove_file):
        # Drop the trailing newline so the last coefficient is a clean number.
        values = line.rstrip("\n").split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Show the vocabulary cap configured on the tokenizer (its num_words setting).
print(tokenizer.num_words)

**构建一个matrix，这个matrix要和之前建立的词典 tokenizer.word_index 一一对应起来，之后使用这个matrix来作为embedding的初始化**

In [None]:
# Build the initial weight matrix for the Embedding layer.
#
# tokenizer.word_index contains every word seen during fitting, but a Tokenizer
# created with num_words=N only ever emits indices 1..N-1 from
# texts_to_sequences. Rows beyond that can never be looked up, so the matrix is
# capped to the reachable indices instead of the full vocabulary.
full_vocab_size = len(tokenizer.word_index)
if tokenizer.num_words is None:
    # No cap configured: every word index can appear in the sequences.
    vocab_size = full_vocab_size
else:
    vocab_size = min(full_vocab_size, tokenizer.num_words - 1)

# Row 0 is the padding index; rows 1..vocab_size are words. Words without a
# pretrained vector keep their random normal initialisation.
embedding_matrix = np.random.normal(loc=0, scale=1.0, size=(vocab_size + 1, 300))
for word, i in tokenizer.word_index.items():
    if i > vocab_size:
        # Index is outside the capped vocabulary and is never emitted.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## 使用keras来构建一个模型

In [None]:
from keras.models import Sequential
from keras.layers import RNN, LSTM, Dropout, Flatten, Embedding, SpatialDropout1D, Dense

In [None]:
# Model definition: embedding initialised from embedding_matrix, spatial
# dropout, a single LSTM layer, then a dense head with a sigmoid output.
model = Sequential([
    Embedding(vocab_size+1, 300, input_length=MAX_LENGTH, weights=[embedding_matrix]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Binary classification set-up: cross-entropy loss, Adam, accuracy as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

## 直接进行训练

# 训练两轮，每一次batch_size 是256

In [None]:
# Train for two epochs with mini-batches of 256 samples. The held-out split is
# passed as validation_data so that loss and accuracy on unseen questions are
# reported after every epoch, rather than training metrics only.
model.fit(
    train_X,
    train_y,
    epochs=2,
    verbose=1,
    batch_size=256,
    validation_data=(valid_X, valid_y),
)