In [None]:
import pandas as pd
from collections import Counter
import numpy as np
import time
import operator
import re
import gc

from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from string import punctuation

# Download the NLTK 'stopwords' corpus; the preprocessing section reads it
# through nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

# Loading Data

In [None]:
# Directory that holds the competition CSV files.
_COMPETITION_DIR = '/kaggle/input/quora-insincere-questions-classification'

TRAIN_DATA_PATH = _COMPETITION_DIR + '/train.csv'
TEST_DATA_PATH = _COMPETITION_DIR + '/test.csv'

In [None]:
# Read the train and test CSVs into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Report the .shape of both frames, one per line.
print(
    f'Train data shape: {train_data.shape}',
    f'Test data shape: {test_data.shape}',
    sep='\n',
)

In [None]:
# Class balance of 'target': absolute counts next to the same counts
# expressed as a percentage string (two decimals plus a '%' suffix).
target_column = train_data['target']
value_counts = target_column.value_counts()
value_counts_percentage = (
    target_column.value_counts(normalize=True)
    .mul(100)
    .round(2)
    .astype(str)
    + '%'
)
pd.concat(
    [value_counts, value_counts_percentage],
    axis=1,
    keys=['Counts', 'Percentage'],
)

Классы сильно несбалансированы. Доля провокационных вопросов в датасете составляет 6.19% от общего числа вопросов.

# Preprocessing

In [None]:
# Look at one raw question (row label 35) before any preprocessing.
train_data.loc[35, 'question_text']

Препроцессим текст: приводим все слова к нижнему регистру, удаляем пунктуацию и стопворды (слова, не несущие особого смысла).

In [None]:
# English stop words, stored as a frozenset: remove_stopwords() tests every
# token of every question for membership, and a set lookup is constant-time
# whereas a list would be scanned linearly on each test.
STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

# Only referenced by the tokenization step that is commented out in preprocess_data.
tokenizer = WordPunctTokenizer()

lemmatizer = WordNetLemmatizer()
# First letter of a tag produced by nltk.pos_tag -> WordNet part-of-speech constant.
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    """Lemmatize a sequence of tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``; the first letter of each tag is
    looked up in ``wordnet_map``, and tokens whose tag is not mapped are
    lemmatized as nouns.

    Returns:
        list of lemmas, in the same order as the input tokens.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

def remove_punctuation(text):
    """Return ``text`` without any character contained in ``string.punctuation``."""
    kept = []
    for char in text:
        if char in punctuation:
            continue
        kept.append(char)
    return "".join(kept)

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in ``STOPWORDS``.

    ``text`` is passed through ``str()`` first; the surviving tokens are
    re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token not in STOPWORDS:
            kept.append(token)
    return " ".join(kept)


def preprocess_data(data):
    """Add a ``preprocessed_text`` column derived from ``question_text``, in place.

    Every question is lower-cased, stripped of punctuation
    (``remove_punctuation``) and then of stop words (``remove_stopwords``).
    The frame is modified directly and nothing is returned.

    Tokenization and lemmatization (``tokenizer`` / ``lemmatize_words``) are
    not applied by this function.
    """
    def _clean(question):
        # Stop words are matched after lower-casing and punctuation removal,
        # in that order.
        lowered = question.lower()
        without_punctuation = remove_punctuation(lowered)
        return remove_stopwords(without_punctuation)

    data['preprocessed_text'] = data['question_text'].apply(_clean)

In [None]:
%%time
# preprocess_data works in place: both frames gain a 'preprocessed_text' column.
preprocess_data(train_data)
preprocess_data(test_data)

# Preview the training frame again, now with the added column.
train_data.head()

In [None]:
def build_vocab(texts):
    """Count word occurrences over a pandas Series of strings.

    Every entry is split on whitespace.

    Returns:
        dict mapping word -> number of occurrences, with keys in order of
        first appearance.
    """
    counts = {}
    for tokens in texts.apply(lambda entry: entry.split()).values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def vocab_to_integer(vocab):
    """Number the words of ``vocab`` in iteration order, starting at 1.

    Returns:
        dict mapping word -> integer index (index 0 is never assigned).
    """
    word_indices = {}
    for position, word in enumerate(vocab, start=1):
        word_indices[word] = position
    return word_indices

Строим словарь из обработанных данных и маппим слова на соответствующие индексы.

In [None]:
%%time
# The vocabulary is built over the concatenation of the train and test texts.
all_questions = pd.concat([train_data['preprocessed_text'], test_data['preprocessed_text']])
final_vocab = build_vocab(all_questions)
# vocab_to_integer numbers words from 1, so index 0 is not assigned to any word.
word_to_idx = vocab_to_integer(final_vocab)

Строим словарь из исходных вопросов.

In [None]:
# Word counts over the raw (un-preprocessed) questions of both splits.
original_questions = pd.concat([train_data['question_text'], test_data['question_text']])
vocab_original = build_vocab(original_questions)

Задаем некоторые гиперпараметры модели и константы.

In [None]:
# Hyperparameters and constants shared by the cells below.
hparam = {
    'VOCAB_SIZE': len(final_vocab) + 1,  # +1 because word indices start at 1
    'PAD_LENGTH': 77,
    'MINIBATCH_SIZE': 512,
    'LEARNING_RATE': 1e-3,
    'EPOCHS': 4,
    'LSTM_HIDDEN_SIZE': 128,
    'WORD_EMB_DIM': 0,  # increased by the embedding-loading cell, once per embedding
    'KFOLDS': 3,
}

Функция для загрузки готовых эмбеддингов.

In [None]:
def load_embed(file):
    """Load a text-format embedding file into a ``{word: vector}`` dict.

    Every line is split on single spaces: the first field is the word, the
    remaining fields become a float32 numpy vector.

    The file named ``wiki-news-300d-1M.vec`` is read as UTF-8 and its lines of
    100 characters or fewer are skipped (this drops the short header line).
    Any other file is read with the ``latin`` codec and every line is parsed.

    Args:
        file: path to the embedding file, as a string with ``/`` separators.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    is_wiki_news = file.split('/')[-1] == 'wiki-news-300d-1M.vec'

    # The files are opened in `with` blocks so the handles are closed once
    # parsing finishes (or fails) instead of lingering until garbage collection.
    if is_wiki_news:
        # Explicit encoding: do not depend on the platform default.
        with open(file, encoding='utf-8') as handle:
            embeddings_index = dict(
                get_coefs(*line.split(" ")) for line in handle if len(line) > 100
            )
    else:
        with open(file, encoding='latin') as handle:
            embeddings_index = dict(get_coefs(*line.split(" ")) for line in handle)

    return embeddings_index

Ниже описана функция определения слов, которых нет в готовых эмбеддингах.

In [None]:
def check_coverage(vocab, embeddings_index):
    """Report how much of ``vocab`` has a vector in ``embeddings_index``.

    Prints the number of covered / uncovered word occurrences and unique
    words, plus the covered share of the vocabulary and of all text.

    Args:
        vocab: dict mapping word -> occurrence count.
        embeddings_index: mapping word -> vector; a missing word must raise
            ``KeyError`` on item access.

    Returns:
        list of ``(word, count)`` pairs for the words without a vector,
        ordered by descending count.
    """
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += count
        except KeyError:
            # Only a missing key means "unknown word"; any other error is a
            # real problem and must not be swallowed.
            unknown_words[word] = count
            nb_unknown_words += count

    # Guard the ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_occurrences = nb_known_words + nb_unknown_words
    vocab_share = len(known_words) / len(vocab) if vocab else 0.0
    text_share = nb_known_words / total_occurrences if total_occurrences else 0.0

    print('{} known words, {} unique'.format(nb_known_words, len(known_words)))
    print('{} unknown words, {} unique'.format(nb_unknown_words, len(unknown_words)))
    print('Found embeddings for {:.3%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.3%} of all text'.format(text_share))

    # Ascending stable sort followed by a reversal (rather than reverse=True)
    # keeps the original ordering among words with equal counts.
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words

In [None]:
def create_emb_matrix(nb_words, embed_size):
    """Allocate a zero-filled float32 matrix of shape ``(nb_words, embed_size)``.

    Rows that are never overwritten stay all-zero; that is how words without a
    pretrained vector end up represented.
    """
    shape = (nb_words, embed_size)
    return np.zeros(shape, dtype='float32')

def fill_emb_matrix(word_idx, emb_matrix, emb_index):
    """Copy the known word vectors into their rows of ``emb_matrix``, in place.

    Args:
        word_idx: iterable of ``(word, row_index)`` pairs.
        emb_matrix: 2-D array that is written to.
        emb_index: mapping word -> vector; looked up with ``.get``.

    Returns:
        the same ``emb_matrix`` object; rows of words without a vector are
        left untouched.
    """
    for token, row in word_idx:
        vector = emb_index.get(token)
        if vector is None:
            continue
        emb_matrix[row] = vector
    return emb_matrix

def add_lower(embedding, vocab):
    """Register lower-cased aliases for vocabulary words, in place.

    For every word of ``vocab`` that has a vector in ``embedding`` while its
    ``lower()`` form has none, the vector is stored again under the
    lower-cased key. Prints how many keys were added; returns nothing.
    """
    count = 0
    for word in vocab:
        lowered = word.lower()
        if word not in embedding or lowered in embedding:
            continue
        embedding[lowered] = embedding[word]
        count += 1
    print(f"Added {count} words to embedding")

Извлекаем готовые эмбеддинги из архива

In [None]:
import zipfile

# Unpack the pretrained embeddings into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile('../input/quora-insincere-questions-classification/embeddings.zip') as z:
    z.extractall()

In [None]:
# Paths of the extracted embedding files, relative to the working directory.
_glove = './glove.840B.300d/glove.840B.300d.txt'
_paragram = './paragram_300_sl999/paragram_300_sl999.txt'
_wiki_news = './wiki-news-300d-1M/wiki-news-300d-1M.vec'
_google_news = './GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'  # not listed below

# Embeddings to load, in this order.
embeddings = [
    dict(name=label, path=location)
    for label, location in (
        ('glove', _glove),
        ('paragram', _paragram),
        ('fasttext', _wiki_news),
    )
]

In [None]:
%%time

conc_embedding = None
word_index = word_to_idx
nb_words = min(hparam['VOCAB_SIZE'], len(word_index) + 1)
hparam['VOCAB_SIZE'] = nb_words
print(hparam['VOCAB_SIZE'], len(word_index) + 1)
print(f"Got a vocab size of {nb_words} number of words")

for embedding in embeddings:
    emb_name = embedding['name']
    emb_path = embedding['path']
    print("Running procedure on {}".format(emb_name))
    
    # Загружаем эмбеддинги
    print("Loading {}".format(emb_name))
    emb_index = load_embed(emb_path)
    
    # Добавляем слова в нижнем регистре
    print("Adding lowercase to {}".format(emb_name))
    add_lower(emb_index, vocab_original)
    
    
    _ = check_coverage(final_vocab, emb_index)
    
    emb_size = 300
    hparam['WORD_EMB_DIM'] += emb_size
    
    # Конвертируем в формат word2vec
    emb_matrix = create_emb_matrix(nb_words, emb_size)
    print(emb_matrix.size)
    print(emb_matrix.shape)
    emb_matrix = fill_emb_matrix(word_index.items(), emb_matrix, emb_index)
    
    # Конкатенируем новые эмбеддинги с предыдущими
    if conc_embedding is not None:
        conc_embedding = np.concatenate((conc_embedding, emb_matrix), axis=1)
        print("Concatenated! New shape: {}".format(conc_embedding.shape))
    else:
        conc_embedding = emb_matrix
    print("=================================================")
    
    del emb_matrix, emb_index, emb_name, emb_path, emb_size
    import gc; gc.collect()

In [None]:
def embed_word_to_int(X, vocab_to_int):
    """Convert whitespace-separated texts into lists of integer word indices.

    Args:
        X: iterable of strings.
        vocab_to_int: mapping word -> integer index; a word that is missing
            from it raises ``KeyError``.

    Returns:
        list with one list of indices per input text (empty for empty text).
    """
    return [[vocab_to_int[word] for word in question.split()] for question in X]

# Map every word of the preprocessed questions to its integer index.
X_train = embed_word_to_int(train_data['preprocessed_text'].values, word_to_idx)
X_test = embed_word_to_int(test_data['preprocessed_text'].values, word_to_idx)

pad_length = hparam['PAD_LENGTH']

# Bring all sequences to the same length (pad_length); both padding and
# truncation are configured as 'pre'.
padding_options = dict(maxlen=pad_length, padding='pre', truncating='pre')
X_train_pad = pad_sequences(X_train, **padding_options)
X_test_pad = pad_sequences(X_test, **padding_options)

# One example at each stage: text, index list, padded index array.
print(
    train_data['preprocessed_text'][25],
    X_train[25],
    X_train_pad[25],
    sep='\n',
)

Для оценки модели используется метод кросс-валидации, метрика качества - f-мера

In [None]:
def train_val_pred(dataset, hparam, embedding_matrix):
    """Train one model on a fold, tune the decision threshold, predict the test set.

    Args:
        dataset: dict with the keys 'X_train', 'y_train', 'X_val', 'y_val'
            and 'X_test'.
        hparam: dict providing 'VOCAB_SIZE', 'PAD_LENGTH', 'MINIBATCH_SIZE',
            'LEARNING_RATE', 'EPOCHS' and 'WORD_EMB_DIM'.
        embedding_matrix: initial weights of the frozen Embedding layer;
            presumably shaped (VOCAB_SIZE, WORD_EMB_DIM) -- confirm against
            the caller.

    Returns:
        tuple ``(test_preds, val_preds, best_thresh, best_f1)``: the model
        outputs for the test and validation inputs, the threshold in
        [0.1, 0.5] (step 0.01) with the highest validation F1, and that F1.
    """
    # Unpack the data
    X_train = dataset['X_train']
    y_train = dataset['y_train']
    X_val = dataset['X_val']
    y_val = dataset['y_val']
    X_test = dataset['X_test']

    # Unpack the hyperparameters.
    # NOTE: the recurrent layer widths below are hard-coded (256 / 128);
    # hparam['LSTM_HIDDEN_SIZE'] is not consulted.
    VOCAB_SIZE = hparam['VOCAB_SIZE']
    PAD_LENGTH = hparam['PAD_LENGTH']
    MINIBATCH_SIZE = hparam['MINIBATCH_SIZE']
    LEARNING_RATE = hparam['LEARNING_RATE']
    EPOCHS = hparam['EPOCHS']
    WORD_EMB_DIM = hparam['WORD_EMB_DIM']

    # Build the model: frozen embeddings -> BiLSTM -> BiGRU, with the max-pooled
    # outputs of both recurrent layers concatenated into a sigmoid unit.
    inp = Input(shape=(PAD_LENGTH,))
    x = Embedding(VOCAB_SIZE, WORD_EMB_DIM, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.3)(x)
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True))(x)
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True))(x1)
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2])
    predictions = Dense(1, activation='sigmoid')(conc)
    model = Model(inputs=inp, outputs=predictions)
    # The layers and Model come from the standalone `keras` package, so the
    # optimizer is taken from the same package (`from keras import optimizers`)
    # instead of mixing in a tf.keras optimizer object.
    adam = optimizers.Adam(lr=LEARNING_RATE)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()

    # Train the model
    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=MINIBATCH_SIZE,
              validation_data=(X_val, y_val))

    # Pick the probability threshold that maximizes F1 on the validation fold
    val_preds = model.predict(X_val, batch_size=MINIBATCH_SIZE, verbose=1)
    best_f1 = -1
    best_thresh = -1
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        f1 = metrics.f1_score(y_val, (val_preds > thresh).astype(int))
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    print("Best f1 score = {} at thresh {}".format(best_f1, best_thresh))

    # Predict the test set with the configured batch size rather than the
    # library default, matching the validation prediction above.
    test_preds = model.predict(X_test, batch_size=MINIBATCH_SIZE)

    # Drop local references to the large objects before returning;
    # gc is already imported at the top of the notebook.
    del embedding_matrix, model, inp, x, adam
    gc.collect()

    return test_preds, val_preds, best_thresh, best_f1

Разбиваем обучающий датасет на несколько маленьких кусков для кросс-валидации.

In [None]:
# Stratified, shuffled K-fold splitter; the fold count comes from hparam['KFOLDS']
# and the fixed random_state makes the shuffling repeatable.
kfold = StratifiedKFold(n_splits=hparam['KFOLDS'], shuffle=True, random_state=2019)

In [None]:
X = X_train_pad
y = train_data['target'].values

# One entry per fold: test/validation predictions, best threshold and its F1.
results = []

for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
    # Fold-specific names so the notebook-level X_train (the un-padded index
    # lists) is not overwritten by a fold slice.
    X_fold_train, X_fold_val = X[train_index], X[val_index]
    y_fold_train, y_fold_val = y[train_index], y[val_index]

    # The lengths are numbers of questions (rows), not words.
    print(f"Training on {len(X_fold_train)} and validating on {len(X_fold_val)} questions")

    dataset = {'X_train': X_fold_train, 'y_train': y_fold_train,
               'X_val': X_fold_val, 'y_val': y_fold_val,
               'X_test': X_test_pad}

    test_preds, val_preds, thresh, f1 = train_val_pred(dataset, hparam, conc_embedding)

    print("len(test_preds) = {}, len(val_preds) = {}, thresh = {} at f1 = {}".format(len(test_preds),
                                                                                     len(val_preds),
                                                                                     thresh,
                                                                                     f1))
    new_result = {'name': 'fold-' + str(fold),
                  'test_preds': test_preds,
                  'val_preds': val_preds,
                  'thresh': thresh,
                  'f1': f1}
    results.append(new_result)

    # gc is already imported at the top of the notebook.
    gc.collect()

Выводим список f-мер и трешхолды для соответствующих кусков датасета.

In [None]:
print("Got {} number of results!".format(len(results)))

# Per-fold report.
for result in results:
    print("{} gave f1 score {} with thresh {}".format(result['name'], result['f1'], result['thresh']))

# Mean of the per-fold best thresholds.
avg_thresh = sum(fold_result['thresh'] for fold_result in results) / len(results)
print("Got an average threshold at {}".format(avg_thresh))

In [None]:
print("Avg treshold {}".format(avg_thresh))

# Every fold contributes with the same weight.
factor = 1.0 / len(results)
print("Using factor: ", factor)

# Weighted sum of the per-fold test predictions, i.e. their average.
fold_test_preds = [fold_result['test_preds'] for fold_result in results]
pred_test_y = fold_test_preds[0] * factor
for later_preds in fold_test_preds[1:]:
    pred_test_y += factor * later_preds

# Binarize with the averaged threshold.
pred_test_y_res = (pred_test_y > avg_thresh).astype(int)

# Each prediction row is indexed with [0] to take its first (label) entry.
results_dict = {
    'qid': test_data['qid'].values,
    'prediction': [row[0] for row in pred_test_y_res],
}

print(results_dict['qid'][:15])
print(results_dict['prediction'][:15])

df = pd.DataFrame(data=results_dict)
df.to_csv('submission.csv', index=False)
print("Saved")