## LSTM RNN

In [0]:
import os, warnings, pickle, gc, re, string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

from tensorflow.keras.layers import Layer, Dense, Input, Activation, Embedding, SpatialDropout1D, Bidirectional, LSTM, GRU, GlobalMaxPooling1D, GlobalAveragePooling1D, Dropout
from tensorflow.keras.layers import concatenate, add

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler, EarlyStopping
from tensorflow.keras import backend as K

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.metrics import roc_auc_score

warnings.simplefilter('ignore')

Using TensorFlow backend.
  from pandas import Panel


Гиперпараметры. Также были использованы Crawl и GLoVe эмбеддинги

In [0]:
MAX_LEN = 220
MAX_FEATURES = 100000
EMBED_SIZE = 600

BATCH_SIZE = 128

LEARNING_RATE = 8e-4

CRAWL_EMB_PATH = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'
GLOVE_EMB_PATH = '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl'

Вспомогательные функции

In [0]:
def get_coeffs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(embed_dir):
    with open(embed_dir, 'rb') as  infile:
        embeddings = pickle.load(infile)
        return embeddings

In [0]:
def build_embedding_matrix(word_index, embeddings_index, max_features, lower = True, verbose = True):
    embedding_matrix = np.zeros((max_features, 300))
    for word, i in tqdm(word_index.items(), len=(word_index.items())):
        if lower:
            word = word.lower()
        if i >= max_features: continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = embeddings_index["unknown"]
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [0]:
def build_matrix(word_index, embeddings_index):
    embedding_matrix = np.zeros((len(word_index) + 1,300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embeddings_index[word]
        except:
            embedding_matrix[i] = embeddings_index["unknown"]
    return embedding_matrix

In [0]:
class Attention(Layer):
    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None, 
                 W_constraint=None, b_constraint=None, bias=True, **kwargs):

        self.supports_masking = True

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = None
        super(Attention, self).__init__(**kwargs)

        self.param_W = {
            'initializer': initializers.get('glorot_uniform'),
            'name': '{}_W'.format(self.name),
            'regularizer': regularizers.get(W_regularizer),
            'constraint': constraints.get(W_constraint)
        }
        self.W = None

        self.param_b = {
            'initializer': 'zero',
            'name': '{}_b'.format(self.name),
            'regularizer': regularizers.get(b_regularizer),
            'constraint': constraints.get(b_constraint)
        }
        self.b = None

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.features_dim = input_shape[-1]
        self.W = self.add_weight(shape=(input_shape[-1],), 
                                 **self.param_W)

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],), 
                                     **self.param_b)

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        step_dim = self.step_dim
        features_dim = self.features_dim

        eij = K.reshape(
            K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))),
            (-1, step_dim))

        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

Получение всех данных

In [0]:
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")

train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=0)
])

del train1, train2
gc.collect()

valid = pd.read_csv('/kaggle/input/val-en-df/validation_en.csv')

test = pd.read_csv('/kaggle/input/test-en-df/test_en.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

Обработка текстов

In [0]:
misspell_dict = {"aren't": "are not", "can't": "cannot", "couldn't": "could not",
                 "didn't": "did not", "doesn't": "does not", "don't": "do not",
                 "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would", "he'll": "he will", "he's": "he is",
                 "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
                 "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
                 "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
                 "she'd": "she would", "she'll": "she will", "she's": "she is",
                 "shouldn't": "should not", "that's": "that is", "there's": "there is",
                 "they'd": "they would", "they'll": "they will", "they're": "they are",
                 "they've": "they have", "we'd": "we would", "we're": "we are",
                 "weren't": "were not", "we've": "we have", "what'll": "what will",
                 "what're": "what are", "what's": "what is", "what've": "what have",
                 "where's": "where is", "who'd": "who would", "who'll": "who will",
                 "who're": "who are", "who's": "who is", "who've": "who have",
                 "won't": "will not", "wouldn't": "would not", "you'd": "you would",
                 "you'll": "you will", "you're": "you are", "you've": "you have",
                 "'re": " are", "wasn't": "was not", "we'll": " will", "tryin'": "trying"}


def _get_misspell(misspell_dict):
    misspell_re = re.compile('(%s)' % '|'.join(misspell_dict.keys()))
    return misspell_dict, misspell_re


def replace_typical_misspell(text):
    misspellings, misspellings_re = _get_misspell(misspell_dict)

    def replace(match):
        return misspellings[match.group(0)]

    return misspellings_re.sub(replace, text)
    

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']


def clean_text(x):
    x = str(x)
    for punct in puncts + list(string.punctuation):
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    return re.sub(r'\d+', ' ', x)

In [0]:
def preprocess(train, valid, test, tfms):
    for tfm in tfms:
        print(tfm.__name__)
        train['comment_text'] = train['comment_text'].progress_apply(tfm)
        valid['comment_text_en'] = valid['comment_text_en'].progress_apply(tfm)
        test['content'] = test['content'].progress_apply(tfm)
    
    return train, valid, test

In [0]:
tfms = [replace_typical_misspell, clean_text, clean_numbers]
train, valid, test = preprocess(train, valid, test, tfms)

replace_typical_misspell


HBox(children=(FloatProgress(value=0.0, max=328177.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63812.0), HTML(value='')))


clean_text


HBox(children=(FloatProgress(value=0.0, max=328177.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63812.0), HTML(value='')))


clean_numbers


HBox(children=(FloatProgress(value=0.0, max=328177.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=63812.0), HTML(value='')))




Кодирование данных

In [0]:
tokenizer = Tokenizer(num_words=MAX_FEATURES, filters='', lower=False)

tokenizer.fit_on_texts(list(train['comment_text']) + list(valid['comment_text_en']) + list(test['content_en']))
word_index = tokenizer.word_index

X_train = tokenizer.texts_to_sequences(list(train['comment_text']))
y_train = train['toxic'].values

X_valid = tokenizer.texts_to_sequences(list(valid['comment_text_en']))
y_valid = valid['toxic'].values

X_test = tokenizer.texts_to_sequences(list(test['content_en']))

X_train = pad_sequences(X_train, maxlen=MAX_LEN)
X_valid = pad_sequences(X_valid, maxlen=MAX_LEN)
X_test = pad_sequences(X_test, maxlen=MAX_LEN)

y_train = train.toxic.values
y_valid = valid.toxic.values

Матрица эмбеддингов

In [0]:
crawl_embeddings = load_embeddings(CRAWL_EMB_PATH)

glove_embeddings = load_embeddings(GLOVE_EMB_PATH)

embedding_matrix_1 = build_matrix(word_index, crawl_embeddings)
embedding_matrix_2 = build_matrix(word_index, glove_embeddings)

embedding_matrix = np.concatenate([embedding_matrix_1, embedding_matrix_2], axis=1)

Датасеты

In [0]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test)
    .batch(BATCH_SIZE)
)

Модель

In [0]:
def build_model(word_index, embedding_matrix, verbose=True):
    sequence_input = Input(shape=(MAX_LEN,), dtype=tf.int32)
    
    embedding_layer = Embedding(*embedding_matrix.shape,
                                weights=[embedding_matrix],
                                trainable=False)
    
    x = embedding_layer(sequence_input)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    
    att = Attention(MAX_LEN)(x)
    avg_pool1 = GlobalAveragePooling1D()(x)
    max_pool1 = GlobalMaxPooling1D()(x)
    hidden = concatenate([att, avg_pool1, max_pool1])
    
    hidden = Dense(512, activation='relu')(hidden)
    hideen = Dense(128, activation='relu')(hidden)

    out = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(sequence_input, out)
    
    return model

In [0]:
model = build_model(word_index, embedding_matrix)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 220)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 220, 600)     356948400   input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 220, 600)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 220, 512)     1755136     spatial_dropout1d[0][0]          
______________________________________________________________________________________________

In [0]:
cb = LearningRateScheduler(lambda epoch: LEARNING_RATE * (0.6 ** epoch))

Фит на трейне (английский язык)

In [0]:
n_steps = X_train.shape[0] // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    callbacks=[cb],
    epochs=6
)

Train for 2563 steps, validate for 63 steps
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


Фит на валидационной выборке (разные языки)

In [0]:
n_steps = X_valid.shape[0] // BATCH_SIZE

train_history = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    callbacks=[cb],
    epochs=4
)

Train for 62 steps
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [0]:
preds = model.predict(test_dataset, verbose=1)
sub['toxic'] = preds



In [0]:
sub.to_csv('submission.csv', index=False)

Public LB : 0.8919