In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Préparation des données

In [None]:
# Target columns of the Jigsaw toxic-comment training set.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Candidate per-label weight vectors (one weight per entry of `labels`).
label_weights = [1.2, 1.3, 0.5, 0.4, 0.3, 0.5]
label_weights_2 = [0.32, 1.5, 0.16, 1.5, 0.64, 1.5]
label_weights_3 = [1, 2, 2, 5, 1, 2]
label_weights_4 = [1, 2, 2, 4, 4, 5]
#tf.random.set_seed(50)

Récupération des données de la compétition [Jigsaw Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

In [None]:
# Load the labelled training comments and preview the first rows.
train = pd.read_csv(filepath_or_buffer='../input/jigsaw-training/train.csv')
train.head(n=5)

In [None]:
# Peek at the first raw comment to see what the cleaning step has to deal with.
train['comment_text'].iloc[0]

In [None]:
# Distribution of toxic comments: 'toxicity' is the number of positive labels per comment.
# Summing the explicit label columns (instead of "every column except id/comment_text")
# keeps this cell idempotent: re-running it no longer folds the previously computed
# 'toxicity' column into the new total.
train['toxicity'] = train[labels].sum(axis=1)
train.toxicity.value_counts()

In [None]:
#Disabled experiment: balance the training set by keeping as many comments with toxicity == 0 as comments with toxicity > 0 (original note: one of the two classes was only ~10% of the data before)
#pos = train[train.toxicity>0]
#neg = train[train.toxicity==0].sample(len(pos))
#train = pd.concat([pos, neg])
#train.toxicity.value_counts()

In [None]:
# Per-column flag: does the column contain at least one missing value?
train.isna().any(axis=0)

In [None]:
# Cleaning function: removes URLs, HTML tags and every character that is not an
# ASCII letter or a space (digits such as IP addresses, punctuation, symbols, ...).

def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated ASCII letters.

    Steps, in order:
    1. Remove embedded URL links (http://..., https://..., www. ...).
    2. Strip HTML tags, keeping only the text content.
    3. Turn every run of whitespace (newlines, tabs, ...) into a single space,
       so that words on adjacent lines are not glued together by step 4.
    4. Drop every character that is not an ASCII letter or a space.
    5. Collapse repeated spaces and trim both ends.

    text - Text piece to be cleaned. Non-string values (e.g. NaN coming from
           an empty CSV cell) are treated as empty text.

    Returns the cleaned string ('' for non-string input).
    '''
    # A missing value would otherwise make the regex calls raise TypeError.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = BeautifulSoup(text, 'lxml').get_text()
    # Must happen before the letter filter: that filter deletes "\n" and "\t"
    # outright, which used to merge "foo\nbar" into "foobar".
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^A-Za-z ]+", "", text)
    text = re.sub(' +', ' ', text)
    return text.strip()

In [None]:
# Clean every training comment with text_cleaning (the column is overwritten).
train['comment_text'] = train['comment_text'].map(text_cleaning)

In [None]:
# Separate the comment texts from the label matrix (one column per entry of `labels`).
list_sentences_train = train["comment_text"]
y = train[labels].to_numpy()

In [None]:
# Fit a tokenizer limited to the 20000 most frequent words, then encode each comment as a sequence of word indices.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

In [None]:
# Bring every sequence to exactly 200 tokens (pad_sequences defaults spelled out: pad and truncate at the front).
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen, padding='pre', truncating='pre')

# Création du modèle

In [None]:
# RNN architecture: embedding -> LSTM -> global max pooling -> dense head with 6 sigmoid outputs.
embed_size = 128
inp = Input(shape=(maxlen,))
embedded = Embedding(max_features, embed_size)(inp)
#x = Bidirectional(LSTM(50, return_sequences=True))(x)
recurrent = LSTM(60, return_sequences=True)(embedded)
pooled = GlobalMaxPool1D()(recurrent)
pooled = Dropout(0.1)(pooled)
hidden = Dense(60, activation="relu")(pooled)
hidden = Dropout(0.1)(hidden)
# `x` is the output tensor consumed when the Model is assembled.
x = Dense(6, activation="sigmoid")(hidden)


In [None]:
# Assemble the model and compile it with binary cross-entropy loss, the Adam optimizer and the accuracy metric.
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Render the architecture diagram to model_plot.png.
# Uses the public keras.utils.plot_model entry point through the `keras` alias
# imported at the top of the notebook: the private keras.utils.vis_utils module
# is not importable in newer Keras releases, and this also avoids a mid-notebook import.
keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Training: 2 epochs with mini-batches of 32; 10% of the data is held out for validation.
batch_size, epochs = 32, 2

model.fit(
    x=X_t,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# Validation sur les données de la compétition

In [None]:
# Load the validation pairs of the severity-rating competition and preview the first rows.
val_data = pd.read_csv(filepath_or_buffer='../input/jigsaw-toxic-severity-rating/validation_data.csv')
val_data.head(n=5)

In [None]:
# Apply the same cleaning as for the training text to both sides of every validation pair.
for pair_column in ('less_toxic', 'more_toxic'):
    val_data[pair_column] = val_data[pair_column].map(text_cleaning)

Tokenizer + padding

In [None]:
# less_toxic side of each pair: encode with the tokenizer fitted on the training text, then pad to maxlen.
list_sentences_val_lt = val_data["less_toxic"]
list_tokenized_val_lt = tokenizer.texts_to_sequences(list_sentences_val_lt)
val_lt = pad_sequences(list_tokenized_val_lt, maxlen=maxlen)

# more_toxic side of each pair: same treatment.
list_sentences_val_mt = val_data["more_toxic"]
list_tokenized_val_mt = tokenizer.texts_to_sequences(list_sentences_val_mt)
val_mt = pad_sequences(list_tokenized_val_mt, maxlen=maxlen)

Predict

In [None]:
#model.load_weights(weight_file_path)
# Per-label predictions for the less_toxic (p1) and more_toxic (p2) side of each pair.
p1 = model.predict(x=val_lt)
p2 = model.predict(x=val_mt)

In [None]:
# Weight vector used to combine the per-label predictions into one score.
f = np.array(label_weights)
f

Création du score global

In [None]:
# Global score of every less_toxic comment: weighted sum of its label predictions.
# The row-wise reduction is done by NumPy instead of a Python-level loop over rows.
f1 = (f * p1).sum(axis=1)
f1

In [None]:
# Global score of every more_toxic comment: weighted sum of its label predictions.
# The row-wise reduction is done by NumPy instead of a Python-level loop over rows.
f2 = (f * p2).sum(axis=1)
f2

Pourcentage de paires pour lesquelles le commentaire `less_toxic` obtient bien un score inférieur à celui du commentaire `more_toxic`

In [None]:
# Share of pairs where the less_toxic comment gets a strictly lower score than the more_toxic one.
np.mean(f1 < f2)

# Submission

In [None]:
# Load the comments to score and run them through the same cleaning, tokenizing and padding steps as the training text.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# Bracket access is used for the column assignment: it is the explicit, unambiguous form,
# unlike attribute assignment on a DataFrame.
sub['text'] = sub['text'].apply(text_cleaning)

list_tokenized_sub = tokenizer.texts_to_sequences(sub['text'])
sub_pad = pad_sequences(list_tokenized_sub, maxlen=maxlen)

# Score = weighted sum of the predicted label values, computed as one vectorized
# NumPy reduction instead of a Python-level loop over rows.
p = model.predict(sub_pad)
f = np.array(label_weights)
sub['score'] = (f * p).sum(axis=1)
sub

In [None]:
# Write the submission file: one row per comment with its id and score, without the index column.
sub.loc[:, ['comment_id', 'score']].to_csv(path_or_buf='submission.csv', index=False)