In [1]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import re
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from tensorflow.keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
import os
os.environ['OMP_NUM_THREADS'] = '4'

2023-05-17 16:59:50.522883: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
EMBEDDING_FILE = './glove.twitter.27B.200d.txt' # there one of embedding files, for  txt format

train = pd.read_csv('./data_2/train.csv')
test = pd.read_csv('./data_2/test.csv')
submission = pd.read_csv('./data_2/sample_submission.csv')

In [3]:
repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

In [4]:
repl_keys = [i for i in repl.keys()]

In [5]:
new_train_data = []
new_test_data = []
train_list = train["comment_text"].tolist()
test_list = test["comment_text"].tolist()

In [None]:
# cleaning data

In [6]:
# cleaning for train

for item in train_list:
    arr = str(item).split()
    xx = ""
    for elem in arr:
        elem = str(elem).lower()
        if elem[:4] == 'http' or elem[:3] == 'www':
            continue
        if elem in repl_keys:
            elem = repl[elem]
        xx += elem + " "
    new_train_data.append(xx)

In [7]:
# cleaning for test

for item in test_list:
    arr = str(item).split()
    xx = ""
    for elem in arr:
        elem = str(elem).lower()
        if elem[:4] == 'http' or elem[:3] == 'www':
            continue
        if elem in repl_keys:
            elem = repl[elem]
        xx += elem + " "
    new_test_data.append(xx)

In [8]:
train["new_comment_text"] = new_train_data
test["new_comment_text"] = new_test_data

print("cleaned")

train_clean_list = train["new_comment_text"].tolist()
test_clean_list = test["new_comment_text"].tolist()

crap removed


In [None]:
# cleaning continue

In [9]:
for item, c in enumerate(train_clean_list):
    train_clean_list[item] = re.sub('[^a-zA-Z ?!]+', '', str(train_clean_list[item]).lower())

In [10]:
for item, c in enumerate(test_clean_list):
    test_clean_list[item] = re.sub('[^a-zA-Z ?!]+', '', test_clean_list[item])

In [11]:
train["comment_text"] = train_clean_list
test["comment_text"] = test_clean_list

print('now only alphabets')

only alphabets


In [12]:
X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

In [13]:
max_features = 30000
max_len = 100
emb_size = 200

In [14]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(X_train, maxlen=max_len)
x_test = pad_sequences(X_test, maxlen=max_len)

In [16]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [None]:
# if embedding file have some strings anywhere

# def get_coefs(word, *arr):
#     float_arr = []
#     for a in arr:
#         try:
#             float_arr.append(float(a))
#         except ValueError:
#             float_arr.append(0.0)
#     return word, np.asarray(float_arr, dtype='float32')

# embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [17]:
all_embs = np.hstack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_size))
for word, item in word_index.items():
    if item >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[item] = embedding_vector


In [None]:
# if embedding file have some strings anywhere

# all_embs = np.hstack(embeddings_index.values())
# emb_mean, emb_std = all_embs.mean(), all_embs.std()

# word_index = tokenizer.word_index
# nb_words = min(max_features, len(word_index))
# embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# for word, i in word_index.items():
#     if i >= max_features:
#         continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # Check if embedding vector has the correct dimensionality
#         if len(embedding_vector) != embed_size:
#             # If embedding vector has more than `embed_size` dimensions, truncate it
#             if len(embedding_vector) > embed_size:
#                 embedding_vector = embedding_vector[:embed_size]
#             # If embedding vector has fewer than `embed_size` dimensions, pad it with zeros
#             else:
#                 embedding_vector = np.pad(embedding_vector, (0, embed_size - len(embedding_vector)), 'constant')
#         embedding_matrix[i] = embedding_vector

In [18]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))

In [19]:
def get_model():
    inp = Input(shape=(max_len,))
    x = Embedding(max_features, emb_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [20]:
model = get_model()

2023-05-17 17:00:43.272205: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 17:00:43.284027: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 17:00:43.284078: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 17:00:43.285997: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 17:00:43.286044: I tensorflow/compile

In [21]:
batch_size = 32
epochs = 2 # give us best result

In [22]:
# in results were different values in train_size, sometimes ~ 0.8, 0.75, 0.95, 0.7
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=123)
roc_auc_eval = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [23]:
model_fit = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                      callbacks=[roc_auc_eval], verbose=2)

Epoch 1/2


2023-05-17 17:00:44.391544: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 17:00:44.392584: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 17:00:44.393301: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-17 17:02:13.969734: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 17:02:13.970840: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 17:02:13.971689: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


 ROC-AUC - epoch: 1 - score: 0.987501 

4488/4488 - 92s - loss: 0.0516 - accuracy: 0.9583 - val_loss: 0.0408 - val_accuracy: 0.9458 - 92s/epoch - 20ms/step
Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.987889 

4488/4488 - 53s - loss: 0.0398 - accuracy: 0.9331 - val_loss: 0.0397 - val_accuracy: 0.9895 - 53s/epoch - 12ms/step


In [24]:
y_pred = model.predict(x_test, batch_size=1024)



In [25]:
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission_glove_twitter_27B_200d.csv', index=False)