In [1]:
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import os, numpy as np, pandas as pd
np.random.seed(32)
os.environ["OMP_NUM_THREADS"] = "4"


from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Input,Embedding
from keras.layers import Bidirectional
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import GRU, Conv1D

2023-05-17 15:37:19.355184: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# import tensorflow as tf

# gpus = tf.config.list_physical_devices('GPU')
# if gpus:
#   try:
#     # Currently, memory growth needs to be the same across GPUs
#     for gpu in gpus:
#       tf.config.experimental.set_memory_growth(gpu, True)
#     logical_gpus = tf.config.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#   except RuntimeError as e:
#     # Memory growth must be set before GPUs have been initialized
#     print(e)

In [3]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC while training.

    Parameters
    ----------
    validation_data : tuple
        A ``(X_val, y_val)`` pair. The empty-tuple default is kept for
        backward compatibility, but it cannot be unpacked, so constructing
        the callback without real data raises ``ValueError``.
    interval : int
        The score is computed on every epoch whose zero-based index is a
        multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # super() must start the lookup from this class; the previous
        # super(Callback, self) skipped Callback.__init__ entirely.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print the ROC-AUC for this epoch.

        ``logs`` is accepted for compatibility with the callback protocol and
        is not read; ``None`` replaces the former shared mutable ``{}`` default.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based, so report it one-based like Keras' own log
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [4]:
# Pre-trained word vectors in textual .vec format; swap in another .vec file
# here to try a different embedding.
EMBEDDING_FILE = "wiki.en.align.vec"

# Raw train / test tables, read relative to the notebook's working directory.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [None]:
# Tokenisation / embedding hyper-parameters shared by the cells below.
max_len = 150            # every comment is padded / truncated to this many tokens
max_features = 100_000   # vocabulary cap handed to the tokenizer
embed_size = 300         # width of each embedding vector

In [5]:
# The six binary label columns that the model predicts.
columns_toxic = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Series.fillna returns a new Series, so the result has to be assigned back;
# the bare calls used previously left missing comments untouched.
train["comment_text"] = train["comment_text"].fillna("no comment")
test["comment_text"] = test["comment_text"].fillna("no comment")

train_values = train[columns_toxic].values

# Hold out 10% of the training rows for validation. No random_state is passed,
# so the split draws from NumPy's global RNG (seeded in the first cell).
X_train, X_valid, Y_train, Y_valid = train_test_split(train, train_values, test_size = 0.1)

In [6]:
# Lower-cased comment text for each split.
raw_text_train = X_train["comment_text"].str.lower()
raw_text_valid = X_valid["comment_text"].str.lower()
raw_text_test = test["comment_text"].str.lower()

# The vocabulary is fitted on the training split only, so validation and test
# text never influence the word indices.
tokenizing = Tokenizer(num_words = max_features, lower = True)
tokenizing.fit_on_texts(raw_text_train)

# Convert each split straight into a padded matrix of max_len token ids.
# The earlier version first stored the ragged token lists in a "comment_seq"
# column of frames that were discarded right afterwards; skipping that step
# gives the same arrays without mutating the DataFrames.
# NOTE: X_train, X_valid and test are rebound from DataFrames to arrays here,
# so this cell can only be run once per kernel session.
X_train = pad_sequences(tokenizing.texts_to_sequences(raw_text_train), maxlen = max_len)
X_valid = pad_sequences(tokenizing.texts_to_sequences(raw_text_valid), maxlen = max_len)
test = pad_sequences(tokenizing.texts_to_sequences(raw_text_test), maxlen = max_len)

In [7]:
def get_coefs(word, *arr):
    """Pair a token with its embedding coefficients.

    Parameters
    ----------
    word : str
        The token, i.e. the first field of an embedding-file line.
    *arr : str
        The remaining fields of the line, one numeric string per dimension.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a float32 NumPy array built
        from ``arr``.
    """
    vector = np.asarray(arr, dtype="float32")
    return word, vector

# Parse the embedding file into {word: float32 vector}.
# - The file is opened in a `with` block so the handle is closed, and the
#   encoding is explicit instead of depending on the platform default.
# - Lines whose vector is not exactly embed_size long are skipped. This drops
#   a "<count> <dim>" header line (if the file has one) and any malformed row,
#   which would otherwise be stored as a bogus short vector.
embedding_index = {}
with open(EMBEDDING_FILE, encoding="utf-8") as embedding_file:
    for line in embedding_file:
        word, coefs = get_coefs(*line.strip().split(" "))
        if coefs.shape[0] != embed_size:
            continue
        embedding_index[word] = coefs

In [8]:
# Build the weight matrix for the frozen Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i.
word_index = tokenizing.word_index

# Tokenizer indices start at 1 (0 is the padding id), so a vocabulary of N
# words needs N + 1 rows. The previous min(max_features, len(word_index))
# was one row short whenever the vocabulary was smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Always allocate max_features rows so the shape matches
# Embedding(max_features, embed_size); rows with no pre-trained vector
# (padding, out-of-vocabulary words, unused indices) stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))

for word, i in word_index.items():
    if i >= nb_words:
        # the tokenizer never emits indices >= max_features
        continue
    embedding_vector = embedding_index.get(word)
    # Only accept vectors of the right width: a shorter array (e.g. one parsed
    # from a header line) would otherwise be broadcast across the whole row.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector

In [9]:
# Where the best checkpoint (lowest validation loss) is written during training.
file_path_for_model = "best_model.hdf5"

# Save the model only when val_loss improves on the best value seen so far.
check_point = ModelCheckpoint(
    file_path_for_model,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Print the validation ROC-AUC after every epoch.
roc_auc_eval = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval=1)

# Stop once val_loss has failed to improve for 5 consecutive epochs.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=5)

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    """Build, train and reload the pooled BiGRU + Conv1D classifier.

    Despite its name the function also fits the network on the module-level
    ``X_train`` / ``Y_train`` (validating on ``X_valid`` / ``Y_valid``) and
    returns the checkpoint stored at ``file_path_for_model`` rather than the
    in-memory model from the last epoch.

    Parameters
    ----------
    lr : float
        Learning rate for Adam.
    lr_d : float
        Learning-rate decay for Adam; only forwarded when non-zero.
    units : int
        Number of GRU units per direction.
    dr : float
        Rate for the spatial dropout applied to the embeddings.

    Returns
    -------
    The Keras model loaded from ``file_path_for_model``.
    """
    inp = Input(shape = (max_len,))
    # Frozen pre-trained embeddings
    x = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(units, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    # Summarise the sequence with both average and max pooling
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])

    # One independent sigmoid output per label
    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)

    # `lr` is a deprecated alias of `learning_rate`, and newer Keras optimizers
    # no longer accept `decay`. Use the current argument name and pass `decay`
    # only when a non-zero value was actually requested.
    optimizer_kwargs = {"learning_rate": lr}
    if lr_d:
        optimizer_kwargs["decay"] = lr_d
    model.compile(loss = "binary_crossentropy", optimizer = Adam(**optimizer_kwargs), metrics = ["accuracy"])

    model.fit(X_train, Y_train, batch_size = 32, epochs = 4, validation_data = (X_valid, Y_valid),
              verbose = 1, callbacks = [roc_auc_eval, check_point, early_stop])

    # Reload the checkpoint written by check_point
    return load_model(file_path_for_model)

In [10]:
# Train with the chosen hyper-parameters; build_model returns the reloaded checkpoint.
model = build_model(
    lr=1e-3,
    lr_d=0,
    units=128,
    dr=0.2,
)

# Predict label probabilities for the padded test matrix in large batches.
pred = model.predict(test, batch_size=1024, verbose=1)

2023-05-17 15:38:55.396048: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 15:38:55.433788: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 15:38:55.433845: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 15:38:55.434913: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-17 15:38:55.434968: I tensorflow/compile

Epoch 1/4


2023-05-17 15:38:59.365728: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 15:38:59.366576: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 15:38:59.367286: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-17 15:39:00.663097: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-05-17 15:39:01.576137: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-05-17 15:39:02.700093: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




2023-05-17 15:40:03.252591: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 15:40:03.253551: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 15:40:03.254291: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


 ROC-AUC - epoch: 1 - score: 0.986009

Epoch 1: val_loss improved from inf to 0.04526, saving model to best_model.hdf5
Epoch 2/4
   5/4488 [..............................] - ETA: 1:00 - loss: 0.0337 - accuracy: 0.9937

2023-05-17 15:40:09.314415: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 120000000 exceeds 10% of free system memory.


 ROC-AUC - epoch: 2 - score: 0.988038

Epoch 2: val_loss improved from 0.04526 to 0.04243, saving model to best_model.hdf5
Epoch 3/4
   5/4488 [..............................] - ETA: 1:00 - loss: 0.0401 - accuracy: 0.9875

2023-05-17 15:41:14.544083: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 120000000 exceeds 10% of free system memory.


 ROC-AUC - epoch: 3 - score: 0.989051

Epoch 3: val_loss improved from 0.04243 to 0.04204, saving model to best_model.hdf5
Epoch 4/4
   5/4488 [..............................] - ETA: 1:00 - loss: 0.0483 - accuracy: 0.9875

2023-05-17 15:42:19.533343: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 120000000 exceeds 10% of free system memory.


 ROC-AUC - epoch: 4 - score: 0.989214

Epoch 4: val_loss improved from 0.04204 to 0.04137, saving model to best_model.hdf5


2023-05-17 15:43:24.578679: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-17 15:43:24.579621: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-17 15:43:24.580411: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [None]:
# Start from the sample submission file and overwrite the six label columns
# with the model's predictions.
submission = pd.read_csv("../data/sample_submission.csv")
submission[columns_toxic] = pred
submission.to_csv("submission_wiki_en_align_vec.csv", index=False)

# Wall-clock seconds since the first cell was run.
elapsed_seconds = time.time() - start_time
print("[{}] Completed!".format(elapsed_seconds))

In [None]:
# Old experiments below (kept for reference only)

In [11]:
# submission = pd.read_csv("../data/sample_submission.csv")
# submission[list_classes] = (pred)
# submission.to_csv("submission_wiki_en_align_vec.csv", index = False)
# print("[{}] Completed!".format(time.time() - start_time))

[372.88526701927185] Completed!


In [65]:
# gpu_submissions = pd.read_csv('submission-gpu.csv')
# gpu_submissions.drop(['id'],axis=1,inplace=True)
# orchestra_submissions = pd.read_csv('submission_9_orchestra.csv')
# orchestra_submissions.drop(['id'],axis=1,inplace=True)
# pooling_gru = pd.read_csv('submission_pooled_gru.csv')
# pooling_gru.drop(['id'],axis=1,inplace=True)
# xgboostdata = pd.read_csv('submission_9.csv')
# xgboostdata.drop(['id'],axis=1,inplace=True)
# catboostdata = pd.read_csv('submission_9_cat.csv')
# catboostdata.drop(['id'],axis=1,inplace=True)
# kerasdata = pd.read_csv('subm.csv')
# kerasdata.drop(['id'],axis=1,inplace=True)

In [66]:
# total = 0.7 * gpu_submissions + 0.3 * orchestra_submissions

# total = 0.33 * gpu_submissions + 0.33 * pooling_gru + 0.33 * orchestra_submissions
# total = pooling_gru + gpu_submissions

In [67]:
# columns_toxic = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
#
# submission_samples = pd.read_csv('../data/sample_submission.csv')
# sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
# submission_output = pd.concat([sample_submission_id, pd.DataFrame(total, columns = columns_toxic)], axis=1)
#
# submission_output.to_csv('./submission_total_5.csv', index=False)