In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from profanity_check import predict_prob
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.utils import resample

2023-05-16 18:48:23.942436: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
stop_words_list = set(stopwords.words('english'))

In [3]:
train_data_or = pd.read_csv('./data/train.csv')
train_data_or.head()
test_data = pd.read_csv('./data/test.csv')
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \r\n\r\n The title is fine as i...
2,00013b17ad220c46,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [4]:
def prepare_data_set(data_set):
    data_set = data_set.fillna("unknown")
    # data_set['comment_text'] = \
    #     data_set['comment_text'].apply(preprocess_text)
    data_set['total_length'] = \
        data_set['comment_text'].apply(len)
    data_set['is_upper'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(1 for sen in comment if sen.isupper()))
    data_set['is_exclamation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('!'))
    data_set['is_question'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('?'))
    data_set['sum_of_punctuation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '.,;:-!?"'))
    data_set['sum_of_another_symbols'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '*&$%_^#@()+/\\\r\n'))
    data_set['sum_of_words'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(comment.split())
        )
    data_set['count_if_unique'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(set(word for word in comment.split()))
        )
    data_set['diff_sums_unique_and_words'] = \
        data_set['count_if_unique'] / data_set['sum_of_words']

    data_set['if_positive'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in (':-)', ':)', ';-)', ';)')))

    data_set['obscene_probability'] = \
        data_set['comment_text'].apply(
            lambda comment:
                predict_prob([comment])[0]
        )

    return data_set

In [5]:
train_data_or = prepare_data_set(train_data_or)
train_data_or.to_csv('./submissions/train_data_or_prepared.csv', index=False)
test_data = prepare_data_set(test_data)
test_data.to_csv('./submissions/test_data_prepared.csv', index=False)

In [6]:
train_data_or.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,265,17,0,1,7,2,43,41,0.953488,0,0.007771
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,8,1,0,6,4,17,17,1.0,0,0.016528
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,4,0,0,4,0,42,39,0.928571,0,0.086265
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,626,11,0,0,16,11,113,82,0.725664,0,0.001756
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,2,0,1,4,0,13,13,1.0,0,0.029764
5,00025465d4725e87,"""\r\n\r\nCongratulations from me as well, use ...",0,0,0,0,0,0,67,1,0,0,4,4,13,12,0.923077,0,0.032015
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,44,37,0,0,0,0,8,8,1.0,0,0.997755
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,115,4,0,0,3,0,20,20,1.0,0,0.120069
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,472,7,0,1,10,2,83,70,0.843373,0,0.086447
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,70,2,0,0,0,0,12,12,1.0,0,0.012096


In [7]:
test_data.head(10)

Unnamed: 0,id,comment_text,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,367,4,0,0,11,0,72,61,0.847222,0,0.999928
1,0000247867823ef7,== From RfC == \r\n\r\n The title is fine as i...,52,7,0,0,2,4,12,11,0.916667,0,0.006622
2,00013b17ad220c46,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto...",58,4,0,0,2,10,12,10,0.833333,0,0.014215
3,00017563c3f7919a,":If you have a look back at the source, the in...",205,4,0,0,5,0,38,30,0.789474,0,0.004834
4,00017695ad8997eb,I don't anonymously edit articles at all.,41,1,0,0,1,0,7,7,1.0,0,0.006948
5,0001ea8717f6de06,Thank you for understanding. I think very high...,96,2,0,0,2,0,16,15,0.9375,0,0.01439
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,176,5,0,0,6,0,29,28,0.965517,0,0.004765
7,000247e83dcc1211,:Dear god this site is horrible.,32,1,0,0,2,0,6,6,1.0,0,0.657411
8,00025358d4737918,""" \r\n Only a fool can believe in such numbers...",566,42,0,0,19,23,100,77,0.77,0,0.003289
9,00026d1092fe71cc,== Double Redirects == \r\n\r\n When fixing do...,226,7,0,0,6,4,40,33,0.825,0,0.006296


In [8]:
train_data_or.drop(['id'], axis=1, inplace=True)
test_data.drop(['id'], axis=1, inplace=True)
x = train_data_or.iloc[:,2:].sum()

In [9]:
train_data_or.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,265,17,0,1,7,2,43,41,0.953488,0,0.007771
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,8,1,0,6,4,17,17,1.0,0,0.016528
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,4,0,0,4,0,42,39,0.928571,0,0.086265
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,626,11,0,0,16,11,113,82,0.725664,0,0.001756
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,2,0,1,4,0,13,13,1.0,0,0.029764


In [10]:
test_data.head()

Unnamed: 0,comment_text,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability
0,Yo bitch Ja Rule is more succesful then you'll...,367,4,0,0,11,0,72,61,0.847222,0,0.999928
1,== From RfC == \r\n\r\n The title is fine as i...,52,7,0,0,2,4,12,11,0.916667,0,0.006622
2,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto...",58,4,0,0,2,10,12,10,0.833333,0,0.014215
3,":If you have a look back at the source, the in...",205,4,0,0,5,0,38,30,0.789474,0,0.004834
4,I don't anonymously edit articles at all.,41,1,0,0,1,0,7,7,1.0,0,0.006948


In [11]:
x.head()

severe_toxic     1595.0
obscene          8449.0
threat            478.0
insult           7877.0
identity_hate    1405.0
dtype: float64

In [12]:
#marking comments without any tags as "clean"
rowsums = train_data_or.iloc[:, 2:7].sum(axis=1)
train_data_or['clean'] = (rowsums == 0)

In [13]:
rowsums.head()

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [14]:
train_data_or.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability,clean
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,265,17,0,1,7,2,43,41,0.953488,0,0.007771,True
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,8,1,0,6,4,17,17,1.0,0,0.016528,True
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,4,0,0,4,0,42,39,0.928571,0,0.086265,True
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,626,11,0,0,16,11,113,82,0.725664,0,0.001756,True
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,2,0,1,4,0,13,13,1.0,0,0.029764,True


In [15]:
#count number of clean entries
train_data_or['clean'].sum()
print("Total comments = ", len(train_data_or))
print("Total clean comments = ", train_data_or['clean'].sum())
print("Total tags =", x.sum())

Total comments =  159571
Total clean comments =  149012
Total tags = 87818851.39726342


In [16]:
df_majority = train_data_or[train_data_or.clean==True]
df_minority = train_data_or[train_data_or.clean==False]

In [17]:
df_majority.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability,clean
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,265,17,0,1,7,2,43,41,0.953488,0,0.007771,True
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,8,1,0,6,4,17,17,1.0,0,0.016528,True
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,4,0,0,4,0,42,39,0.928571,0,0.086265,True
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,626,11,0,0,16,11,113,82,0.725664,0,0.001756,True
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,2,0,1,4,0,13,13,1.0,0,0.029764,True


In [18]:
df_minority.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability,clean
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,44,37,0,0,0,0,8,8,1.0,0,0.997755,False
42,You are gay or antisemmitian? \r\n\r\nArchange...,1,0,1,0,1,1,662,22,7,1,23,31,117,88,0.752137,0,0.992782,False
43,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0,40,31,1,0,2,0,8,8,1.0,0,0.999998,False
51,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0,124,89,1,3,7,1,25,17,0.68,0,0.999995,False
55,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0,89,1,1,0,1,0,19,18,0.947368,0,1.0,False


In [19]:
# print(df_majority.head(10))
# print(row_sums.head(10))
train_data_or.iloc[:,2:].head()

Unnamed: 0,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability,clean
0,0,0,0,0,0,265,17,0,1,7,2,43,41,0.953488,0,0.007771,True
1,0,0,0,0,0,112,8,1,0,6,4,17,17,1.0,0,0.016528,True
2,0,0,0,0,0,233,4,0,0,4,0,42,39,0.928571,0,0.086265,True
3,0,0,0,0,0,626,11,0,0,16,11,113,82,0.725664,0,0.001756,True
4,0,0,0,0,0,67,2,0,1,4,0,13,13,1.0,0,0.029764,True


In [20]:
df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=10000,
                                  random_state=123)

In [21]:
print('resampled')
train_data = pd.concat([df_majority_downsampled,df_minority])
print('concatenated')
print(f'shape: {train_data.shape}')

resampled
concatenated
shape: (20559, 19)


In [22]:
def get_comment_type(row):
     for c in train_data.iloc[:,1:]:
        if row[c]==1:
            return c


In [23]:
print('making comment types')
comment_type = train_data.apply(get_comment_type, axis=1)
train_data['comment_type'] = comment_type
print('made comment types')
train_data = train_data.fillna(value=np.nan)
train_data = train_data.fillna(value='safe')
print('filled N/As')

making comment types
made comment types
filled N/As


In [24]:
train_data.shape

(20559, 20)

In [25]:
train_data.to_csv('./submissions/keras_train_data.csv')

In [26]:
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

In [27]:
def clean_text(text):

    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text) # clean url
    text = re.sub(r'#(\w+)', '', text)   # clean hashes
    text = re.sub(r'@(\w+)', '', text)   # clean @
    text = re.sub(r'<[^>]+>', '', text)  # clean tags
    text = re.sub(r'\d+', '', text)      # clean digits
    text = re.sub(r'[,!@\'\"?\.$%_&#*+-:;]', '', text)   # clean punctuation
    text = [APPO[word] if word in APPO else word for word in text.split()] #

    return text

In [28]:
# train_set = pd.read_csv("train_set_prepared.csv")
# test_set = pd.read_csv("test_set_prepared.csv")

In [29]:
train_data['comment_text'] = train_data['comment_text'].apply(clean_text)
test_data['comment_text'] = test_data['comment_text'].apply(clean_text)

In [30]:
train_data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability,clean,comment_type
49150,"[relations, do, you, think, the, federal, repu...",0,0,0,0,0,0,82,5,0,1,1,2,13,13,1.0,0,0.055355,True,is_question
65253,"[it, was, you, who, initiated, this, request, ...",0,0,0,0,0,0,367,13,0,1,5,2,63,57,0.904762,0,0.002695,True,is_question
43028,"[those, links, are, dead, but, this, appears, ...",0,0,0,0,0,0,107,5,0,1,7,3,13,13,1.0,0,0.026198,True,is_question
122949,"[you, dont, give, a, reason, for, why, this, s...",0,0,0,0,0,0,130,2,0,0,2,0,21,20,0.952381,0,0.024338,True,clean
68472,"[john, of, england, changing, spelling, from, ...",0,0,0,0,0,0,186,12,0,0,4,10,21,20,0.952381,0,0.009368,True,clean


In [31]:
test_data.head()

Unnamed: 0,comment_text,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive,obscene_probability
0,"[yo, bitch, ja, rule, is, more, succesful, the...",367,4,0,0,11,0,72,61,0.847222,0,0.999928
1,"[==, from, rfc, ==, the, title, is, fine, as, ...",52,7,0,0,2,4,12,11,0.916667,0,0.006622
2,"[==, sources, ==, zawe, ashton, on, lapland, —]",58,4,0,0,2,10,12,10,0.833333,0,0.014215
3,"[if, you, have, a, look, back, at, the, source...",205,4,0,0,5,0,38,30,0.789474,0,0.004834
4,"[i, dont, anonymously, edit, articles, at, all]",41,1,0,0,1,0,7,7,1.0,0,0.006948


In [34]:
tokenizing = Tokenizer(num_words = 100000, oov_token='<oov>')
tokenizing.fit_on_texts(train_data.comment_text)

In [37]:
# numeric_features = ['total_length', 'is_upper', 'is_exclamation', 'is_question',
#                     'sum_of_punctuation', 'sum_of_another_symbols', 'sum_of_words',
#                     'count_if_unique', 'diff_sums_unique_and_words', 'if_positive',
#                     'obscene_probability']

In [40]:
train_x = train_data.iloc[:,0]
train_y = train_data.iloc[:,1:7]

train_y = np.array(train_y)

In [42]:
train_x, val_x, train_y, val_y = train_test_split(train_x,train_y, test_size=0.2, random_state=1)

In [46]:
training_sequences = tokenizing.texts_to_sequences(train_x)

max_len = max([len(x) for x in np.array(training_sequences, dtype="object")])

training_padded = pad_sequences(training_sequences, maxlen = max_len,
                                padding = 'pre',
                                truncating='pre')

In [47]:
validation_sequences = tokenizing.texts_to_sequences(val_x)

validation_padded = pad_sequences(validation_sequences, maxlen = max_len,
                                  padding = 'pre',
                                  truncating='pre')

In [48]:
vocab_size = len(tokenizing.word_index) + 1
vocab_size

57059

In [49]:
embeddings_index = {}

EMBEDDING_FILE = open('/embeddings/glove.twitter.27B.200d.txt', 'r', encoding='utf-8')
# glovefile = open('glove.6B.200d.txt','r',encoding='utf-8')

for elem in tqdm(EMBEDDING_FILE):
    values = elem.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
EMBEDDING_FILE.close()

print('Found %s word vectors.' % len(embeddings_index))

1193514it [00:25, 47707.64it/s]

Found 1193514 word vectors.





In [50]:
embedding_matrix = np.zeros((len(tokenizing.word_index) + 1, 200))

for words, index in tqdm(tokenizing.word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

100%|████████████████████████████████████████████████████████████████████████| 57058/57058 [00:00<00:00, 1342924.79it/s]


In [51]:
model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = 200, weights = [embedding_matrix], input_length = max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])

2023-05-16 18:56:17.274854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-16 18:56:17.285853: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-16 18:56:17.285903: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-16 18:56:17.288522: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-16 18:56:17.288564: I tensorflow/compile

In [52]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1403, 200)         11411800  
                                                                 
 bidirectional (Bidirectiona  (None, 1403, 256)        336896    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 1401, 128)         98432     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dropout (Dropout)           (None, 32)                0

In [56]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(training_padded, train_y, epochs = 2, validation_data=(validation_padded, val_y), batch_size = 32)

Epoch 1/2


2023-05-16 18:56:48.696298: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:56:48.697282: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:56:48.697972: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-16 18:58:06.890683: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:58:06.891596: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:58:06.892254: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x7f528c1bbd90>

In [57]:
model.save('keras_2.model')

2023-05-16 18:59:23.815517: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:23.816399: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:59:23.817325: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-16 18:59:24.293256: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,200]
	 [[{{node inputs}}]]
2023-05-16 18:59:24.377630: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:24.378450: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split

2023-05-16 18:59:25.069792: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:25.070841: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:59:25.071554: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-16 18:59:25.854633: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-05-16 18:59:25.877801: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:25.878507: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message):

2023-05-16 18:59:26.379869: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:26.380779: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:59:26.381548: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-16 18:59:26.884051: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:26.884900: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:59:26.885804: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

INFO:tensorflow:Assets written to: keras_2.model/assets


INFO:tensorflow:Assets written to: keras_2.model/assets


In [58]:
predicted = model.predict(validation_padded)
labels = (predicted > 0.5).astype(int)

2023-05-16 18:59:34.720342: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-16 18:59:34.721428: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-16 18:59:34.722109: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [59]:
testing_sequences = tokenizing.texts_to_sequences(test_data.comment_text)
test_padded = pad_sequences(testing_sequences, maxlen = max_len,
                            padding = 'pre',
                            truncating='pre')

In [60]:
predicted = model.predict(test_padded, batch_size = 200)



In [62]:
predict = np.hstack((np.array(test_data), predicted))

In [86]:
# there were an experiment with numeric features, but for result were deleted numeric columns, just because
pred_processed = np.delete(predict, list(range(12)), 1)

[0.9834794998168945 0.37203365564346313 0.9420222043991089
 0.14678557217121124 0.8873569965362549 0.28381863236427307]


In [88]:
columns_toxic = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

submission_samples = pd.read_csv('../data/sample_submission.csv')
sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
submission_output = pd.concat([sample_submission_id, pd.DataFrame(pred_processed, columns = columns_toxic)], axis=1)

submission_output.to_csv('submission_xgboost_toxic_cat_xgb_glovestm.csv', index=False)