In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import re, string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from profanity_check import predict_prob
import catboost as cb
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [None]:
nltk.download('stopwords')
# vulgar_words = []
# with open('./vulgar', 'r') as vulgars:
#     for line in vulgars:
#         vulgar_words.append(vulgars.readline().strip())

In [None]:
stop_words_list = set(stopwords.words('english'))

In [None]:
sample_submission_set = pd.read_csv("../data/sample_submission.csv")
train_set = pd.read_csv("../data/train.csv")
test_set = pd.read_csv("../data/test.csv")

In [None]:
test_set.head(10)

In [None]:
train_set.head(10)

In [None]:
def prepare_data_set(data_set):
    data_set = data_set.fillna("unknown")
    # data_set['comment_text'] = \
    #     data_set['comment_text'].apply(preprocess_text)
    data_set['total_length'] = \
        data_set['comment_text'].apply(len)
    data_set['is_upper'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(1 for sen in comment if sen.isupper()))
    data_set['is_exclamation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('!'))
    data_set['is_question'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('?'))
    data_set['sum_of_punctuation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '.,;:-!?"'))
    data_set['sum_of_another_symbols'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '*&$%_^#@()+/\\\r\n'))
    data_set['sum_of_words'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(comment.split())
        )
    data_set['count_if_unique'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(set(word for word in comment.split()))
        )
    data_set['diff_sums_unique_and_words'] = \
        data_set['count_if_unique'] / data_set['sum_of_words']

    data_set['if_positive'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in (':-)', ':)', ';-)', ';)')))

    data_set['obscene_probability'] = \
        data_set['comment_text'].apply(
            lambda comment:
                predict_prob([comment])[0]
        )

    return data_set

In [None]:
def preprocess_text(sen):
    # sentence = sen.lower()
    # sentence = re.sub('\[.*?\]', '', sen)
    # sentence = re.sub('https?://\S+|www\.\S+', '', sentence)
    # sentence = re.sub('<.*?>+', '', sentence)
    # sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)
    # sentence = re.sub('\n', '', sentence)
    # sentence = re.sub('\w*\d\w*', '', sentence)
    # Удаление символов пунктуации и специальных символов
    # sentence = re.sub('[^a-zA-Z0-9]', ' ', sentence)
    # replace anything non-word
    sen = re.sub(r'((\\n)|(\\r))+', ' ', sen)
    # sen = " ".join(re.findall(r'\w+', sen))
    # Заменяем любые пробелы на один обычный
    sen = re.sub(r'\s+', ' ', sen)
    # Удаляем стоп-слова
    sen = " ".join([word for word in sen.split() if word not in stop_words_list])
    return sen

In [None]:
train_set = prepare_data_set(train_set)
train_set.to_csv('train_set_prepared.csv', index=False)
test_set = prepare_data_set(test_set)
test_set.to_csv('test_set_prepared.csv', index=False)

In [None]:
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

In [None]:
def clean_text(text):

    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text) # clean url
    text = re.sub(r'#(\w+)', '', text)   # clean hashes
    text = re.sub(r'@(\w+)', '', text)   # clean @
    text = re.sub(r'<[^>]+>', '', text)  # clean tags
    text = re.sub(r'\d+', '', text)      # clean digits
    text = re.sub(r'[,!@\'\"?\.$%_&#*+-:;]', '', text)   # clean punctuation
    text = " ".join([APPO[word] if word in APPO else word for word in text.split()])  #

    return text

In [None]:
#load prepared data
train_set = pd.read_csv("train_set_prepared.csv")
test_set = pd.read_csv("test_set_prepared.csv")

In [None]:
train_set['comment_text'] = train_set['comment_text'].apply(clean_text)
test_set['comment_text'] = test_set['comment_text'].apply(clean_text)


In [None]:
# test_set.head(10)
# # test_set.head(10)

In [None]:
# train_set['comment_text'].head(100)

In [None]:
token_template = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenizer_template(s):
    return token_template.sub(r' \1 ', s).split()

In [None]:
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2),
                              tokenizer=tokenizer_template,
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1)),
])

In [None]:
numeric_features = ['total_length', 'is_upper', 'is_exclamation', 'is_question',
                    'sum_of_punctuation', 'sum_of_another_symbols', 'sum_of_words',
                    'count_if_unique', 'diff_sums_unique_and_words', 'if_positive',
                    'obscene_probability']

In [None]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'comment_text'),
        ('num', numeric_transformer, numeric_features),
    ])

In [None]:
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_set,
    train_set[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.2,
    random_state=20)



In [None]:
preprocessor.fit(train_X)

In [None]:
tf_idf_train = preprocessor.transform(train_X)
tf_idf_valid = preprocessor.transform(valid_X)
tf_idf_test = preprocessor.transform(test_set)

In [None]:
columns_toxic = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions = np.zeros((test_set.shape[0], len(columns_toxic)))

In [None]:
clf = cb.CatBoostClassifier(iterations=100,
                            learning_rate=0.3,
                            depth=6,
                            eval_metric='AUC',
                            random_seed=42,
                            bagging_temperature=0.2,
                            od_type='Iter',
                            metric_period=50,
                            od_wait=20
                           )
predictions_cat = np.zeros((test_set.shape[0], len(columns_toxic)))
for i, label in enumerate(columns_toxic):
    print('fitting column:' + label)
    clf.fit(tf_idf_train, train_Y[label], verbose=True)
    predictions_cat[:, i] = clf.predict_proba(tf_idf_test)[:, 1]

In [None]:
submission_samples = pd.read_csv('../data/sample_submission.csv')
sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
submission = pd.concat([sample_submission_id, pd.DataFrame(predictions_cat, columns=columns_toxic)], axis=1)
submission.to_csv('./submissions/submission_9_cat.csv', index=False)

In [None]:
# def trainByXGBoost(train_X, train_y, test_X, test_y=None,
#                    feature_names=None,
#                    seed_val=2017,
#                    num_rounds=1200):
#     param = {
#         'objective': 'binary:logistic',
#         'eta': 0.1,
#         'max_depth': 6,
#         'silent': 1,
#         'eval_metric': 'auc',
#         'min_child_weight': 1,
#         'subsample': 0.7,
#         'colsample_bytree': 0.7,
#         'seed': seed_val,
#     }
#     # num_rounds = num_rounds

#     params_list = list(param.items())
#     xgb_train = xgb.DMatrix(train_X, label=train_y)

#     if test_y is not None:
#         xgtest = xgb.DMatrix(test_X, label=test_y)
#         watchlist = [ (xgb_train,'train'), (xgtest, 'test') ]
#         model = xgb.train(params_list, xgb_train, num_rounds, watchlist, early_stopping_rounds=20)
#     else:
#         xgtest = xgb.DMatrix(test_X)
#         model = xgb.train(params_list, xgb_train, num_rounds)

#     return model

In [None]:
# for i, j in enumerate(columns_toxic):
#     print('fit for column '+ j)
#     model = trainByXGBoost(tf_idf_train, train_Y[j], tf_idf_valid, valid_Y[j])
#     predictions[:, i] = model.predict(xgb.DMatrix(tf_idf_test), ntree_limit = model.best_ntree_limit)

In [None]:
# submission_samples = pd.read_csv('../data/sample_submission.csv')
# sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
# submission_output = pd.concat([sample_submission_id, pd.DataFrame(predictions, columns = columns_toxic)], axis=1)
# submission_output.to_csv('submission_9.csv', index=False)

In [None]:
# ensemble_predictions = 0.6 * predictions + 0.4 * predictions_cat
# ensemble_rmse = np.sqrt(np.mean(tf_idf_test - ensemble_predictions)**2)
# print(f'ensemble RMSE =  | {len(ensemble_predictions)}')

In [None]:
# submission_samples = pd.read_csv('../data/sample_submission.csv')
# sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
# submission_output = pd.concat([sample_submission_id, pd.DataFrame(ensemble_predictions, columns = columns_toxic)], axis=1)
# submission_output.to_csv('submission_9_ensemble.csv', index=False)

In [1]:
import pandas as pd
import numpy as np

import re
from tqdm import tqdm


from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.utils import resample




2023-05-16 16:25:07.165927: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

train_data_or = pd.read_csv('../data/train.csv')
train_data_or.head()
test_data = pd.read_csv('../data/test.csv')



In [3]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \r\n\r\n The title is fine as i...
2,00013b17ad220c46,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [4]:

train_data_or.drop(['id'],axis=1,inplace=True)
test_data.drop(['id'],axis=1,inplace=True)
x=train_data_or.iloc[:,2:].sum()



In [5]:
train_data_or.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
test_data.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,== From RfC == \r\n\r\n The title is fine as i...
2,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


In [7]:
x.head()

severe_toxic     1595
obscene          8449
threat            478
insult           7877
identity_hate    1405
dtype: int64

In [8]:
#marking comments without any tags as "clean"
rowsums=train_data_or.iloc[:,2:].sum(axis=1)
train_data_or['clean']=(rowsums==0)



In [9]:
rowsums.head()

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [10]:
train_data_or.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,True
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,True
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,True
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,True
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,True


In [11]:
#count number of clean entries
train_data_or['clean'].sum()
print("Total comments = ",len(train_data_or))
print("Total clean comments = ",train_data_or['clean'].sum())
print("Total tags =",x.sum())


Total comments =  159571
Total clean comments =  149012
Total tags = 19804


In [12]:

df_majority = train_data_or[train_data_or.clean==True]
df_minority = train_data_or[train_data_or.clean==False]

In [13]:
df_majority.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
0,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,True
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,True
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,True
3,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,True
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,True


In [14]:
df_minority.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,False
42,You are gay or antisemmitian? \r\n\r\nArchange...,1,0,1,0,1,1,False
43,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0,False
51,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0,False
55,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0,False


In [15]:
# print(df_majority.head(10))
print(train_data_or.iloc[:,2:].head())

   severe_toxic  obscene  threat  insult  identity_hate  clean
0             0        0       0       0              0   True
1             0        0       0       0              0   True
2             0        0       0       0              0   True
3             0        0       0       0              0   True
4             0        0       0       0              0   True


In [16]:
df_majority_downsampled = resample(df_majority,
                                  replace=False,
                                  n_samples=10000,
                                  random_state=123)


In [17]:
print('resampled')
train_data = pd.concat([df_majority_downsampled,df_minority])
print('concatenated')
print(f'shape: {train_data.shape}')



resampled
concatenated
shape: (20559, 8)


In [18]:
def get_comment_type(row):
     for c in train_data.iloc[:,1:]:
        if row[c]==1:
            return c


In [19]:
print('making comment types')
comment_type = train_data.apply(get_comment_type, axis=1)
train_data['comment_type'] = comment_type
print('made comment types')
train_data = train_data.fillna(value=np.nan)
train_data = train_data.fillna(value='safe')
print('filled N/As')

making comment types
made comment types
filled N/As


In [20]:
# train_data = pd.read_csv('keras_train.csv')

In [21]:
train_data.shape

(20559, 9)

In [22]:
train_data.to_csv('keras_train.csv')

In [23]:
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

In [24]:
def clean_text(text):

    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text) # clean url
    text = re.sub(r'#(\w+)', '', text)   # clean hashes
    text = re.sub(r'@(\w+)', '', text)   # clean @
    text = re.sub(r'<[^>]+>', '', text)  # clean tags
    text = re.sub(r'\d+', '', text)      # clean digits
    text = re.sub(r'[,!@\'\"?\.$%_&#*+-:;]', '', text)   # clean punctuation
    text = [APPO[word] if word in APPO else word for word in text.split()] #

    return text



In [25]:
train_data['comment_text'] = train_data['comment_text'].apply(clean_text)
test_data['comment_text'] = test_data['comment_text'].apply(clean_text)

In [26]:
train_data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,comment_type
49150,"[relations, do, you, think, the, federal, repu...",0,0,0,0,0,0,True,clean
65253,"[it, was, you, who, initiated, this, request, ...",0,0,0,0,0,0,True,clean
43028,"[those, links, are, dead, but, this, appears, ...",0,0,0,0,0,0,True,clean
122949,"[you, dont, give, a, reason, for, why, this, s...",0,0,0,0,0,0,True,clean
68472,"[john, of, england, changing, spelling, from, ...",0,0,0,0,0,0,True,clean


In [27]:
test_data.head()

Unnamed: 0,comment_text
0,"[yo, bitch, ja, rule, is, more, succesful, the..."
1,"[==, from, rfc, ==, the, title, is, fine, as, ..."
2,"[==, sources, ==, zawe, ashton, on, lapland, —]"
3,"[if, you, have, a, look, back, at, the, source..."
4,"[i, dont, anonymously, edit, articles, at, all]"


In [29]:
train_x = train_data.iloc[:,0]
train_y = train_data.iloc[:,1:7]

train_y = np.array(train_y)

In [30]:
train_x, val_x, train_y, val_y = train_test_split(train_x,train_y, test_size=0.2, random_state=1)

In [31]:
tokenizer = Tokenizer(num_words = 100000, oov_token='<oov>')
tokenizer.fit_on_texts(train_data.comment_text)

In [32]:
type(tokenizer)

keras.preprocessing.text.Tokenizer

In [33]:
traning_sequences = tokenizer.texts_to_sequences(train_x)
maxlen = max([len(x) for x in np.array(traning_sequences, dtype="object")])
training_padded = pad_sequences(traning_sequences, maxlen = maxlen,
                                padding = 'pre',
                                truncating='pre')

In [36]:
print(maxlen)

1403


In [11]:
validation_sequences = tokenizer.texts_to_sequences(val_x)
validation_padded = pad_sequences(validation_sequences, maxlen = maxlen,
                                padding = 'pre',
                                truncating='pre')

In [12]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

57059

In [13]:
embeddings_index = {}
glovefile = open('glove.twitter.27B.200d.txt','r',encoding='utf-8')
# glovefile = open('glove.6B.200d.txt','r',encoding='utf-8')
for line in tqdm(glovefile):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
#     coefs.shape
    embeddings_index[word] = coefs
glovefile.close()

print('Found %s word vectors.' % len(embeddings_index))

1193514it [00:24, 47851.48it/s]

Found 1193514 word vectors.





In [14]:
embedding_matrix = np.zeros((len(tokenizer.word_index)+1, 200))
for words, index in tqdm(tokenizer.word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

100%|████████████████████████████████████████████████████████████████████████| 57058/57058 [00:00<00:00, 1391330.57it/s]


In [15]:
model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = vocab_size,output_dim = 200,weights = [embedding_matrix],input_length = maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])

2023-05-15 21:19:47.582841: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-15 21:19:47.593699: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-15 21:19:47.593748: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-15 21:19:47.596336: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-15 21:19:47.596383: I tensorflow/compile

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1403, 200)         11411800  
                                                                 
 bidirectional (Bidirectiona  (None, 1403, 256)        336896    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 1401, 128)         98432     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dropout (Dropout)           (None, 32)                0

In [37]:
print(training_padded)

[[    0     0     0 ...    72  1319    75]
 [    0     0     0 ... 13819 50363  4430]
 [    0     0     0 ...  3074     8   459]
 ...
 [    0     0     0 ... 35455 35456  1227]
 [    0     0     0 ...    17     5   489]
 [    0     0     0 ...     2   558    17]]


In [38]:
train_y.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [17]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.fit(training_padded, train_y, epochs = 2, validation_data=(validation_padded, val_y), batch_size = 32)

Epoch 1/2


2023-05-15 21:19:52.783125: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 21:19:52.784298: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:19:52.785000: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-15 21:21:10.216107: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 21:21:10.217195: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:21:10.217970: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/2


<keras.callbacks.History at 0x7efd493aba00>

In [18]:
model.save('keras.model')

2023-05-15 21:22:16.446082: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 21:22:16.446995: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:22:16.447635: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-15 21:22:16.950507: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,200]
	 [[{{node inputs}}]]
2023-05-15 21:22:16.955473: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,200]
	 [[{{node inputs}}]]
2023-05-15 21:22:17.040672: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node 

2023-05-15 21:22:17.480575: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 21:22:17.481543: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:22:17.482236: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-15 21:22:18.242304: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:22:18.243318: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_1_grad/concat/split_1/split_dim' with dtype int32
	 [[{{node gradients/split_1_grad/concat/split_1/split_dim}}]]
2023-05-15 21:22:18.311379: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-15 21:22:18.954942: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 21:22:18.956138: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:22:18.956922: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

2023-05-15 21:22:19.886169: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_0' with dtype float and shape [?,?,200]
	 [[{{node inputs_0}}]]
2023-05-15 21:22:19.890964: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_0' with dtype float and shape [?,?,200]
	 [[{{node inputs_0}}]]
2023-05-15 21:22:19.894934: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,?,?]
	 [[{{node inputs}}]]
2023

INFO:tensorflow:Assets written to: keras.model/assets


INFO:tensorflow:Assets written to: keras.model/assets


In [19]:
# model = tf.keras.models.load_model #load saved model to skip re-training

In [20]:
predicted = model.predict(validation_padded)
labels = (predicted > 0.5).astype(int)

2023-05-15 21:22:21.152818: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 21:22:21.154055: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 21:22:21.154753: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [21]:
testing_sequences = tokenizer.texts_to_sequences(test_data.comment_text)
test_padded = pad_sequences(testing_sequences, maxlen = maxlen,
                                padding = 'pre',
                                truncating='pre')

In [22]:
predicted = model.predict(test_padded, batch_size = 200)
# predict = np.hstack((test_data.id[:, np.newaxis], predicted))



In [23]:
test_data = pd.read_csv('../data/test.csv')
test_data.head()
# test_data_arr = np.array(test_data)
# print(test_data_arr.shape)
# print(predicted.shape)

# predict = np.hstack((test_data_arr.id[:, np.newaxis], predicted))

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \r\n\r\n The title is fine as i...
2,00013b17ad220c46,""" \r\n\r\n == Sources == \r\n\r\n * Zawe Ashto..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [24]:
predict = np.hstack((np.array(test_data), predicted))

In [25]:
# predict.shape
print(predict.shape)
print(predict[0])
pred_processed = np.delete(predict, 1, 1)
print(pred_processed[0])

(153164, 8)
['00001cee341fdb12'
 "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
 0.9967250227928162 0.19510751962661743 0.9882634878158569
 0.021599698811769485 0.794369637966156 0.07796985656023026]
['00001cee341fdb12' 0.9967250227928162 0.19510751962661743
 0.9882634878158569 0.021599698811769485 0.794369637966156
 0.07796985656023026]


In [26]:
pd.DataFrame(pred_processed).to_csv('keras_predictions.csv')

In [27]:
subm = pd.DataFrame(pred_processed, columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
subm.to_csv('subm.csv', index = False)

In [1]:
xgboostdata = pd.read_csv('submission_9.csv')
catboostdata = pd.read_csv('submission_9_cat.csv')
kerasdata = pd.read_csv('subm.csv')
xgboostdata.drop(['id'],axis=1,inplace=True)
catboostdata.drop(['id'],axis=1,inplace=True)
kerasdata.drop(['id'],axis=1,inplace=True)

orchestra_predictions = 0.5 * xgboostdata + 0.3 * catboostdata + 0.2 * kerasdata

NameError: name 'pd' is not defined

In [29]:
columns_toxic = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [30]:
submission_samples = pd.read_csv('../data/sample_submission.csv')
sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
submission_output = pd.concat([sample_submission_id, pd.DataFrame(orchestra_predictions, columns = columns_toxic)], axis=1)

submission_output.to_csv('submission_9_orchestra.csv', index=False)