In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import zipfile
# Unpack the train and test CSV archives into the current working directory.
# Each archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes, even if extraction raises.
for archive_path in (
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
):
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall()

In [None]:
# Load the extracted test CSV and preview its first rows.
test_df = pd.read_csv('/kaggle/working/test.csv')
test_df.head()

In [None]:
# Load the extracted training CSV and preview its first rows.
train_df = pd.read_csv('/kaggle/working/train.csv')
train_df.head()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Show the raw comment text of the row with index label 3.
train_df.loc[3,'comment_text']

In [None]:
# Column-wise mean of every column from 'toxic' through the last column.
# presumably these are 0/1 label columns, so each mean is that label's positive rate — TODO confirm
train_df.loc[:,'toxic':].mean(axis = 0)

In [None]:
# Per-column non-null counts over the rows in which all six label columns equal 0.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
all_labels_zero = (train_df[label_columns] == 0).all(axis=1)
train_df[all_labels_zero].count()

In [None]:
# Lookup table mapping lower-case English contractions to their expanded forms.
# Every key appears exactly once: a repeated key in a dict literal silently keeps
# only the last value, which hides typos and conflicting expansions.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "'d" is ambiguous (would / had); "would" matches the other "'d" entries.
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Bare suffix; the leading space in the value is intentional.
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'": "trying",
}

In [None]:
import re

In [None]:
def clean_text(text, contractions=None):
    """Normalise a raw comment into a list of lower-case word tokens.

    Steps, in order: lower-case the text; drop URLs, #hashtags, @mentions,
    HTML-style tags and digits; strip punctuation other than apostrophes;
    expand contractions; finally remove any remaining apostrophes.

    Contractions are expanded *before* apostrophes are removed. Every key of
    the ``APPO`` table contains an apostrophe, so removing apostrophes first
    would leave the lookup with nothing it could match.

    Args:
        text: The raw comment string.
        contractions: Optional mapping of contraction -> expansion. When
            omitted, the notebook-level ``APPO`` table is used.

    Returns:
        list[str]: The cleaned tokens. A multi-word expansion such as
        "are not" contributes one token per word.
    """
    if contractions is None:
        contractions = APPO

    text = text.lower()
    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text)  # clean url
    text = re.sub(r'#(\w+)', '', text)   # clean hashes
    text = re.sub(r'@(\w+)', '', text)   # clean @
    text = re.sub(r'<[^>]+>', '', text)  # clean tags
    text = re.sub(r'\d+', '', text)      # clean digits
    # Punctuation is listed character by character. Inside a character class an
    # unescaped '-' between two characters forms a range, so '+-:' would span
    # '+' through ':' (covering ',', '-', '.', '/' and the digits); here '-' is
    # escaped and '/' is named explicitly. The apostrophe is deliberately absent
    # so contractions survive until the lookup below.
    text = re.sub(r'[,!@"?.$%_&#*+\-/:;]', '', text)

    tokens = []
    for word in text.split():
        if word in contractions:
            # Split the expansion so each word becomes its own token.
            tokens.extend(contractions[word].lower().split())
            continue
        word = word.replace("'", '')
        if word:  # a token made only of apostrophes disappears entirely
            tokens.append(word)

    return tokens

In [None]:
# Replace the raw comment text in both frames with its cleaned token list.
for frame in (train_df, test_df):
    frame['comment_text'] = frame['comment_text'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_set, val_set = train_test_split(train_df, test_size = 0.2, random_state=11)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Build the word index from the cleaned comments of the whole training frame.
# Words outside the num_words cap are mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>', num_words=100000)
tokenizer.fit_on_texts(train_df['comment_text'])

In [None]:
# Encode each training-split comment as a sequence of integer word indices.
traning_sequences = tokenizer.texts_to_sequences(train_set.comment_text)

In [None]:
# Length of the longest training sequence, used as the common padded width.
# The lengths are read straight from the Python lists: wrapping a ragged list of
# sequences in np.array raises ValueError on NumPy >= 1.24.
maxlen = max(len(sequence) for sequence in traning_sequences)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Left-pad / left-truncate every training sequence to the common width `maxlen`.
training_padded = pad_sequences(
    traning_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)
# Target matrix: every column from 'toxic' through the last column of the split.
train_labels = train_set.loc[:, 'toxic':]
train_y = np.array(train_labels)

In [None]:
# Encode and pad the validation comments the same way as the training comments.
# The padded width comes from `maxlen`, so the training and validation matrices
# always share one width, and the sequences get their own name so the training
# sequences are not overwritten.
val_sequences = tokenizer.texts_to_sequences(val_set.comment_text)
val_padded = pad_sequences(val_sequences, maxlen = maxlen,
                           padding = 'pre',
                           truncating='pre')
# Target matrix: every column from 'toxic' through the last column of the split.
val_y = np.array(val_set.loc[:,'toxic':])

In [None]:
# Disabled experiment: the code below is a bare string literal, so none of it runs.
# Variant: Embedding(150000, 128) -> BiLSTM(16, sequences) -> BiLSTM(32) -> Dense 64 -> Dropout 0.3
# -> Dense 32 -> Dense 6 (sigmoid); fit with validation data, 2 epochs, batch size 100.
'''model = tf.keras.Sequential([tf.keras.layers.Embedding(150000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(6, activation="sigmoid")])
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['AUC'])
model.fit(x = training_padded, y = train_y, validation_data = (val_padded, val_y), epochs = 2, batch_size = 100)'''

In [None]:
# Disabled experiment: the code below is a bare string literal, so none of it runs.
# Variant: Embedding(150000, 128) -> BiLSTM(64, sequences) -> BiLSTM(128) -> Dense 32 -> Dropout 0.3
# -> Dense 6 (sigmoid); fit with validation data, 5 epochs, batch size 200.
'''model = tf.keras.Sequential([tf.keras.layers.Embedding(150000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),    
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(6, activation='sigmoid')])
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['AUC'])
model.fit(x = training_padded, y = train_y, validation_data = (val_padded, val_y), epochs = 5, batch_size = 200)'''

In [None]:
# Disabled experiment: the code below is a bare string literal, so none of it runs.
# Variant: Embedding(100000, 128) -> BiLSTM(128, sequences) -> BiLSTM(64) -> Dense 32 -> Dropout 0.2
# -> Dense 6 (sigmoid); fit with validation data, 5 epochs, batch size 200.
'''model = tf.keras.Sequential([tf.keras.layers.Embedding(100000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),    
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['AUC'])
model.fit(x = training_padded, y = train_y, validation_data = (val_padded, val_y), epochs = 5, batch_size = 200)'''

In [None]:
# Disabled experiment: the code below is a bare string literal, so none of it runs.
# Variant: Embedding(10000, 128) -> BiLSTM(128, sequences, dropout / recurrent_dropout 0.15)
# -> Dense 32 -> Dropout 0.2 -> Dense 6 (sigmoid); fit with validation data, 5 epochs, batch size 200.
# NOTE(review): the LSTM keeps the time axis (return_sequences=True) and nothing pools or
# flattens it before the Dense layers, so the output is presumably per-timestep rather than
# one row of six scores per comment — confirm before re-enabling.
# NOTE(review): Embedding input_dim is 10000 here while the tokenizer in this notebook is
# created with num_words = 100000 — confirm the indices fit the table.
'''model = tf.keras.Sequential([tf.keras.layers.Embedding(10000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True, dropout=0.15, recurrent_dropout=0.15)),    
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['AUC'])
model.fit(x = training_padded, y = train_y, validation_data = (val_padded, val_y), epochs = 5, batch_size = 200)'''

In [None]:
# Disabled experiment: the code below is a bare string literal, so none of it runs.
# Variant: Embedding(150000, 128) -> BiLSTM(128, sequences) -> Conv1D(64 filters, kernel 3)
# -> Dense 32 -> Dropout 0.2 -> Dense 6 (sigmoid); fit with validation data, 5 epochs, batch size 200.
# NOTE(review): nothing pools or flattens the time axis after the Conv1D, so the output is
# presumably per-timestep rather than one row of six scores per comment — confirm before re-enabling.
'''model = tf.keras.Sequential([tf.keras.layers.Embedding(150000, 128),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])
                             
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['AUC'])
model.fit(x = training_padded, y = train_y, validation_data = (val_padded, val_y), epochs = 5, batch_size = 200)'''

In [None]:
# Disabled experiment: the code below is a bare string literal, so none of it runs.
# Variant: Embedding(150000, 300) -> BiLSTM(128, sequences) -> Conv1D(128 filters, kernel 3)
# -> GlobalMaxPooling1D -> Dense 32 -> Dropout 0.2 -> Dense 6 (sigmoid);
# fit with validation data, 5 epochs, batch size 50.
'''model = tf.keras.Sequential([tf.keras.layers.Embedding(150000, 300),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid')])
                             
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['AUC'])
model.fit(x = training_padded, y = train_y, validation_data = (val_padded, val_y), epochs = 5, batch_size = 50)'''

In [None]:
# Re-encode the complete training frame (not just the training split) for the final fit.
traning_sequences = tokenizer.texts_to_sequences(train_df['comment_text'])
training_padded = pad_sequences(
    traning_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)
# Target matrix: every column from 'toxic' through the last column of the frame.
full_labels = train_df.loc[:, 'toxic':]
train_y = np.array(full_labels)

In [None]:
# Final model: embedding -> bidirectional LSTM -> Conv1D -> global max pooling
# -> dense head ending in six sigmoid outputs, one per label column.
#
# The embedding table is sized from the tokenizer's vocabulary cap:
# texts_to_sequences never emits an index >= num_words, so any extra rows in the
# table could never be looked up and would only add untrainable-in-practice weights.
vocab_size = tokenizer.num_words

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 300),
    # return_sequences=True keeps the time axis for the convolution that follows.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid',
                           kernel_initializer='glorot_uniform'),
    # Collapse the time axis so the dense head sees one vector per comment.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])
model.fit(x=training_padded, y=train_y, epochs=2, batch_size=200)

In [None]:
# Encode the test comments and pad them to the same width used for training.
testing_sequences = tokenizer.texts_to_sequences(test_df['comment_text'])
test_padded = pad_sequences(
    testing_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [None]:
# Score the test set, then prepend the comment ids so each row reads
# (id, six label scores).
predicted = model.predict(test_padded, batch_size = 200)
# Convert the id column to a NumPy array before adding the axis: current pandas
# rejects multi-dimensional indexing such as series[:, np.newaxis] on a Series.
test_ids = test_df['id'].to_numpy()[:, np.newaxis]
predict = np.hstack((test_ids, predicted))

In [None]:
# Assemble the submission table (id plus six label scores) and write it without the index.
submission_columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
subm = pd.DataFrame(predict, columns=submission_columns)
subm.to_csv('subm.csv', index=False)