In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

import os
# Print the full path of every file found anywhere below the Kaggle input
# directory, one path per line.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import zipfile
# Unpack both competition archives. extractall() is called without a target,
# so the members land in the current working directory. Each archive is opened
# in a `with` block so its file handle is closed as soon as extraction is done
# (the handles were previously left open for the life of the kernel).
for archive_path in (
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
):
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall()

In [None]:
# Load the two extracted CSV files into DataFrames.
# train_set supplies the text and label columns used for fitting;
# test_set is only tokenised and padded further down.
train_set = pd.read_csv('/kaggle/working/train.csv')
test_set = pd.read_csv('/kaggle/working/test.csv')

In [None]:
# Lookup table: lowercase English contraction -> expanded form.
# Despite its name this is not a stop-word list; clean_text() uses it to
# replace whole tokens that match a key.
#
# Each key appears exactly once (a dict literal silently keeps only the last
# value of a repeated key). "i'd" expands to "I would", matching the other
# "'d" entries, and "we'll" expands to "we will" (it previously dropped the
# pronoun).
stop_words = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'": "trying",
}

In [None]:
import re

In [None]:
def clean_text(text, contractions=None):
    """Normalise one raw comment into a list of lowercase word tokens.

    Processing order:
      1. convert to ``str`` and lowercase;
      2. delete URLs, ``#hashtags``, ``@mentions``, ``<tags>`` and digit runs;
      3. delete punctuation *except* apostrophes;
      4. replace whitespace-separated tokens that are keys of the contraction
         table with their expansion;
      5. delete the remaining apostrophes and split on whitespace.

    Apostrophes must survive until step 4: the table's keys contain them, so
    stripping them together with the rest of the punctuation (as was done
    before) meant no key could ever match.

    Args:
        text: Value to clean; anything is accepted because it is passed
            through ``str()`` first.
        contractions: Optional mapping ``contraction -> expansion``. When
            omitted, the module-level ``stop_words`` table is used.

    Returns:
        list[str]: the cleaned tokens, in order. Empty list if nothing is left.
    """
    if contractions is None:
        contractions = stop_words

    text = str(text).lower()

    text = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', text)  # URLs
    text = re.sub(r'#(\w+)', '', text)   # hashtags
    text = re.sub(r'@(\w+)', '', text)   # @mentions
    text = re.sub(r'<[^>]+>', '', text)  # markup tags
    text = re.sub(r'\d+', '', text)      # digits

    # Punctuation, apostrophe excluded for now. The hyphen and slash are listed
    # explicitly: the previous class contained the range `+-:`, which removed
    # '+', ',', '-', '.', '/' and ':' only as a side effect of that range.
    text = re.sub(r'[,!@"?.$%_&#*+\-/:;]', '', text)

    # Expand contractions token by token. Expansions are lowercased afterwards
    # because some table values contain a capital "I".
    expanded = ' '.join(contractions.get(word, word) for word in text.split())

    # Drop leftover apostrophes and split again so that a multi-word expansion
    # ("do not") yields separate tokens.
    return expanded.lower().replace("'", '').split()

In [None]:
# Replace the raw comment text of both frames with the output of clean_text().
# The column is overwritten in place, so running this cell a second time would
# feed already-cleaned values back through clean_text().
for frame in (train_set, test_set):
    frame['comment_text'] = frame['comment_text'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled rows as a validation frame; random_state pins
# the shuffle. train_set is rebound to the remaining 80%, so re-running this
# cell would split the already-reduced frame again.
train_set, val_set = train_test_split(
    train_set,
    test_size=0.2,
    random_state=11,
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Build the word index from the training split only (after the train/val
# split), so validation and test text cannot influence the vocabulary.
# Words outside the index are mapped to the '<oov>' token.
tokenizer = Tokenizer(oov_token='<oov>', num_words=100000)
tokenizer.fit_on_texts(train_set['comment_text'])

In [None]:
# Integer-encode every training comment with the fitted tokenizer.
# (The variable name keeps its original spelling because later cells use it.)
traning_sequences = tokenizer.texts_to_sequences(train_set['comment_text'])

In [None]:
# Length of the longest encoded training comment; used as the padding width
# for every split. The sequences are iterated directly: wrapping the ragged
# list of lists in np.array() is unnecessary and raises ValueError on
# NumPy >= 1.24.
maxlen = max(len(sequence) for sequence in traning_sequences)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Left-pad (and, if ever needed, left-truncate) each training sequence to
# maxlen so the batch is rectangular.
training_padded = pad_sequences(
    traning_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

# Label matrix: every column from 'toxic' through the last column of the frame.
train_y = np.array(train_set.loc[:, 'toxic':])

In [None]:
# Encode the validation split with the tokenizer fitted on the training split.
# The padding width is the computed `maxlen` rather than a hard-coded number,
# so validation inputs always have the same width as the training inputs. The
# sequences get their own name instead of overwriting the training ones.
val_sequences = tokenizer.texts_to_sequences(val_set.comment_text)
val_padded = pad_sequences(val_sequences, maxlen=maxlen,
                           padding='pre',
                           truncating='pre')

# Label matrix: every column from 'toxic' through the last column of the frame.
val_y = np.array(val_set.loc[:, 'toxic':])

In [None]:
# Re-encode and re-pad the training split, rebinding traning_sequences,
# training_padded and train_y from train_set.
traning_sequences = tokenizer.texts_to_sequences(train_set['comment_text'])
training_padded = pad_sequences(
    traning_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)
train_y = np.array(train_set.loc[:, 'toxic':])

In [None]:
# Encode the competition test comments with the training vocabulary and pad
# them to the same width as the training inputs.
testing_sequences = tokenizer.texts_to_sequences(test_set['comment_text'])
test_padded = pad_sequences(
    testing_sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)




In [None]:
# Network: embedding -> bidirectional LSTM (full sequence output) -> 1-D
# convolution -> global max pooling -> small dense head with dropout ->
# six sigmoid outputs, one independent probability per label column.
# NOTE(review): the embedding table has 150000 rows while the tokenizer is
# created with num_words=100000 -- confirm whether the larger size is intended.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(150000, 300))
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True)
    )
)
model.add(
    tf.keras.layers.Conv1D(
        filters=128,
        kernel_size=3,
        padding='valid',
        kernel_initializer='glorot_uniform',
    )
)
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(6, activation='sigmoid'))

# Multi-label objective: per-output binary cross-entropy, tracked with AUC.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['AUC'],
)


In [None]:
from tensorflow.keras.utils import plot_model

# Write a diagram of the network to 'model.png' (relative path).
plot_model(model, to_file='model.png')
# Second diagram, this time annotated with layer shapes. This call is the last
# expression in the cell, so its return value is what the notebook displays.
plot_model(model, to_file='model_shapes.png', show_shapes=True)


In [None]:
# Train for two epochs. The held-out split prepared earlier (val_padded /
# val_y) is passed as validation_data so loss and AUC on unseen comments are
# reported after each epoch; it was previously built but never used.
model.fit(
    x=training_padded,
    y=train_y,
    validation_data=(val_padded, val_y),
    epochs=2,
    batch_size=200,
)

In [None]:
# Score the "old-to-new" YouTube comment files with the trained model and
# write one result CSV per input file.
# musicVideo_GangnamStyle.csv is excluded from this run in both lists below.
file_names = [
    '../input/youtube-comment-old-to-new/gameTrailer_CoD.csv',
    '../input/youtube-comment-old-to-new/gameTrailer_FarCry6.csv',
    '../input/youtube-comment-old-to-new/gameTrailer_Pokemon.csv',
    '../input/youtube-comment-old-to-new/musicVideo_Baby.csv',
    '../input/youtube-comment-old-to-new/musicVideo_Friday.csv',
    '../input/youtube-comment-old-to-new/apology_LoganPaul.csv',
    '../input/youtube-comment-old-to-new/apology_Pokimane.csv',
    '../input/youtube-comment-old-to-new/apology_PewDiePie.csv',
]

# Output-name stems, position-aligned with file_names.
file_result_name = [
    'gameTrailer_CoD', 'FarCry6', 'gameTrailer_Pokemon',
    'musicVideo_Baby', 'musicVideo_Friday',
    'apology_LoganPaul', 'apology_Pokimane', 'apology_Pewdipie',
]


def _read_comment_column(path):
    """Read only the first column of a CSV, skipping malformed rows.

    `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of `on_bad_lines='skip'`. The new spelling is tried first; a
    TypeError (unknown keyword on older pandas) triggers the legacy spelling.
    """
    read_options = dict(usecols=[0], engine="python", sep=',', quotechar='"')
    try:
        return pd.read_csv(path, on_bad_lines='skip', **read_options)
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False, **read_options)


for file, result_name in zip(file_names, file_result_name):
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Read the file once: the untouched frame is kept for the output, and the
    # cleaned tokens are derived from it instead of parsing the CSV again.
    dataset_save = _read_comment_column(file)
    cleaned_comments = dataset_save.iloc[:, 0].apply(clean_text)

    # Same encoding and padding as the training data.
    dataset_sequences = tokenizer.texts_to_sequences(cleaned_comments)
    dataset_padded = pad_sequences(dataset_sequences, maxlen=maxlen,
                                   padding='pre',
                                   truncating='pre')
    predicted = model.predict(dataset_padded, batch_size=200)
    predicted_result = pd.DataFrame(predicted, columns=label_columns)

    print("----------dataset_Save------------")

    # Index-aligned join: row i of the raw comments with prediction row i.
    joined = dataset_save.join(predicted_result)
    print("----------joined------------")
    print(joined.head(5))

    joined.columns = ['comments'] + label_columns
    joined.to_csv('result_time_{}.csv'.format(result_name), index=False)
    

In [None]:
# Score the YouTube comment files with the trained model and write one result
# CSV per input file.
file_names = [
    '../input/youtube-comments-datas/gameTrailer_CoD.csv',
    '../input/youtube-comments-datas/gameTrailer_FarCry6.csv',
    '../input/youtube-comments-datas/gameTrailer_Pokemon.csv',
    '../input/youtube-comments-datas/musicVideo_Baby.csv',
    '../input/youtube-comments-datas/musicVideo_Friday.csv',
    '../input/youtube-comments-datas/musicVideo_GangnamStyle.csv',
    '../input/youtube-comments-datas/apology_my_overdue_apology.csv',
    '../input/youtube-comments-datas/apology_my_response.csv',
    '../input/youtube-comments-datas/apology_so_sorry.csv',
]

# Output-name stems, position-aligned with file_names.
file_result_name = [
    'gameTrailer_CoD', 'FarCry6', 'gameTrailer_Pokemon',
    'musicVideo_Baby', 'musicVideo_Friday', 'musicVideo_Gangnam',
    'apology_my_overdue_apology', 'apology_my_response', 'apology_so_sorry',
]

for file, result_name in zip(file_names, file_result_name):
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Read the file once: the untouched frame is kept for the output, and the
    # cleaned tokens are derived from it instead of parsing the CSV again.
    dataset_save = pd.read_csv(file, usecols=[0])
    cleaned_comments = dataset_save.iloc[:, 0].apply(clean_text)

    # Same encoding and padding as the training data.
    dataset_sequences = tokenizer.texts_to_sequences(cleaned_comments)
    dataset_padded = pad_sequences(dataset_sequences, maxlen=maxlen,
                                   padding='pre',
                                   truncating='pre')
    predicted = model.predict(dataset_padded, batch_size=200)
    predicted_result = pd.DataFrame(predicted, columns=label_columns)

    print("----------dataset_Save------------")

    # Index-aligned join: row i of the raw comments with prediction row i.
    joined = dataset_save.join(predicted_result)
    print("----------joined------------")
    print(joined.head(5))

    joined.columns = ['comments'] + label_columns
    joined.to_csv('result_{}.csv'.format(result_name), index=False)