In [None]:
### importing các thư viện
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
import re ## Regular expresssions
from nltk import word_tokenize
from sklearn import metrics
from gensim.models import KeyedVectors
import operator
import gc

In [None]:
### import keras và các module để tạo model
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D,GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model,load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import keras_tuner

In [None]:
### Load the test and train CSV files into DataFrames
test_data=pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
train_data=pd.read_csv("../input/quora-insincere-questions-classification/train.csv")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Class balance of the training labels: rows with target == 1 are reported as
# "toxic", rows with target == 0 as "non-toxic".
total = len(train_data)
toxic = int(train_data['target'].eq(1).sum())
non_toxic = int(train_data['target'].eq(0).sum())

# Share of each class, in percent of all training rows.
toxic_percentage = (toxic/total)*100
non_toxic_percentage = (non_toxic/total)*100

txt1 = "Number of toxic questions: {number}, constitute {percentage:.2f}% of total"
txt2 = "Number of non-toxic questions: {number}, constitute {percentage:.2f}% of total"

print(txt1.format(number=toxic, percentage=toxic_percentage))
print(txt2.format(number=non_toxic, percentage=non_toxic_percentage))

# One-column frame so that DataFrame.plot.pie draws a single pie of the two shares.
pie_chart = pd.DataFrame(
    {"percentage": [toxic_percentage, non_toxic_percentage]},
    index=['Toxic', 'Non-toxic'],
)

pie_chart.plot.pie(
    subplots=True,
    figsize=(7.5, 7.5),
    title='Target Distribution',
    autopct='%0.2f%%',
)

In [None]:
### Split the labelled data into separate train and validation sets
### (20% held out for validation, stratified on the target column, fixed seed)
train,val=train_test_split(
    train_data,
    test_size=0.2,
    stratify=train_data["target"],
    random_state=123,
)
print("Shape of the Training set :",train.shape)
print("Shape of the Validation set :",val.shape)

In [None]:
### giải nén pretrainmodel 
!unzip ../input/quora-insincere-questions-classification/embeddings.zip

 # Tiền xử lý dữ liệu và Encoding

In [None]:
# Lookup table from contracted / informal English tokens (plus abbreviated month
# names such as 'jan.') to their expanded form. Keys are matched exactly by
# contraction_fix, so they are case-sensitive. Apostrophe keys are listed with
# both the straight ' and the typographic ’ character.
contractions={"I'm": 'I am',
 "I'm'a": 'I am about to',
 "I'm'o": 'I am going to',
 "I've": 'I have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'd": 'I would',
 "I'd've": 'I would have',
 'Whatcha': 'What are you',
 "amn't": 'am not',
 "ain't": 'are not',
 "aren't": 'are not',
 "'cause": 'because',
 "can't": 'can not',
 "can't've": 'can not have',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "daren't": 'dare not',
 "daresn't": 'dare not',
 "dasn't": 'dare not',
 "didn't": 'did not',
 'didn’t': 'did not',
 "don't": 'do not',
 'don’t': 'do not',
 "doesn't": 'does not',
 "e'er": 'ever',
 "everyone's": 'everyone is',
 'finna': 'fixing to',
 'gimme': 'give me',
 "gon't": 'go not',
 'gonna': 'going to',
 'gotta': 'got to',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he've": 'he have',
 "he's": 'he is',
 "he'll": 'he will',
 "he'll've": 'he will have',
 "he'd": 'he would',
 "he'd've": 'he would have',
 "here's": 'here is',
 "how're": 'how are',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how's": 'how is',
 "how'll": 'how will',
 "isn't": 'is not',
 "it's": 'it is',
 "'tis": 'it is',
 "'twas": 'it was',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it'd": 'it would',
 "it'd've": 'it would have',
 'kinda': 'kind of',
 "let's": 'let us',
 'luv': 'love',
 "ma'am": 'madam',
 "may've": 'may have',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "ne'er": 'never',
 "o'": 'of',
 "o'clock": 'of the clock',
 "ol'": 'old',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "o'er": 'over',
 "shan't": 'shall not',
 "sha'n't": 'shall not',
 "shalln't": 'shall not',
 "shan't've": 'shall not have',
 "she's": 'she is',
 "she'll": 'she will',
 "she'd": 'she would',
 "she'd've": 'she would have',
 "should've": 'should have',
 "shouldn't": 'should not',
 "shouldn't've": 'should not have',
 "so've": 'so have',
 "so's": 'so is',
 "somebody's": 'somebody is',
 "someone's": 'someone is',
 "something's": 'something is',
 'sux': 'sucks',
 "that're": 'that are',
 "that's": 'that is',
 "that'll": 'that will',
 "that'd": 'that would',
 "that'd've": 'that would have',
 'em': 'them',
 "there're": 'there are',
 "there's": 'there is',
 "there'll": 'there will',
 "there'd": 'there would',
 "there'd've": 'there would have',
 "these're": 'these are',
 "they're": 'they are',
 "they've": 'they have',
 "they'll": 'they will',
 "they'll've": 'they will have',
 "they'd": 'they would',
 "they'd've": 'they would have',
 "this's": 'this is',
 "those're": 'those are',
 "to've": 'to have',
 'wanna': 'want to',
 "wasn't": 'was not',
 "we're": 'we are',
 "we've": 'we have',
 "we'll": 'we will',
 "we'll've": 'we will have',
 "we'd": 'we would',
 "we'd've": 'we would have',
 "weren't": 'were not',
 "what're": 'what are',
 "what'd": 'what did',
 "what've": 'what have',
 "what's": 'what is',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "when've": 'when have',
 "when's": 'when is',
 "where're": 'where are',
 "where'd": 'where did',
 "where've": 'where have',
 "where's": 'where is',
 "which's": 'which is',
 "who're": 'who are',
 "who've": 'who have',
 "who's": 'who is',
 "who'll": 'who will',
 "who'll've": 'who will have',
 "who'd": 'who would',
 "who'd've": 'who would have',
 "why're": 'why are',
 "why'd": 'why did',
 "why've": 'why have',
 "why's": 'why is',
 "will've": 'will have',
 "won't": 'will not',
 "won't've": 'will not have',
 "would've": 'would have',
 "wouldn't": 'would not',
 "wouldn't've": 'would not have',
 "y'all": 'you all',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "you're": 'you are',
 "you've": 'you have',
 "you'll've": 'you shall have',
 "you'll": 'you will',
 "you'd": 'you would',
 "you'd've": 'you would have',
 'jan.': 'january',
 'feb.': 'february',
 'mar.': 'march',
 'apr.': 'april',
 'jun.': 'june',
 'jul.': 'july',
 'aug.': 'august',
 'sep.': 'september',
 'oct.': 'october',
 'nov.': 'november',
 'dec.': 'december',
 'I’m': 'I am',
 'I’m’a': 'I am about to',
 'I’m’o': 'I am going to',
 'I’ve': 'I have',
 'I’ll': 'I will',
 'I’ll’ve': 'I will have',
 'I’d': 'I would',
 'I’d’ve': 'I would have',
 'amn’t': 'am not',
 'ain’t': 'are not',
 'aren’t': 'are not',
 '’cause': 'because',
 'can’t': 'can not',
 'can’t’ve': 'can not have',
 'could’ve': 'could have',
 'couldn’t': 'could not',
 'couldn’t’ve': 'could not have',
 'daren’t': 'dare not',
 'daresn’t': 'dare not',
 'dasn’t': 'dare not',
 'doesn’t': 'does not',
 'e’er': 'ever',
 'everyone’s': 'everyone is',
 'gon’t': 'go not',
 'hadn’t': 'had not',
 'hadn’t’ve': 'had not have',
 'hasn’t': 'has not',
 'haven’t': 'have not',
 'he’ve': 'he have',
 'he’s': 'he is',
 'he’ll': 'he will',
 'he’ll’ve': 'he will have',
 'he’d': 'he would',
 'he’d’ve': 'he would have',
 'here’s': 'here is',
 'how’re': 'how are',
 'how’d': 'how did',
 'how’d’y': 'how do you',
 'how’s': 'how is',
 'how’ll': 'how will',
 'isn’t': 'is not',
 'it’s': 'it is',
 '’tis': 'it is',
 '’twas': 'it was',
 'it’ll': 'it will',
 'it’ll’ve': 'it will have',
 'it’d': 'it would',
 'it’d’ve': 'it would have',
 'let’s': 'let us',
 'ma’am': 'madam',
 'may’ve': 'may have',
 'mayn’t': 'may not',
 'might’ve': 'might have',
 'mightn’t': 'might not',
 'mightn’t’ve': 'might not have',
 'must’ve': 'must have',
 'mustn’t': 'must not',
 'mustn’t’ve': 'must not have',
 'needn’t': 'need not',
 'needn’t’ve': 'need not have',
 'ne’er': 'never',
 'o’': 'of',
 'o’clock': 'of the clock',
 'ol’': 'old',
 'oughtn’t': 'ought not',
 'oughtn’t’ve': 'ought not have',
 'o’er': 'over',
 'shan’t': 'shall not',
 'sha’n’t': 'shall not',
 'shalln’t': 'shall not',
 'shan’t’ve': 'shall not have',
 'she’s': 'she is',
 'she’ll': 'she will',
 'she’d': 'she would',
 'she’d’ve': 'she would have',
 'should’ve': 'should have',
 'shouldn’t': 'should not',
 'shouldn’t’ve': 'should not have',
 'so’ve': 'so have',
 'so’s': 'so is',
 'somebody’s': 'somebody is',
 'someone’s': 'someone is',
 'something’s': 'something is',
 'that’re': 'that are',
 'that’s': 'that is',
 'that’ll': 'that will',
 'that’d': 'that would',
 'that’d’ve': 'that would have',
 'there’re': 'there are',
 'there’s': 'there is',
 'there’ll': 'there will',
 'there’d': 'there would',
 'there’d’ve': 'there would have',
 'these’re': 'these are',
 'they’re': 'they are',
 'they’ve': 'they have',
 'they’ll': 'they will',
 'they’ll’ve': 'they will have',
 'they’d': 'they would',
 'they’d’ve': 'they would have',
 'this’s': 'this is',
 'those’re': 'those are',
 'to’ve': 'to have',
 'wasn’t': 'was not',
 'we’re': 'we are',
 'we’ve': 'we have',
 'we’ll': 'we will',
 'we’ll’ve': 'we will have',
 'we’d': 'we would',
 'we’d’ve': 'we would have',
 'weren’t': 'were not',
 'what’re': 'what are',
 'what’d': 'what did',
 'what’ve': 'what have',
 'what’s': 'what is',
 'what’ll': 'what will',
 'what’ll’ve': 'what will have',
 'when’ve': 'when have',
 'when’s': 'when is',
 'where’re': 'where are',
 'where’d': 'where did',
 'where’ve': 'where have',
 'where’s': 'where is',
 'which’s': 'which is',
 'who’re': 'who are',
 'who’ve': 'who have',
 'who’s': 'who is',
 'who’ll': 'who will',
 'who’ll’ve': 'who will have',
 'who’d': 'who would',
 'who’d’ve': 'who would have',
 'why’re': 'why are',
 'why’d': 'why did',
 'why’ve': 'why have',
 'why’s': 'why is',
 'will’ve': 'will have',
 'won’t': 'will not',
 'won’t’ve': 'will not have',
 'would’ve': 'would have',
 'wouldn’t': 'would not',
 'wouldn’t’ve': 'would not have',
 'y’all': 'you all',
 'y’all’re': 'you all are',
 'y’all’ve': 'you all have',
 'y’all’d': 'you all would',
 'y’all’d’ve': 'you all would have',
 'you’re': 'you are',
 'you’ve': 'you have',
 'you’ll’ve': 'you shall have',
 'you’ll': 'you will',
 'you’d': 'you would',
 'you’d’ve': 'you would have'}

#Expand a contracted token to its full form
def contraction_fix(word):
    """Return the expansion of ``word`` from ``contractions``.

    The lookup is an exact, case-sensitive match; a token with no entry in the
    table is returned unchanged.
    """
    return contractions.get(word,word)

In [None]:
#Preprocess the question texts and collect the cleaned strings into a corpus
def Preprocess(doc):
    """Clean every text in ``doc`` and return the cleaned strings as a list.

    Steps applied to each text, in order:
      1. whitespace-split the text and expand each token with contraction_fix;
      2. replace every character that is not an ASCII letter or digit with a
         space (this removes all punctuation, not only hyphens);
      3. mask each run of digits with one '#' per digit, capped at five
         ('7' -> '#', '1999' -> '####', '1234567' -> '#####').

    Args:
        doc: iterable of strings.

    Returns:
        list of cleaned strings, in the same order as ``doc``.
    """
    corpus=[] #cleaned question texts
    for text in tqdm(doc):
        text=" ".join([contraction_fix(w) for w in text.split()])
        text=re.sub(r'[^a-z0-9A-Z]'," ",text)
        # The previous chain of substitutions replaced single digits first, so
        # the 2-, 3-, 4- and 5+-digit patterns could never match and long
        # numbers were not capped. A single pass over whole digit runs applies
        # the intended cap.
        text=re.sub(r'[0-9]+',lambda m: "#"*min(len(m.group(0)),5),text)
        corpus.append(text)
    return corpus

In [None]:
### Build the vocabulary from the corpus
### structure: {word: number of times the word occurs in the corpus}
def vocab_build(corpus):
    """Count whitespace-separated tokens over all texts in ``corpus``.

    Returns a dict mapping each token to its occurrence count; keys appear in
    order of first occurrence.
    """
    counts={}
    for sentence in tqdm(corpus):
        for token in sentence.split():
            counts[token]=counts.get(token,0)+1
    return counts

In [None]:
### Assign each vocabulary word its position in the vocabulary
def get_word_index(vocab):
    """Map every key of ``vocab`` to a 1-based index following key order.

    Index 0 is never assigned to a word.
    """
    return {token: position for position,token in enumerate(vocab.keys(),start=1)}
###Encode the texts of a corpus as sequences of word indices
def fit_one_hot(word_index,corpus):
    """Turn each text into the list of vocabulary indices of its tokens.

    Each text is whitespace-split; a token found in ``word_index`` is replaced
    by its index, and a token missing from ``word_index`` is encoded as 0.

    Returns a list with one list of ints per text, in corpus order.
    """
    return [
        [word_index.get(token,0) for token in sentence.split()]
        for sentence in tqdm(corpus)
    ]

In [None]:
### Load the pretrained Google News word2vec vectors (binary word2vec format);
### model_embed is what later supplies a vector for each vocabulary word
file_name="./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
model_embed=KeyedVectors.load_word2vec_format(file_name,binary=True)

In [None]:
# Gather every question from both the train and the test frame
total_text=pd.concat((train_data["question_text"],test_data["question_text"]))
# Clean the combined texts
pre_text=Preprocess(total_text)
# Count word occurrences to form the vocabulary
vocabulary=vocab_build(pre_text)

In [None]:
# Word indices start at 1, so the index space needs one extra slot for 0
# (used for padding and for words missing from the vocabulary)
vocab_size=len(vocabulary)+1
# Fixed length of every encoded question
max_len=40

word_index=get_word_index(vocabulary)

### Preprocess the three splits (train, validation, test - in that order)
train_text,val_text,test_text=[
    Preprocess(split.question_text) for split in (train,val,test_data)
]

def _encode_and_pad(texts):
    """Encode ``texts`` with word_index and pad the result to max_len.

    Sequences shorter than max_len receive trailing zeros (padding="post").
    Returns the unpadded index lists together with the padded array.
    """
    ids=fit_one_hot(word_index,texts)
    return ids,pad_sequences(ids,maxlen=max_len,padding="post")

### Encode and pad the training data
encodes,train_padded=_encode_and_pad(train_text)

### Encode and pad the validation data
encodes_,val_padded=_encode_and_pad(val_text)

### Encode and pad the test data
encodes__,test_padded=_encode_and_pad(test_text)

In [None]:
# Build the embedding matrix: row i holds the 300-dimensional pretrained vector
# of the word whose index is i. Rows of words unknown to model_embed (and row 0)
# stay all-zero.
embedding_mat=np.zeros((vocab_size,300))
for word,i in tqdm(word_index.items()):
    if word in model_embed:
        embedding_mat[i]=model_embed[word]

# Tạo mô hình-huấn luyện-show kết quả

In [None]:
# Build the model:
# frozen pretrained embedding -> bidirectional LSTM -> 1-D convolution ->
# global max pooling -> dense layer with dropout -> single sigmoid output
inp = Input(shape=(max_len,))
# trainable=False keeps the pretrained embedding weights fixed during training
x = Embedding(vocab_size,300,weights=[embedding_mat],input_length=max_len,trainable=False)(inp)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Conv1D(64,3,activation="relu")(x)
x = GlobalMaxPool1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Optimizer and loss for the binary classifier
opt=Adam(learning_rate=0.001)
# The model ends in a sigmoid, so the loss receives probabilities (from_logits=False)
bin_loss=tf.keras.losses.BinaryCrossentropy(
    from_logits=False,
    label_smoothing=0,
    name='binary_crossentropy',
)

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping=tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    mode="min",
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 once val_loss has not improved for 2 epochs
reduce_lr=tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=2,
    verbose=1,
    mode="auto",
)

my_callbacks=[early_stopping,reduce_lr]

In [None]:
# Compile the model, then train it while monitoring the held-out validation split
model.compile(loss=bin_loss, optimizer=opt, metrics=['accuracy'])
history=model.fit(
    train_padded,
    train.target,
    batch_size=512,
    epochs=30,
    validation_data=(val_padded, val.target),
    callbacks=my_callbacks,
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
def _plot_history(history, metric):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        history: the object returned by model.fit; its ``history`` dict is read
            under the keys ``metric`` and ``'val_' + metric``.
        metric: name of the tracked quantity, e.g. 'accuracy' or 'loss'.
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric])
    plt.title('model '+metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    # The second curve comes from validation_data, not from the test set, so it
    # is labelled 'validation' (it was previously mislabelled 'test').
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

_plot_history(history, 'accuracy')
_plot_history(history, 'loss')

In [None]:
# Predicted probabilities on the validation split
y_pre=model.predict(val_padded)
scores_f1 = []
threshold = []
# Evaluate the validation F1 score on a grid of decision thresholds:
# np.arange starts at 0.1 and steps by 0.01, with 0.5 as its stop value
for thresh in np.arange(0.1,0.5,0.01):
    val_labels=(y_pre>thresh).astype(int)
    f1=metrics.f1_score(val.target,val_labels)
    threshold.append(thresh)
    scores_f1.append(f1)
    print("threshold {0:2.2f} f1 score:{1:2.3f}".format(thresh,f1))

# F1 score as a function of the threshold
plt.plot(threshold,scores_f1)
plt.show()


In [None]:
# Decision threshold applied to the predicted probabilities of the test set
threshold=0.375
y_test_pre=model.predict(test_padded)
# Compare against `threshold`. The previous code compared against `thresh`, the
# leftover loop variable of the F1 sweep (its last grid value), so the threshold
# chosen above was never actually used.
y_test_pre=(y_test_pre>threshold).astype(int)

### Write the submission file: one row per test question id with its 0/1 prediction
submit=pd.DataFrame()
submit["qid"]=test_data.qid
# ravel() flattens the prediction array so the column is assigned from a 1-D array
submit["prediction"]=y_test_pre.ravel()
submit.to_csv("submission.csv",index=False)