# Phân tích dữ liệu

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from nltk.corpus import stopwords

In [None]:
# Load the competition's test and train splits from the read-only Kaggle input directory.
test_df = pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
train_df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv')

**Xem dữ liệu tập train**

In [None]:
# Show the first rows of the training frame.
train_df.head()

**Xem dữ liệu tập test**

In [None]:
# Show the first rows of the test frame.
test_df.head()

**Kiểm tra số lượng câu hỏi và ratio trong tập train**

In [None]:
# Class balance of the training set: total rows, then rows per target value
# (0 and 1), computed from boolean masks instead of filtered copies of the frame.
total_questions = len(train_df)
sincere_count = int((train_df['target'] == 0).sum())
insincere_count = int((train_df['target'] == 1).sum())

print('Tổng số câu hỏi', total_questions)
print('Câu hỏi chân thành', sincere_count)
print('Câu hỏi không chân thành', insincere_count)

**Kiểm tra các từ xuất hiện nhiều nhất trong insincere và sincere questions**

In [None]:
class Vocabulary(object):
    """Word-frequency counter that skips English stop words.

    Attributes:
        vocab: dict mapping each lower-cased token to the number of times
            it has been seen by ``build_vocab``.
        STOPWORDS: set of tokens that are never counted (NLTK's English list).
    """

    def __init__(self):
        self.vocab = {}
        self.STOPWORDS = set(stopwords.words('english'))

    def build_vocab(self, lines):
        """Add the token counts of every string in ``lines`` to ``self.vocab``.

        Args:
            lines: iterable of strings (e.g. a pandas Series of questions).

        Counts accumulate across calls; nothing is returned.
        """
        for line in lines:
            # split() without an argument splits on any whitespace run, so
            # repeated spaces no longer produce '' tokens that get counted
            # as if they were words (split(' ') did).
            for word in line.lower().split():
                if word in self.STOPWORDS:
                    continue
                self.vocab[word] = self.vocab.get(word, 0) + 1

In [None]:
# Count words over the sincere questions (target == 0) and list the ten most frequent.
sincere_vocab = Vocabulary()
sincere_vocab.build_vocab(train_df.loc[train_df['target'] == 0, 'question_text'])
# Sorting on the negated count orders by descending frequency; ties keep insertion order.
sincere_vocabulary = sorted(sincere_vocab.vocab.items(), key=lambda kv: -kv[1])
for word, count in sincere_vocabulary[:10]:
    print(word, count)

In [None]:
# Count words over the insincere questions (target == 1) and list the ten most frequent.
insincere_vocab = Vocabulary()
insincere_vocab.build_vocab(train_df.loc[train_df['target'] == 1, 'question_text'])
# Sorting on the negated count orders by descending frequency; ties keep insertion order.
insincere_vocabulary = sorted(insincere_vocab.vocab.items(), key=lambda kv: -kv[1])
for word, count in insincere_vocabulary[:10]:
    print(word, count)

**Xem thử vài mẫu câu bị đánh giá không chân thành**

In [None]:
# Print the question text of the rows labelled insincere (target == 1).
print(train_df[train_df['target'] == 1]['question_text'])

# Làm sạch dữ liệu

In [None]:
import unidecode
import re
import nltk
import string
import codecs
import spacy
from nltk.corpus import stopwords
import pickle

**Clean punctuation**

In [None]:
# Characters that clean_puncts() pads with a space on each side. Besides ASCII
# punctuation the list contains typographic symbols, box-drawing characters,
# several whitespace variants ('\xa0', '\t', '\u3000', '\u202f') and a few
# accented letters ('Â', 'à', 'â', 'é', 'è', 'Ã', 'ï', 'Ø').
# NOTE(review): the accented letters are presumably here to break up mis-encoded
# text; they also split legitimate words containing them - confirm this is intended.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# replace puncts
def clean_puncts(text):
    """Return ``text`` (coerced to ``str``) with punctuation separated from words.

    Slash, hyphen and apostrophe are replaced by a single space; every entry
    of the module-level ``puncts`` list that is still present is then wrapped
    in spaces so that it becomes its own whitespace-delimited token.
    """
    cleaned = str(text)
    # These three act as word separators and are dropped entirely.
    for separator in ("/", "-", "'"):
        cleaned = cleaned.replace(separator, ' ')
    # Everything else is kept but isolated with a space on each side.
    for symbol in puncts:
        cleaned = cleaned.replace(symbol, ' ' + symbol + ' ')
    return cleaned

**Clean misspellings**

In [None]:
# Replacement table shared by clean_misspell() (regex substitution) and
# contraction_fix() (whole-word lookup): each key is rewritten to its value.
# It mixes British->American spellings, brand/jargon normalisation, English
# contractions and common typos.
# NOTE(review): several keys are written twice in this literal ('colour',
# 'centre', 'theatre', 'labour', 'organisation', 'counselling', 'travelling',
# 'cancelled'). Python keeps the LAST value for a repeated key, so
# 'travelling' -> 'traveling' and 'cancelled' -> 'canceled' are the effective
# entries; the earlier 'travel' / 'cancel' values are dead.
mispell_dict = {
    'grey': 'gray',
    'litre': 'liter',
    'labour': 'labor',
    'favour': 'favor',
    'colour': 'color',
    'centre': 'center',
    'honours': 'honor',
    'theatre': 'theater',
    'realise': 'realize',
    'defence': 'defense',
    'licence': 'license',
    'analyse': 'analyze',
    'practise': 'practice',
    'behaviour': 'behavior',
    'neighbour': 'neighbor',
    'recognise': 'recognize',
    'organisation':'organization',
    'Qoura': 'Quora',
    'quora': 'Quora',
    'Quorans': 'Quoran',
    'infty': 'infinity',
    'judgement': 'judge',
    'learnt': 'learn',
    'modelling': 'model',
    'cancelled': 'cancel',
    'travelled': 'travel',
    'travelling': 'travel',
    'aluminium': 'alumini',
    'counselling':'counseling',
    'cheque': 'bill',
    'upvote': 'agree',
    'upvotes': 'agree',
    'vape': 'cigarette',
    'jewellery': 'jewell',
    'Fiverr': 'freelance',
    'programd': 'program',
    'programme': 'program',
    'programr': 'programer',
    'programrs': 'programer',
    'WeChat': 'socialmedia',
    'Snapchat': 'socialmedia',
    'Redmi': 'cellphone',
    'Xiaomi': 'cellphone',
    'OnePlus': 'cellphone',
    'cryptos': 'crypto',
    'bitcoin': 'crypto',
    'Coinbase': 'crypto',
    'bitcoins': 'crypto',
    'ethereum': 'crypto',
    'Ethereum': 'crypto',
    'Blockchain': 'crypto',
    'blockchain': 'crypto',
    'cryptocurrency': 'crypto',
    'cryptocurrencies': 'crypto',
    '₹': 'rupee',
    'Brexit': 'Britain exit',
    'Paytm': 'Pay Through Mobile',
    'KVPY': 'Kishore Vaigyanik Protsahan Yojana',
    'GDPR': 'General Data Protection Regulation',
    'INTJ': 'Introversion Intuition Thinking Judgment',
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "i'd": "i would",
    "i'd've": "i would have", "i'll": "i will", "i'll've": "I will have", "i'm": "i am",
    "i've": "I have", "isn't": "is not", "it'd": "it would",
    "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
    "mightn't": "might not", "mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color',
    'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
    'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
    'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize',
    'youtu ': 'youtube ', 'qoura': 'quora', 'sallary': 'salary', 'whta': 'what',
    'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
    'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doi': 'do I',
    'thebest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation',
    'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis',
    'etherium': 'ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017',
    '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess',
    "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
    'demonitization': 'demonetization', 'demonetisation': 'demonetization'
}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its compiled regex once, at definition time, for clean_misspell().
mispellings, mispellings_re = _get_mispell(mispell_dict)

def clean_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced.

    Each matched substring is looked up in the module-level ``mispellings``
    table and substituted by the corresponding value.
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

def contraction_fix(word):
    """Return the ``mispell_dict`` replacement for ``word``.

    The lookup is an exact, whole-string match; a word that is not a key of
    the table is returned unchanged.
    """
    return mispell_dict.get(word, word)

In [None]:
def _clean_question(text):
    """Fix misspellings/contractions first, then separate punctuation.

    The order matters: clean_puncts() turns every apostrophe into a space, so
    if it ran first, keys such as "don't" or "i'm" could never match in
    clean_misspell(). str() keeps the tolerance for non-string cells that
    clean_puncts() provides on its own.
    """
    return clean_puncts(clean_misspell(str(text)))


# NOTE: this overwrites question_text in place, so the cell is not idempotent -
# re-run the data-loading cell before executing it a second time.
train_df["question_text"] = train_df["question_text"].apply(_clean_question)
test_df["question_text"] = test_df["question_text"].apply(_clean_question)

**Sử dụng CountVectorizer / Logistic Regression**

In [None]:
'''
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
'''

In [None]:
'''
train_file = train_df[['question_text', 'target']]
def text_processing(data_file):
     stemmer = PorterStemmer()
     #Thực hiện processing:
     data_file['txt_processed'] = data_file['question_text'].apply(lambda train_file: word_tokenize(train_file))
     print('step1 done...')
     data_file['txt_processed'] = data_file['txt_processed'].apply(lambda x: [item for item in x if item.isalpha()])
     print('step2 done..')
     #data_file['txt_processed'] = data_file['txt_processed'].apply(lambda x: [item for item in x if item not in stop_words])
     #print('step3 done..')
     data_file['txt_processed'] = data_file['txt_processed'].apply(lambda x: [stemmer.stem(item) for item in x])
     print('done')
     return data_file
'''

In [None]:
#train_file_sw = text_processing(train_file)
#train_file_sw.tail()

**Xác định fscore của model**

In [None]:
'''
# Hàm tính fscore của model
def get_fscore_matrix(fitted_clf, model_name):
    print(model_name, ' :')
    
    # get classes predictions for the classification report 
    y_train_pred, y_pred = fitted_clf.predict(X_train), fitted_clf.predict(X_test)
    print(classification_report(y_test, y_pred), '\n') # target_names=y
    
    # computes probabilities keep the ones for the positive outcome only      
    print(f'F1-score = {f1_score(y_test, y_pred):.2f}')
'''

In [None]:
'''
# Convert a collection of text documents to string
train_file_sw['str_processed'] = train_file_sw['txt_processed'].apply(lambda x: " ".join(x))
train_file_sw.head()
'''

In [None]:
'''
pl_model = pipeline.fit(X_train, y_train)
pl_model
'''

In [None]:
#get_fscore_matrix(pl_model, 'Sử dụng Pipeline:')

In [None]:
#pd.read_csv("../input/quora-insincere-questions-classification/sample_submission.csv").head()

In [None]:
'''
test_df = text_processing(test_df)
test_df['str_processed'] = test_df['txt_processed'].apply(lambda x: " ".join(x))
test_df.head()
'''

In [None]:
'''
y_pred_final = pl_model.predict(test_df['str_processed'])
df_submission = pd.DataFrame({"qid":test_df["qid"], "prediction":y_pred_final})
df_submission.head()
'''

In [None]:
#df_submission.to_csv('submission.csv', index=False)

**Sử dụng one-hot / LSTM**

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
import re ## Regular expresssions
from nltk import word_tokenize
from sklearn import metrics
from gensim.models import KeyedVectors
import operator
import gc

In [None]:
from tqdm import tqdm
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D,GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model,load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

**Chia tập dữ liệu Train thành train data và validation data**

In [None]:
# Hold out 20% of the training frame for validation; stratified on target and
# seeded (random_state=123) so the split is the same on every run.
train_data,val_data = train_test_split(train_df,test_size=0.2,stratify=train_df.target,random_state=123)

In [None]:
!unzip ../input/quora-insincere-questions-classification/embeddings.zip

In [None]:
def Preprocess(doc):
    """Normalise an iterable of question strings for tokenisation.

    For each text: expand whole-word contractions via ``contraction_fix``,
    replace every character other than ASCII letters/digits with a space, and
    mask each run of digits with one '#' per digit, capped at five
    ('7' -> '#', '2018' -> '####', '1234567' -> '#####').

    Args:
        doc: iterable of strings.

    Returns:
        list of cleaned strings, in input order.
    """
    corpus=[]
    for text in tqdm(doc):
        text=" ".join([contraction_fix(w) for w in text.split()])
        text=re.sub(r'[^a-z0-9A-Z]'," ",text)
        # Mask whole digit runs in one pass. The previous chain of substitutions
        # ran the single-digit rule first, which consumed every digit and left
        # the {2}, {3}, {4} and {5,} rules with nothing to match, so numbers
        # longer than five digits were never capped.
        text=re.sub(r'[0-9]+',lambda m: '#'*min(len(m.group(0)),5),text)
        corpus.append(text)
    return corpus

**Xây dựng bộ từ vựng và encoding**

In [None]:
def vocab_build(corpus):
    """Count whitespace-separated tokens over all texts in ``corpus``.

    Returns a dict mapping token -> occurrence count, with keys in order of
    first appearance.
    """
    vocab = {}
    for text in tqdm(corpus):
        for token in text.split():
            vocab[token] = vocab.get(token, 0) + 1
    return vocab

In [None]:
def get_word_index(vocab):
    """Map each key of ``vocab`` to a 1-based integer id, in key order.

    Id 0 is never assigned here.
    """
    return {word: position for position, word in enumerate(vocab.keys(), start=1)}

def fit_one_hot(word_index,corpus):
    """Encode each text in ``corpus`` as a list of integer word ids.

    Tokens are obtained with ``str.split()``; a token missing from
    ``word_index`` is encoded as 0.

    Returns a list with one id-list per input text.
    """
    all_questions = []
    for text in tqdm(corpus):
        all_questions.append([word_index.get(token, 0) for token in text.split()])
    return all_questions

In [None]:

# Load the GoogleNews word2vec vectors (binary format) from the folder
# extracted from embeddings.zip into the working directory.
file_name="./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
model_embed=KeyedVectors.load_word2vec_format(file_name,binary=True)

In [None]:
# Build the vocabulary over train AND test questions together, so every token
# that occurs in either split gets an id.
total_text=pd.concat([train_df.question_text,test_df.question_text])
pre_text=Preprocess(total_text)
vocabulary=vocab_build(pre_text)

In [None]:
# +1 because word ids start at 1 (see get_word_index); id 0 is left for padding/unknown.
vocab_size=len(vocabulary)+1

max_len=40 # every sequence is padded/truncated to a fixed length of 40 ids

word_index=get_word_index(vocabulary)

# Pre-process each split separately (train/validation come from train_test_split).
train_text=Preprocess(train_data.question_text)
val_text=Preprocess(val_data.question_text)
test_text=Preprocess(test_df.question_text)

# Encode tokens as ids, then pad at the end ("post") up to max_len.
encodes=fit_one_hot(word_index,train_text) 
train_padded=pad_sequences(encodes,maxlen=max_len,padding="post") 

encodes_=fit_one_hot(word_index,val_text)  
val_padded=pad_sequences(encodes_,maxlen=max_len,padding="post") 

encodes__=fit_one_hot(word_index,test_text) 
test_padded=pad_sequences(encodes__,maxlen=max_len,padding="post") 

In [None]:
# Row i holds the 300-d pretrained vector of the word whose id is i.
# Row 0 and the rows of words without a pretrained vector stay all-zero.
embedding_mat = np.zeros((vocab_size, 300))
for word, i in tqdm(word_index.items()):
    try:
        embedding_mat[i] = model_embed[word]
    except KeyError:
        # No pretrained vector for this word: keep the zero row.
        pass

In [None]:
# Build the model: frozen pretrained embeddings -> BiLSTM -> Conv1D ->
# global max pooling -> dense head with a single sigmoid output.
inp = Input(shape=(max_len,))
x = Embedding(vocab_size,300,weights=[embedding_mat],input_length=max_len,trainable=False)(inp)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Conv1D(64,3,activation="relu")(x)
x = GlobalMaxPool1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(1, activation="sigmoid")(x)
biLSTM = Model(inputs=inp, outputs=x)
plot_model(biLSTM, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
biLSTM.summary()

In [None]:
# Optimiser and loss for the binary classifier. from_logits=False because the
# model's last layer already applies a sigmoid.
opt=Adam(learning_rate=0.001)
bin_loss=tf.keras.losses.BinaryCrossentropy(
                                            from_logits=False, 
                                            label_smoothing=0,
                                            name='binary_crossentropy'
                                        )

# Stop after 3 epochs without val_loss improvement and restore the best weights.
early_stopping=tf.keras.callbacks.EarlyStopping(
                                                monitor="val_loss",
                                                patience=3,
                                                mode="min",
                                                restore_best_weights=True
                                              )

# Multiply the learning rate by 0.2 after 2 epochs without val_loss improvement.
reduce_lr=tf.keras.callbacks.ReduceLROnPlateau(
                                                monitor="val_loss",
                                                factor=0.2,
                                                patience=2,
                                                verbose=1,
                                                mode="auto"
                                            )

my_callbacks=[early_stopping,reduce_lr]

In [None]:
# Compile and train for up to 30 epochs; the callbacks may stop training early
# or lower the learning rate based on the validation loss.
biLSTM.compile(loss=bin_loss, optimizer=opt, metrics=['accuracy'])
trained = biLSTM.fit(train_padded, train_data.target, batch_size=512, epochs=30, validation_data=(val_padded, val_data.target),callbacks=my_callbacks)

In [None]:
import matplotlib.pyplot as plt

# One figure per tracked metric: training curve vs. validation curve per epoch.
# The second curve comes from the validation split passed to fit(), so it is
# labelled 'validation' (it was mislabelled 'test').
for metric_name in ('accuracy', 'loss'):
    plt.plot(trained.history[metric_name])
    plt.plot(trained.history['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Sweep decision thresholds from 0.10 up to (not including) 0.50 in steps of 0.01
# on the validation predictions and report the F1 score obtained at each one.
y_pre = biLSTM.predict(val_padded)
scores_f1, threshold = [], []

for thresh in np.arange(0.1,0.5,0.01):
    val_labels = (y_pre>thresh).astype(int)
    f1 = metrics.f1_score(val_data.target, val_labels)
    threshold.append(thresh)
    scores_f1.append(f1)
    print("threshold {0:2.2f} f1 score:{1:2.3f}".format(thresh,f1))

# F1 as a function of the threshold.
plt.plot(threshold,scores_f1)
plt.show()

In [None]:
# Decision threshold chosen from the validation sweep.
threshold = 0.375
y_test_pre = biLSTM.predict(test_padded)
# Compare against the chosen threshold. The previous code used `thresh`, the
# loop variable leaked from the sweep cell (its last value, about 0.49), so the
# 0.375 above was silently ignored. ravel() turns the (n, 1) model output into
# the 1-D vector a DataFrame column expects.
y_test_pre = (y_test_pre > threshold).astype(int).ravel()

submit = pd.DataFrame()
submit["qid"] = test_df.qid
submit["prediction"] = y_test_pre
submit.to_csv("submission.csv", index=False)


In [None]:
#print(y_test_pre[2])

**Sử dụng pre-trained model BERT**

In [None]:
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class QuoraDataset(Dataset):
    """Dataset yielding ``(input_ids, target)`` pairs for question classification.

    ``X`` and ``y`` are re-indexed from 0 so items can be fetched by position;
    ``tokenizer`` is called on each text when the item is requested.
    """

    def __init__(self, X, y, tokenizer):
        self.text = X.reset_index(drop=True)
        self.targets = y.reset_index(drop=True)
        self.tok = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        question, label = self.text[idx], self.targets[idx]
        # Pad or truncate every question to exactly 30 token ids.
        encoded = self.tok(
            question,
            padding='max_length',
            truncation=True,
            max_length=30,
            return_tensors="pt",
        )
        # Index [0] drops the leading batch dimension of the tokenizer output.
        return encoded["input_ids"][0], tensor(label)

In [None]:
# Split data: 90% train / 10% validation, stratified on the target.
# random_state makes the split reproducible across kernel restarts (the same
# seed as the earlier train/validation split).
X_train, X_valid, y_train, y_valid = train_test_split(train_df["question_text"], train_df["target"],
                                                      stratify=train_df["target"], test_size=0.1,
                                                      random_state=123)

In [None]:
# Wrap both splits in QuoraDataset.
# NOTE(review): `tokenizer` is only created in a commented-out cell above;
# that cell must be enabled before this one can run.
train_ds = QuoraDataset(X_train, y_train, tokenizer) 
valid_ds = QuoraDataset(X_valid, y_valid, tokenizer)

In [None]:
# Prepare data loaders: batches of 128 for training and 256 for validation,
# grouped into one DataLoaders object placed on the GPU.
# NOTE(review): DataLoader/DataLoaders are not imported anywhere in this
# notebook - presumably fastai; confirm and add the import.
train_dl = DataLoader(train_ds, bs=128)
valid_dl = DataLoader(valid_ds, bs=256)
data_loader = DataLoaders(train_dl, valid_dl).to("cuda")

In [None]:
# Build the BERT model

# Load the pretrained checkpoint in training mode and replace its classification
# head with a 768 -> 1024 -> 2 MLP (ReLU, dropout 0.5).
# NOTE(review): this loads 'bert-base-cased', while the commented-out tokenizer
# cell above uses 'bert-base-uncased'; the two should name the same checkpoint.
bert = AutoModelForSequenceClassification.from_pretrained('bert-base-cased').train()
classifier = nn.Sequential(
    nn.Linear(768, 1024),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(1024, 2)
)
bert.classifier = classifier
class BertClassifier(Module):
    """Thin wrapper that returns only the logits of the wrapped model."""
    # NOTE(review): __init__ does not call super().__init__(); presumably `Module`
    # is fastai's Module, which handles that - confirm the import.
    def __init__(self, bert):
        self.bert = bert
    def forward(self, x):
        x = self.bert(x)
        return x.logits

model = BertClassifier(bert).to("cuda")

In [None]:
# Class counts come from train_df: `train_file` is only defined inside a
# commented-out cell, so referencing it raised NameError on a fresh kernel.
num_sin = (train_df["target"] == 0).sum()  # number of sincere questions
num_insin = (train_df["target"] == 1).sum()  # number of insincere questions
n = num_sin + num_insin
# The rarer class gets the larger loss weight.
class_weights = tensor([n / (n+num_sin), n / (n+num_insin)]).to('cuda')
learn = Learner(data_loader, model,
                loss_func=nn.CrossEntropyLoss(weight=class_weights),
                metrics=[accuracy, F1Score()]).to_fp16()
learn.lr_find()  # plot the learning-rate finder curve

In [None]:
# Fine-tune for 2 epochs with a peak learning rate of 2e-5.
learn.fit_one_cycle(2, lr_max=2e-5)