In [None]:
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import re

# Define LSTM-based model

In [None]:
class bi_lstm(nn.Module):
    """BiLSTM -> Conv1d -> global max-pool -> three linear layers.

    Args:
        input_size: Width of one token embedding; must equal the number of
            columns of ``weight_mattrix``.
        hidden_size: Hidden units per LSTM direction. The bidirectional LSTM
            therefore emits ``2 * hidden_size`` features per time step.
        weight_mattrix: 2-D tensor of pretrained embeddings, one row per
            vocabulary index.

    ``forward`` takes a ``(batch, seq_len)`` tensor of token indices and
    returns ``(batch, 2)`` unnormalised class scores. ``seq_len`` must be at
    least 3, the kernel size of the convolution.
    """
    def __init__(self, input_size, hidden_size,weight_mattrix):
        super(bi_lstm, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Convert to float32 once here instead of casting every batch in forward().
        self.embed = nn.Embedding.from_pretrained(weight_mattrix.float())
        self.bilstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        # Input channels follow the BiLSTM output width (forward + backward).
        self.conv1d = nn.Conv1d(2 * hidden_size, 64, 3)
        self.max_pool_1d = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(64, 128)
        self.drop = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64,2)
    def forward(self, x):
        out = self.embed(x)
        # No explicit (h0, c0): nn.LSTM starts from zero states on the input's
        # own device, so no global `device` is needed.
        out1, _ = self.bilstm(out)
        # Conv1d wants (batch, channels, length). Swap the axes; a reshape
        # would keep the memory order and mix features with time steps.
        out1 = out1.transpose(1, 2)
        out2 = self.conv1d(out1)
        out3 = self.max_pool_1d(out2)
        # (batch, 64, 1) -> (batch, 64)
        out3 = out3.flatten(1)
        out4 = self.fc1(out3)
        out4 = self.drop(out4)
        out5 = self.fc2(out4)
        out6 = self.fc3(out5)
        return out6

# Define Question Dataset Class

In [None]:
class QuestionDataset(Dataset):
    """Pairs each encoded question with its label (train) or its id (test).

    Args:
        df: DataFrame with a ``target`` column (train/validation mode) or a
            ``qid`` column (test mode). Rows are addressed by position.
        encode_mattrix: Indexable collection of encoded questions; entry ``i``
            belongs to row ``i`` of ``df``.
        test: When truthy, items carry ``qid`` instead of ``target``.

    Raises:
        ValueError: If ``df`` and ``encode_mattrix`` differ in length.
    """
    def __init__(self,df,encode_mattrix,test=False):
        super(QuestionDataset, self).__init__()
        # Fail at construction rather than with an IndexError mid-epoch.
        if len(df) != len(encode_mattrix):
            raise ValueError(
                "df has {} rows but encode_mattrix has {} entries".format(
                    len(df), len(encode_mattrix)))
        self.df = df
        self.encode = encode_mattrix
        self.is_test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        question_text = self.encode[index]
        if self.is_test:
            qid = self.df['qid'].iloc[index]
            return {'question_text': question_text,'qid':qid}
        target = self.df['target'].iloc[index]
        return {'question_text': question_text, 'target': target}

# Define EarlyStopping module

In [None]:
class EarlyStopping():
    """Flags when the validation loss has stopped improving.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``early_stop`` becomes True.
        min_delta: The loss must drop by more than this amount below the best
            loss seen so far to count as an improvement.

    Call the instance once per epoch with the validation loss, then read
    ``early_stop``.
    """
    def __init__(self, patience, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
    def __call__(self, val_loss):
        if self.best_loss is None:
            # First call only records the baseline.
            self.best_loss = val_loss
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            # reset counter if validation loss improves
            self.counter = 0
        else:
            # Everything that is not a strict improvement counts against
            # patience, including a gain of exactly min_delta (e.g. an
            # unchanged loss with min_delta=0) and a NaN loss.
            self.counter += 1
            print(f"INFO: Early stopping counter {self.counter} of {self.patience}")
            if self.counter >= self.patience:
                print('INFO: Early stopping')
                self.early_stop = True

# Training and validation functions

In [None]:
#Train function
def train_model(dataloader,dataset,optimizer,device,model,loss_function):
    """Run one training epoch.

    Args:
        dataloader: Yields dict batches with ``'question_text'`` and
            ``'target'`` tensors.
        dataset: Unused; kept so existing callers keep working. The progress
            bar length now comes from ``len(dataloader)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        model: Module mapping a question batch to per-class scores.
        loss_function: Callable ``(scores, target) -> scalar loss tensor``.

    Returns:
        ``(mean loss per batch, accuracy in percent)`` for the epoch.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    print('Training')
    model.train()
    train_loss=0.0
    train_acc = 0.0
    counter = 0
    total = 0
    # len(dataloader) is the real batch count, including a final partial batch.
    for data in tqdm(dataloader,total=len(dataloader)):
        counter += 1
        question = data['question_text'].to(device)
        target = data['target'].to(device)
        total += target.size(0)
        optimizer.zero_grad()
        result = model(question)
        loss = loss_function(result,target)
        train_loss += loss.item()
        # Predicted class = index of the highest score.
        _, preds = result.max(1)
        train_acc += (preds == target).sum().item()
        loss.backward()
        optimizer.step()
    if counter == 0:
        raise ValueError('train_model received an empty dataloader')
    return train_loss/counter,100. * train_acc/total

In [None]:
#valid function
def valid_model(dataloader,dataset,model,device,loss_function):
    """Evaluate the model on a labelled loader without updating weights.

    Args:
        dataloader: Yields dict batches with ``'question_text'`` and
            ``'target'`` tensors.
        dataset: Unused; kept so existing callers keep working. The progress
            bar length now comes from ``len(dataloader)``.
        model: Module mapping a question batch to per-class scores.
        device: Device the batch tensors are moved to.
        loss_function: Callable ``(scores, target) -> scalar loss tensor``.

    Returns:
        ``(mean loss per batch, accuracy in percent)``.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    print('Validating')
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    counter=0
    total=0
    with torch.no_grad():
        # len(dataloader) is the real batch count, including a final partial batch.
        for data in tqdm(dataloader,total=len(dataloader)):
            counter += 1
            question = data['question_text'].to(device)
            target = data['target'].to(device)
            total += target.size(0)
            result = model(question)
            # Predicted class = index of the highest score.
            _,preds = result.max(1)
            val_acc += (preds == target).sum().item()
            loss = loss_function(result,target)
            val_loss += loss.item()
    if counter == 0:
        raise ValueError('valid_model received an empty dataloader')
    return val_loss/counter, 100. * val_acc/total

# Define contractions and their normal words

In [None]:

# Lookup table used by contraction_fix: maps a contraction, slang word or
# abbreviated month name to its expanded form. Keys are matched exactly
# (case-sensitive). Most contractions are listed twice: once with the ASCII
# apostrophe (') and once with the typographic apostrophe (’).
contractions={"I'm": 'I am',
 "I'm'a": 'I am about to',
 "I'm'o": 'I am going to',
 "I've": 'I have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'd": 'I would',
 "I'd've": 'I would have',
 'Whatcha': 'What are you',
 "amn't": 'am not',
 "ain't": 'are not',
 "aren't": 'are not',
 "'cause": 'because',
 "can't": 'can not',
 "can't've": 'can not have',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "daren't": 'dare not',
 "daresn't": 'dare not',
 "dasn't": 'dare not',
 "didn't": 'did not',
 'didn’t': 'did not',
 "don't": 'do not',
 'don’t': 'do not',
 "doesn't": 'does not',
 "e'er": 'ever',
 "everyone's": 'everyone is',
 'finna': 'fixing to',
 'gimme': 'give me',
 "gon't": 'go not',
 'gonna': 'going to',
 'gotta': 'got to',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he've": 'he have',
 "he's": 'he is',
 "he'll": 'he will',
 "he'll've": 'he will have',
 "he'd": 'he would',
 "he'd've": 'he would have',
 "here's": 'here is',
 "how're": 'how are',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how's": 'how is',
 "how'll": 'how will',
 "isn't": 'is not',
 "it's": 'it is',
 "'tis": 'it is',
 "'twas": 'it was',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it'd": 'it would',
 "it'd've": 'it would have',
 'kinda': 'kind of',
 "let's": 'let us',
 'luv': 'love',
 "ma'am": 'madam',
 "may've": 'may have',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "ne'er": 'never',
 "o'": 'of',
 "o'clock": 'of the clock',
 "ol'": 'old',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "o'er": 'over',
 "shan't": 'shall not',
 "sha'n't": 'shall not',
 "shalln't": 'shall not',
 "shan't've": 'shall not have',
 "she's": 'she is',
 "she'll": 'she will',
 "she'd": 'she would',
 "she'd've": 'she would have',
 "should've": 'should have',
 "shouldn't": 'should not',
 "shouldn't've": 'should not have',
 "so've": 'so have',
 "so's": 'so is',
 "somebody's": 'somebody is',
 "someone's": 'someone is',
 "something's": 'something is',
 'sux': 'sucks',
 "that're": 'that are',
 "that's": 'that is',
 "that'll": 'that will',
 "that'd": 'that would',
 "that'd've": 'that would have',
 'em': 'them',
 "there're": 'there are',
 "there's": 'there is',
 "there'll": 'there will',
 "there'd": 'there would',
 "there'd've": 'there would have',
 "these're": 'these are',
 "they're": 'they are',
 "they've": 'they have',
 "they'll": 'they will',
 "they'll've": 'they will have',
 "they'd": 'they would',
 "they'd've": 'they would have',
 "this's": 'this is',
 "those're": 'those are',
 "to've": 'to have',
 'wanna': 'want to',
 "wasn't": 'was not',
 "we're": 'we are',
 "we've": 'we have',
 "we'll": 'we will',
 "we'll've": 'we will have',
 "we'd": 'we would',
 "we'd've": 'we would have',
 "weren't": 'were not',
 "what're": 'what are',
 "what'd": 'what did',
 "what've": 'what have',
 "what's": 'what is',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "when've": 'when have',
 "when's": 'when is',
 "where're": 'where are',
 "where'd": 'where did',
 "where've": 'where have',
 "where's": 'where is',
 "which's": 'which is',
 "who're": 'who are',
 "who've": 'who have',
 "who's": 'who is',
 "who'll": 'who will',
 "who'll've": 'who will have',
 "who'd": 'who would',
 "who'd've": 'who would have',
 "why're": 'why are',
 "why'd": 'why did',
 "why've": 'why have',
 "why's": 'why is',
 "will've": 'will have',
 "won't": 'will not',
 "won't've": 'will not have',
 "would've": 'would have',
 "wouldn't": 'would not',
 "wouldn't've": 'would not have',
 "y'all": 'you all',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "you're": 'you are',
 "you've": 'you have',
 "you'll've": 'you shall have',
 "you'll": 'you will',
 "you'd": 'you would',
 "you'd've": 'you would have',
 'jan.': 'january',
 'feb.': 'february',
 'mar.': 'march',
 'apr.': 'april',
 'jun.': 'june',
 'jul.': 'july',
 'aug.': 'august',
 'sep.': 'september',
 'oct.': 'october',
 'nov.': 'november',
 'dec.': 'december',
 'I’m': 'I am',
 'I’m’a': 'I am about to',
 'I’m’o': 'I am going to',
 'I’ve': 'I have',
 'I’ll': 'I will',
 'I’ll’ve': 'I will have',
 'I’d': 'I would',
 'I’d’ve': 'I would have',
 'amn’t': 'am not',
 'ain’t': 'are not',
 'aren’t': 'are not',
 '’cause': 'because',
 'can’t': 'can not',
 'can’t’ve': 'can not have',
 'could’ve': 'could have',
 'couldn’t': 'could not',
 'couldn’t’ve': 'could not have',
 'daren’t': 'dare not',
 'daresn’t': 'dare not',
 'dasn’t': 'dare not',
 'doesn’t': 'does not',
 'e’er': 'ever',
 'everyone’s': 'everyone is',
 'gon’t': 'go not',
 'hadn’t': 'had not',
 'hadn’t’ve': 'had not have',
 'hasn’t': 'has not',
 'haven’t': 'have not',
 'he’ve': 'he have',
 'he’s': 'he is',
 'he’ll': 'he will',
 'he’ll’ve': 'he will have',
 'he’d': 'he would',
 'he’d’ve': 'he would have',
 'here’s': 'here is',
 'how’re': 'how are',
 'how’d': 'how did',
 'how’d’y': 'how do you',
 'how’s': 'how is',
 'how’ll': 'how will',
 'isn’t': 'is not',
 'it’s': 'it is',
 '’tis': 'it is',
 '’twas': 'it was',
 'it’ll': 'it will',
 'it’ll’ve': 'it will have',
 'it’d': 'it would',
 'it’d’ve': 'it would have',
 'let’s': 'let us',
 'ma’am': 'madam',
 'may’ve': 'may have',
 'mayn’t': 'may not',
 'might’ve': 'might have',
 'mightn’t': 'might not',
 'mightn’t’ve': 'might not have',
 'must’ve': 'must have',
 'mustn’t': 'must not',
 'mustn’t’ve': 'must not have',
 'needn’t': 'need not',
 'needn’t’ve': 'need not have',
 'ne’er': 'never',
 'o’': 'of',
 'o’clock': 'of the clock',
 'ol’': 'old',
 'oughtn’t': 'ought not',
 'oughtn’t’ve': 'ought not have',
 'o’er': 'over',
 'shan’t': 'shall not',
 'sha’n’t': 'shall not',
 'shalln’t': 'shall not',
 'shan’t’ve': 'shall not have',
 'she’s': 'she is',
 'she’ll': 'she will',
 'she’d': 'she would',
 'she’d’ve': 'she would have',
 'should’ve': 'should have',
 'shouldn’t': 'should not',
 'shouldn’t’ve': 'should not have',
 'so’ve': 'so have',
 'so’s': 'so is',
 'somebody’s': 'somebody is',
 'someone’s': 'someone is',
 'something’s': 'something is',
 'that’re': 'that are',
 'that’s': 'that is',
 'that’ll': 'that will',
 'that’d': 'that would',
 'that’d’ve': 'that would have',
 'there’re': 'there are',
 'there’s': 'there is',
 'there’ll': 'there will',
 'there’d': 'there would',
 'there’d’ve': 'there would have',
 'these’re': 'these are',
 'they’re': 'they are',
 'they’ve': 'they have',
 'they’ll': 'they will',
 'they’ll’ve': 'they will have',
 'they’d': 'they would',
 'they’d’ve': 'they would have',
 'this’s': 'this is',
 'those’re': 'those are',
 'to’ve': 'to have',
 'wasn’t': 'was not',
 'we’re': 'we are',
 'we’ve': 'we have',
 'we’ll': 'we will',
 'we’ll’ve': 'we will have',
 'we’d': 'we would',
 'we’d’ve': 'we would have',
 'weren’t': 'were not',
 'what’re': 'what are',
 'what’d': 'what did',
 'what’ve': 'what have',
 'what’s': 'what is',
 'what’ll': 'what will',
 'what’ll’ve': 'what will have',
 'when’ve': 'when have',
 'when’s': 'when is',
 'where’re': 'where are',
 'where’d': 'where did',
 'where’ve': 'where have',
 'where’s': 'where is',
 'which’s': 'which is',
 'who’re': 'who are',
 'who’ve': 'who have',
 'who’s': 'who is',
 'who’ll': 'who will',
 'who’ll’ve': 'who will have',
 'who’d': 'who would',
 'who’d’ve': 'who would have',
 'why’re': 'why are',
 'why’d': 'why did',
 'why’ve': 'why have',
 'why’s': 'why is',
 'will’ve': 'will have',
 'won’t': 'will not',
 'won’t’ve': 'will not have',
 'would’ve': 'would have',
 'wouldn’t': 'would not',
 'wouldn’t’ve': 'would not have',
 'y’all': 'you all',
 'y’all’re': 'you all are',
 'y’all’ve': 'you all have',
 'y’all’d': 'you all would',
 'y’all’d’ve': 'you all would have',
 'you’re': 'you are',
 'you’ve': 'you have',
 'you’ll’ve': 'you shall have',
 'you’ll': 'you will',
 'you’d': 'you would',
 'you’d’ve': 'you would have'}

# Expand a contraction / abbreviation to its standard form
def contraction_fix(word):
    """Return the expansion of ``word`` from ``contractions``.

    The lookup is an exact match; a word that is not a key of the table is
    returned unchanged.
    """
    return contractions.get(word, word)

In [None]:
def Preprocess(doc):
    """Normalise raw question strings for vocabulary building and encoding.

    For every text in ``doc``:
      1. whitespace-separated tokens are expanded with ``contraction_fix``;
      2. every character that is not an ASCII letter or digit becomes a space;
      3. each run of digits becomes one ``#`` per digit, capped at five
         (``"2019"`` -> ``"####"``, ``"1234567"`` -> ``"#####"``).

    The digit rules used to run shortest-first, so the single-digit rule
    consumed every digit and the longer rules, including the five-or-more
    cap, could never match. A single pass over whole digit runs restores the
    intended masking.

    Args:
        doc: Iterable of strings.

    Returns:
        List of cleaned strings, in the same order as ``doc``.
    """
    non_alnum = re.compile(r'[^a-z0-9A-Z]')
    digit_run = re.compile(r'[0-9]+')

    def mask_digits(match):
        # One '#' per digit, with at most five for long numbers.
        return "#" * min(len(match.group()), 5)

    corpus=[] # cleaned question texts
    for text in tqdm(doc):
        text=" ".join(contraction_fix(w) for w in text.split())
        # Drop punctuation (hyphens, apostrophes, ...) by turning it into spaces
        text=non_alnum.sub(" ",text)
        # Replace numbers with '#' placeholders
        text=digit_run.sub(mask_digits,text)
        corpus.append(text)
    return corpus

In [None]:
### Build the vocabulary from the corpus
### Structure: {word: number of times the word occurs in the corpus}
def vocab_build(corpus):
    """Count whitespace-separated tokens over every text in ``corpus``.

    Returns a dict mapping each token to its number of occurrences; keys are
    in order of first appearance.
    """
    counts={}
    for sentence in tqdm(corpus):
        for token in sentence.split():
            counts[token]=counts.get(token,0)+1
    return counts

In [None]:
### Assign every vocabulary word its position in the vocabulary
def get_word_index(vocab):
    """Map each key of ``vocab`` to a 1-based index in iteration order.

    Index 0 is never assigned here.
    """
    return {word: position for position, word in enumerate(vocab, start=1)}
### Encode the texts of the corpus
def fit_one_hot(word_index,corpus):
    """Turn each text into a list of integer word indices.

    Despite the name, no one-hot vectors are built: every whitespace-separated
    word becomes ``word_index[word]``, or 0 when the word is not in
    ``word_index``.

    Returns:
        One list of ints per text, in the same order as ``corpus``.
    """
    encoded=[]
    for sentence in tqdm(corpus):
        # A sentence is the sequence of vocabulary positions of its words;
        # out-of-vocabulary words are encoded as 0.
        encoded.append([word_index.get(token,0) for token in sentence.split()])
    return encoded

In [None]:
# Extract the competition's embeddings archive into the current working directory.
!unzip ../input/quora-insincere-questions-classification/embeddings.zip

# Load the pretrained word embeddings

In [None]:
### Load the pretrained Google News embedding (the model responsible for embedding words)
# Relative path to the vectors; presumably created by extracting embeddings.zip — verify.
file_name="./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
# binary=True because the file is read in word2vec's binary format.
model_embed=KeyedVectors.load_word2vec_format(file_name,binary=True)

In [None]:
# Read the competition's test and train CSV files.
test_data=pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
train_data=pd.read_csv("../input/quora-insincere-questions-classification/train.csv")
# Hold out 20% of the labelled rows for validation, stratified on `target`;
# the fixed random_state keeps the split repeatable.
train,val = train_test_split(train_data,test_size=0.2,stratify=train_data.target,random_state=123)
# Disabled alternative: keep a stratified 30% subsample first, then split it 80/20.
#keep,ignore=train_test_split(train_data,test_size=0.7,stratify=train_data.target,random_state=123)
#train,val = train_test_split(keep,test_size=0.2,stratify=keep.target,random_state=123)

In [None]:
# Collect every question from the train and test datasets
total_text=pd.concat([train_data.question_text,test_data.question_text])
# Preprocess
pre_text=Preprocess(total_text)
# Build the vocabulary (covers train, validation and test questions)
vocabulary=vocab_build(pre_text)

In [None]:
# +1 because word indices start at 1; index 0 is left for padding and for
# words missing from word_index.
vocab_size=len(vocabulary)+1
# Fixed length of every encoded question (shorter ones are padded, longer ones truncated)
MAX_SEQ_LEN=40

word_index=get_word_index(vocabulary)

# Clean each split with the same preprocessing used to build the vocabulary.
train_text=Preprocess(train.question_text)
val_text=Preprocess(val.question_text)
test_text=Preprocess(test_data.question_text)

# Words -> integer indices.
train_encode = fit_one_hot(word_index,train_text)
val_encode = fit_one_hot(word_index,val_text)
test_encode = fit_one_hot(word_index,test_text)


# padding="post" appends the padding after the question's own tokens.
train_padded=pad_sequences(train_encode,maxlen=MAX_SEQ_LEN,padding="post")
val_padded=pad_sequences(val_encode,maxlen=MAX_SEQ_LEN,padding="post")
test_padded=pad_sequences(test_encode,maxlen=MAX_SEQ_LEN,padding="post")

In [None]:
# Preview the first rows of the training split.
train.head()

# Create dataset

In [None]:
# Labelled splits yield `target`; the test split yields `qid` instead.
train_dataset = QuestionDataset(df=train,encode_mattrix=train_padded)
valid_dataset = QuestionDataset(df=val,encode_mattrix=val_padded)
test_dataset = QuestionDataset(df=test_data,encode_mattrix=test_padded,test=True)

# Embedding questions

In [None]:
# Build the embedding matrix for the words of the vocabulary.
# float32 is the precision the model computes in, so a float64 matrix would
# only double the memory footprint. Row 0 (padding / unknown words) and rows
# of words without a pretrained vector stay all-zero.
embedding_mat=np.zeros((vocab_size,300),dtype=np.float32)
for word,i in tqdm(word_index.items()):
    try:
        embedding_mat[i]=model_embed[word]
    except KeyError:
        # Word has no pretrained vector: keep its zero row.
        continue

weight_mattrix = torch.from_numpy(embedding_mat)

# Create dataloader

In [None]:
# Batches of 512 samples. Only the training loader shuffles; validation and
# test batches follow dataset order.
train_dataloader = DataLoader(train_dataset,batch_size=512,shuffle=True)
val_dataloader = DataLoader(valid_dataset,batch_size=512,shuffle=False)
test_dataloader = DataLoader(test_dataset,batch_size=512,shuffle=False)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Create model, optimizer, and loss function

In [None]:
# 300 = embedding width (second dimension of embedding_mat), 128 = LSTM hidden size.
model = bi_lstm(300,128,weight_mattrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)
# 'min' mode: the scheduler is stepped with the validation loss in the training loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',patience=1)
# NOTE(review): with patience=1 training stops at the first epoch that fails to
# improve the best validation loss, so the scheduler above may rarely get the
# chance to lower the learning rate — confirm this is intended.
earlystopping = EarlyStopping(patience=1)

# Training and Validating

In [None]:
# Train for at most 10 epochs (the printed epoch number is 0-based).
for epoch in range(10):
    print('Epoch{epoch} of 10:'.format(epoch=epoch))
    train_loss , train_acc = train_model(train_dataloader,train_dataset,optimizer,device,model,criterion)
    print('Trainloss:{loss:.3f},Train-accuracy:{acc:.3f}'.format(loss=train_loss,acc=train_acc))
    val_loss, val_acc  = valid_model(val_dataloader,valid_dataset,model,device,criterion)
    print('Valloss:{loss:.3f},Val-accuracy:{acc:.3f}'.format(loss=val_loss,acc=val_acc))
    # Both the LR scheduler and the early-stopping check are driven by the validation loss.
    scheduler.step(val_loss)
    earlystopping(val_loss)
    if earlystopping.early_stop:
        break


In [None]:
all_qid = []
all_preds = []
# Inference only: set eval mode explicitly (dropout off) instead of relying on
# the last validation call, and skip gradient tracking to save memory.
model.eval()
with torch.no_grad():
    # len(test_dataloader) is the exact batch count, including a final partial batch.
    for data in tqdm(test_dataloader,total=len(test_dataloader)):
        question_text = data['question_text'].to(device)
        qid = data['qid']
        out = model(question_text)
        # Predicted class = index of the highest score.
        _,preds = out.max(1)
        all_qid += qid
        all_preds += preds.tolist()


# Write the submission file

In [None]:
# One row per test question: its id and the predicted class.
submit = pd.DataFrame({'qid': all_qid, 'prediction': all_preds})

submit.to_csv("submission.csv",index=False)