In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import regex
from tqdm import tqdm_notebook, trange, tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Show which device string was selected for this run.
print(device)

In [None]:
import torch
import torch.utils.data
import torchvision


class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
    """Random sampler that re-balances classes by inverse-frequency weighting.

    Every candidate index gets the weight ``1 / (number of candidates sharing
    its label)``; indices are then drawn with replacement according to those
    weights.

    Arguments:
        dataset: the dataset whose labels are inspected (see ``_get_label``).
        indices (list, optional): candidate indices; defaults to every
            position ``0 .. len(dataset) - 1``.
        num_samples (int, optional): how many indices one pass yields;
            defaults to ``len(indices)``.
    """

    def __init__(self, dataset, indices=None, num_samples=None):
        # Fall back to the full dataset when no subset is requested.
        if indices is None:
            indices = list(range(len(dataset)))
        self.indices = indices

        # By default one pass draws as many samples as there are candidates.
        if num_samples is None:
            num_samples = len(self.indices)
        self.num_samples = num_samples

        # Look each label up once, then tally how often every class occurs.
        labels = [self._get_label(dataset, idx) for idx in self.indices]
        class_sizes = {}
        for label in labels:
            class_sizes[label] = class_sizes.get(label, 0) + 1

        # Per-sample weight: the rarer the class, the larger the weight.
        self.weights = torch.DoubleTensor(
            [1.0 / class_sizes[label] for label in labels])

    def _get_label(self, dataset, idx):
        """Return the label of element ``idx`` for the supported dataset types."""
        kind = type(dataset)
        if kind is torchvision.datasets.MNIST:
            return dataset.train_labels[idx].item()
        if kind is torchvision.datasets.ImageFolder:
            return dataset.imgs[idx][1]
        # Any other dataset is expected to expose an indexable `labels` attribute.
        return dataset.labels[idx]

    def __iter__(self):
        # Positions into `self.indices`, drawn with replacement by weight.
        drawn = torch.multinomial(
            self.weights, self.num_samples, replacement=True)
        return (self.indices[pos] for pos in drawn)

    def __len__(self):
        return self.num_samples

In [None]:
# Directory holding the competition files. Change this single constant to run
# the notebook somewhere other than Kaggle.
DATA_DIR = '/kaggle/input/quora-insincere-questions-classification'

# train.csv supplies the `question_text` and `target` columns used below;
# test.csv supplies `question_text` only as far as this notebook reads it.
raw_csv = pd.read_csv(f'{DATA_DIR}/train.csv')
test_csv = pd.read_csv(f'{DATA_DIR}/test.csv')
raw_csv.head(3)

In [None]:
# Character length of every question, computed with the vectorised string
# accessor instead of a per-row Python lambda.
raw_csv['len_question'] = raw_csv['question_text'].str.len()

In [None]:
# Rows whose question is shorter than 120 characters (input to the length plot).
len_table = raw_csv.loc[raw_csv['len_question'].lt(120)]

In [None]:
# Distribution of question lengths for the questions under 120 characters.
# `sns.distplot` is deprecated (and removed from recent seaborn releases), so
# use `histplot` when it exists and only fall back on old installations.
fig, ax = plt.subplots()
if hasattr(sns, 'histplot'):
    sns.histplot(len_table['len_question'], kde=True, stat='density', ax=ax)
else:
    sns.distplot(len_table['len_question'], ax=ax)
ax.set(title='Question length distribution (< 120 characters)',
       xlabel='Question length (characters)',
       ylabel='Density');

In [None]:
# Hold out 30% of the labelled rows for validation.
# - random_state pins the shuffle so a Restart & Run All reproduces the split.
# - stratify keeps the proportion of each `target` value the same in both parts.
trainset, valset = train_test_split(
    raw_csv,
    test_size=0.3,
    random_state=42,
    stratify=raw_csv['target'],
)

In [None]:
#model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Pull the raw question strings and their labels out of each frame as plain lists.
train_questions, train_labels = trainset['question_text'].tolist(), trainset['target'].tolist()
val_questions, val_labels = valset['question_text'].tolist(), valset['target'].tolist()

# The test frame is only read for its questions here.
test_questions = test_csv['question_text'].tolist()

In [None]:
def clean_text(text):
    """Strip wiki/HTML-style markup from ``text`` and return the cleaned string.

    The patterns are applied one after another, each match being replaced by
    the empty string. Order matters: the doubled-brace pattern runs before the
    single-brace one, and the longer quote runs are removed before the shorter
    ones.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with every pattern match removed.
    """
    # Raw strings keep `\[`, `\|` and `\:` as regex escapes; written as normal
    # string literals they are invalid Python escape sequences and emit a
    # warning on recent interpreters. The pattern values are unchanged.
    patterns = (
        r"(?s)<ref>.+?</ref>",        # reference links
        r"(?s)<[^>]+>",               # html tags
        r"&[a-z]+;",                  # html entities
        r"(?s){{.+?}}",               # markup tags (double braces)
        r"(?s){.+?}",                 # markup tags (single braces)
        r"(?s)\[\[([^]]+\|)",         # link target strings
        r"(?s)\[\[([^]]+\:.+?]])",    # media links
        r"[']{5}",                    # italic+bold symbols
        r"[']{3}",                    # bold symbols
        r"[']{2}",                    # italic symbols
    )

    for pattern in patterns:
        text = regex.sub(pattern, "", text)

    return text

In [None]:
# Run every split through the same markup cleaner.
new_train_questions = [clean_text(q) for q in train_questions]
new_val_questions = [clean_text(q) for q in val_questions]
new_test_questions = [clean_text(q) for q in test_questions]

# One combined list (train, then validation, then test) for vocabulary building.
all_sentences = new_train_questions + new_val_questions + new_test_questions

In [None]:
def make_dict(sen_list):
    """Build word<->index vocabularies from whitespace-tokenised sentences.

    Index 0 is reserved for ``'<pad>'`` and index 1 for ``'<UNK>'``; real words
    are numbered from 2 upward in order of first appearance.

    Args:
        sen_list: iterable of sentence strings; each is split on whitespace.

    Returns:
        A tuple ``(word2ix, ix2word, num_word)`` where ``word2ix`` maps each
        word to its index, ``ix2word`` is the inverse mapping, and ``num_word``
        is the vocabulary size including the two reserved entries.
    """
    word2ix = {'<pad>': 0, '<UNK>': 1}
    ix2word = {0: '<pad>', 1: '<UNK>'}

    for sentence in sen_list:
        for word in sentence.split():
            if word not in word2ix:
                # The next free index is the current vocabulary size, so the
                # reserved slots 0 and 1 can never be handed to a real word.
                ix = len(word2ix)
                word2ix[word] = ix
                ix2word[ix] = word

    return word2ix, ix2word, len(word2ix)

In [None]:
# Build the vocabulary over the cleaned train, validation and test questions.
word2ix, ix2word, num_word = make_dict(all_sentences)

In [None]:
print(num_word)
# Preview a handful of entries only: echoing the whole vocabulary dict would
# flood the cell output.
dict(list(word2ix.items())[:10])

In [None]:
class QuestionData(data.Dataset):
    """Dataset that turns question strings into fixed-length index tensors.

    Each item is a pair ``(token_ids, label)``: ``token_ids`` is a
    ``torch.long`` tensor of exactly ``max_length`` vocabulary indices and
    ``label`` is a one-element ``torch.float`` tensor.

    Args:
        questions: indexable collection of question strings.
        labels: indexable collection of labels; its length is the dataset length.
        word2ix: dict mapping a word to its vocabulary index.
        ix2word: dict mapping an index to its word (stored, not used here).
        num_word: vocabulary size (stored, not used here).
        max_length: number of token positions each item is padded or truncated to.
    """

    def __init__(self, questions, labels, word2ix, ix2word, num_word, max_length):
        self.questions = questions
        self.labels = labels
        self.word2ix = word2ix
        self.ix2word = ix2word
        self.num_word = num_word
        self.max_length = max_length
        # Indices of the special tokens; fall back to the conventional slots
        # when the vocabulary does not list them explicitly.
        self.pad_ix = word2ix.get('<pad>', 0)
        self.unk_ix = word2ix.get('<UNK>', 1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, ix):
        # Out-of-vocabulary words map to <UNK> instead of raising KeyError.
        q_ix = [self.word2ix.get(word, self.unk_ix)
                for word in self.questions[ix].split()]

        # Right-pad short questions, truncate long ones.
        missing = self.max_length - len(q_ix)
        if missing > 0:
            q_ix += [self.pad_ix] * missing
        else:
            q_ix = q_ix[:self.max_length]

        return (torch.tensor(q_ix, dtype=torch.long),
                torch.tensor([self.labels[ix]], dtype=torch.float))

In [None]:
# Wrap the cleaned questions in datasets; every question is padded/truncated
# to 75 tokens. Note these names replace the DataFrame split of the same name.
trainset = QuestionData(
    questions=new_train_questions,
    labels=train_labels,
    word2ix=word2ix,
    ix2word=ix2word,
    num_word=num_word,
    max_length=75,
)
valset = QuestionData(
    questions=new_val_questions,
    labels=val_labels,
    word2ix=word2ix,
    ix2word=ix2word,
    num_word=num_word,
    max_length=75,
)

In [None]:
# Training batches are drawn through the class-balancing sampler; validation
# batches are read in order.
train_sampler = ImbalancedDatasetSampler(trainset)
trainloader = data.DataLoader(dataset=trainset, batch_size=16, sampler=train_sampler)
valloader = data.DataLoader(dataset=valset, batch_size=16, shuffle=False)

In [None]:
class BiLSTMClassifier(nn.Module):
    """Embedding -> (Bi)LSTM -> two-layer MLP producing one logit per sequence.

    Args:
        num_word: vocabulary size of the embedding table.
        embed_dim: embedding dimension.
        hidden_dim: LSTM hidden size per direction; also the width of ``fc1``'s output.
        num_layer: number of stacked LSTM layers.
        dropout: dropout between stacked LSTM layers.
        pretrained: accepted for interface compatibility; not used by this class.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, num_word, embed_dim, hidden_dim, num_layer, dropout=0.2, pretrained=None, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(num_word, embed_dim)
        # Inter-layer dropout has no effect with a single layer and only makes
        # nn.LSTM emit a warning, so disable it in that case.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layer,
                            dropout=dropout if num_layer > 1 else 0.0,
                            batch_first=True, bidirectional=bidirectional)
        # The LSTM output feature size is hidden_dim per direction; sizing fc1
        # from the flag keeps bidirectional=False from crashing in forward().
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for a batch of index sequences.

        ``x`` is indexed as (batch, seq_len) because the LSTM is batch_first.
        """
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        # Features of the last time step only.
        # NOTE(review): with right-padded inputs this position is often padding — confirm intended.
        last_step = out[:, -1, :]
        hidden = F.relu(self.fc1(last_step))
        return self.fc2(hidden)

In [None]:
class AttentionalClassifier(nn.Module):
    """(Bi)LSTM classifier that attends over all time steps with the last step as query.

    The LSTM output at the last time step is dotted with the output at every
    time step; those scores weight a sum over the time steps, and the result is
    concatenated with the last-step output before a two-layer MLP yields one
    logit per sequence.

    Args:
        num_word: vocabulary size of the embedding table.
        embed_dim: embedding dimension.
        hidden_dim: LSTM hidden size per direction; also the width of ``fc1``'s output.
        num_layer: number of stacked LSTM layers.
        dropout: dropout between stacked LSTM layers.
        pretrained: accepted for interface compatibility; not used by this class.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, num_word, embed_dim, hidden_dim, num_layer, dropout=0.2, pretrained=None, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(num_word, embed_dim)
        # Inter-layer dropout is meaningless (and warns) with a single layer.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layer,
                            dropout=dropout if num_layer > 1 else 0.0,
                            batch_first=True, bidirectional=bidirectional)
        # fc1 sees [context ; last_step], i.e. twice the LSTM feature size,
        # which is hidden_dim per direction (hidden_dim*4 when bidirectional).
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for a batch of index sequences."""
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)                       # (batch, seq, feat)

        last_step = out[:, -1, :]                          # (batch, feat)
        query = last_step.unsqueeze(2)                     # (batch, feat, 1)
        # NOTE(review): scores are raw dot products (no softmax) — confirm intended.
        scores = torch.bmm(out, query)                     # (batch, seq, 1)
        context = torch.bmm(scores.transpose(1, 2), out).squeeze(1)  # (batch, feat)
        combined = torch.cat([context, last_step], dim=1)  # (batch, 2 * feat)

        # F.dropout defaults to training=True; tie it to the module mode so
        # model.eval() really switches dropout off at evaluation time.
        dropped = F.dropout(combined, training=self.training)
        hidden = F.relu(self.fc1(dropped))
        return self.fc2(hidden)

In [None]:
class CNNClassifier(nn.Module):
    """Embedding -> two 1-D convolutions -> two-layer MLP producing one logit per sequence.

    Args:
        num_word: vocabulary size of the embedding table.
        embed_dim: embedding dimension (input channels of the first convolution).
        hidden_dim: number of channels produced by both convolutions.
        pretrained: accepted for interface compatibility; not used by this class.
        seq_len: length of the input sequences; ``fc1`` is sized from it. The
            default of 75 reproduces the previously hard-coded width of 36.

    Raises:
        ValueError: if ``seq_len`` is too short for the two convolutions.
    """

    def __init__(self, num_word, embed_dim, hidden_dim, pretrained=None, seq_len=75):
        super().__init__()
        if seq_len < 5:
            raise ValueError('seq_len must be at least 5, got %d' % seq_len)

        self.embedding = nn.Embedding(num_word, embed_dim)

        self.cnn_1 = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3)
        self.cnn_2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, stride=2)

        # Output lengths of the two unpadded convolutions:
        # kernel 3 / stride 1, then kernel 3 / stride 2.
        conv1_len = seq_len - 2
        conv2_len = (conv1_len - 3) // 2 + 1
        self.fc1 = nn.Linear(conv2_len, 30)
        self.fc2 = nn.Linear(30, 1)

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for a batch of index sequences."""
        embedded = self.embedding(x)
        # Conv1d expects channels before positions.
        channels_first = embedded.transpose(1, 2)
        features = F.relu(self.cnn_1(channels_first))
        features = F.relu(self.cnn_2(features))
        # NOTE(review): only the last output channel is fed to the MLP — confirm intended.
        last_channel = features[:, -1, :]
        # F.dropout defaults to training=True; tie it to the module mode so
        # model.eval() really switches dropout off at evaluation time.
        dropped = F.dropout(last_channel, training=self.training)
        hidden = F.relu(self.fc1(dropped))
        return self.fc2(hidden)

In [None]:
# Recurrent model: the attention variant is the active choice.
# (BiLSTMClassifier takes the same arguments if the plain baseline is wanted.)
lstm_model = AttentionalClassifier(
    num_word=num_word, embed_dim=300, hidden_dim=64, num_layer=2,
).to(device)

# Convolutional model with the same embedding and channel sizes.
cnn_model = CNNClassifier(
    num_word=num_word, embed_dim=300, hidden_dim=64,
).to(device)

In [None]:
def train(model, num_epoch=5, print_every=1000, loader=None):
    """Train ``model`` with Adam (lr=1e-2) on a binary cross-entropy-with-logits loss.

    Args:
        model: module mapping a batch of inputs to logits shaped like the labels.
        num_epoch: number of passes over the loader.
        print_every: report the mean loss every this many steps.
        loader: iterable of ``(inputs, labels)`` batches; defaults to the
            notebook-level ``trainloader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    if loader is None:
        loader = trainloader

    optimizer = optim.Adam(params=model.parameters(), lr=1e-2)
    model.train()

    for epoch in tqdm(range(num_epoch)):

        running_loss = 0.0
        for step, (inputs, targets) in enumerate(tqdm_notebook(loader)):
            inputs, targets = inputs.to(device), targets.to(device)

            # Clear stale gradients BEFORE the backward pass. Zeroing between
            # backward() and step() discards the fresh gradients, so the
            # optimizer would never actually update the weights.
            optimizer.zero_grad()
            logits = model(inputs)
            loss = F.binary_cross_entropy_with_logits(logits, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if (step + 1) % print_every == 0:
                print('Step %d | loss %0.4f' %(step+1, running_loss/print_every))
                running_loss = 0.0

    return model

In [None]:
def test(model, loader):
    """Print the accuracy of ``model`` over ``loader`` at a 0.5 probability threshold.

    The model is put in eval mode and run without gradient tracking. Its
    outputs are passed through a sigmoid, thresholded at 0.5 and compared
    element-wise with the labels.

    Args:
        model: module mapping a batch of inputs to logits shaped like the labels.
        loader: iterable of ``(inputs, labels)`` batches.
    """
    model.eval()
    total = 0
    correct = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            preds = torch.sigmoid(model(inputs))
            # Binarise in place: probabilities >= 0.5 become 1, the rest 0.
            preds[preds >= 0.5] = 1
            preds[preds < 0.5] = 0
            correct += (preds == targets).sum().item()
            total += preds.shape[0]

    # An empty loader would otherwise end in a ZeroDivisionError.
    if total == 0:
        print('Total Accuracy : n/a (no samples)')
        return

    print('Total Accuracy : %0.4f' %(correct / total))

In [None]:
# Train the recurrent model for three epochs.
model = train(lstm_model, num_epoch=3)

In [None]:
# Report accuracy on the held-out validation batches.
test(model=lstm_model, loader=valloader)