## IMPORTS 

In [1]:
import random
import pandas as pd
import numpy as np
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from collections import Counter
import torchnlp
from textblob import TextBlob

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score


def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x


SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# load spacy tokenizer
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

## LOAD PROCESSED TRAINING DATA FROM DISK

In [2]:
# Uncomment to reload the Data frame
# df = pd.read_pickle('lastDFTWOCOLUMNS.pkl') #to load back to the dataframe df
# df.head()

# Uncomment to save to pickle
# new.to_pickle('lastDFTWOCOLUMNS.pkl')
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)


df_train["question_text"] = df_train["question_text"].fillna("_na_").values
df_test["question_text"] = df_test["question_text"].fillna("_na_").values

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: x.split())
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: x.split())

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: clean_text(x))
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: clean_text(x))

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: clean_numbers(x))
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: clean_numbers(x))

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: replace_typical_misspell(x))

In [4]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# insincere questions: {:,}({:.2f}%) and # sincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# insincere questions: 1,225,312(93.81%) and # sincere questions: 80,810(6.19%)
# Test samples: 56,370(0.043% of train samples)


## Rename columns and remove id column

In [5]:
df_train = df_train[["question_text","target"]]
df_train["text"] = df_train["question_text"]
df_train["label"] = df_train["target"]
df_train = df_train.drop(["question_text","target"],axis=1)
df_train.head()

Unnamed: 0,text,label
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


In [6]:
df_test = df_test[["question_text"]]
df_test["text"] = df_test["question_text"]
df_test["label"] = 0
df_test = df_test.drop(["question_text"],axis=1)
df_test.head()

Unnamed: 0,text,label
0,My voice range is A2-C5. My chest voice goes u...,0
1,How much does a tutor earn in Bangalore?,0
2,What are the best made pocket knives under $20...,0
3,Why would they add a hypothetical scenario tha...,0
4,What is the dresscode for Techmahindra freshers?,0


In [7]:
df_ince = df_train[df_train["label"]==1]
df_since = df_train[df_train["label"]==0]

## Create a pytorch dataset from the train samples and build a vocabulary using embedding vectors

In [None]:
# df_ince.to_csv("ince.csv",index=False)
# df_since.to_csv("since.csv",index=False)

# TEXT_IN = data.Field(tokenize='spacy', fix_length=100)
# TEXT_SIN = data.Field(tokenize='spacy', fix_length=100)

# ince_dataset = data.TabularDataset("ince.csv", "csv", fields=[('text', TEXT_IN), ('label', LABEL)],skip_header=True)
# since_dataset = data.TabularDataset("since.csv", "csv", fields=[('text', TEXT_SIN), ('label', LABEL)],skip_header=True)

In [8]:
# # load dataframe to csv
# new_test.to_csv("test.csv",index=False)
df_train.to_csv("trainnew.csv",index=False)
df_test.to_csv("testnew.csv",index=False)

TEXT = data.Field(tokenize='spacy', fix_length=100)#preprocessing=generate_bigrams)
LABEL = data.LabelField(dtype=torch.float)

#processed data to Pytorch data set
# test_dataset = data.TabularDataset("test.csv", "csv", fields=[('text', TEXT), ('label', LABEL)],skip_header=True)
train_dataset = data.TabularDataset("trainnew.csv", "csv", fields=[('text', TEXT), ('label', LABEL)],skip_header=True)
final_test_dataset = data.TabularDataset("testnew.csv", "csv", fields=[('text', TEXT), ('label', LABEL)],skip_header=True)

In [10]:
# def get_vector(embeddings, word):
#     assert word in embeddings.stoi, f'*{word}* is not in the vocab!'
#     return embeddings.vectors[embeddings.stoi[word]]

# def isin(embeddings, word):
#     if word in embeddings.stoi:
#         return True
#     else:
#         return False

# def closest_words(embeddings, vector, n=10):
#     distances = [(w, torch.dist(vector, get_vector(embeddings, w)).item()) for w in embeddings.itos]
#     return sorted(distances, key = lambda w: w[1])[:n]

# vars(train_dataset[0])["text"]

# reliable_vector = get_vector(vec, 'reliable')
# reliable_misspellings = ['relieable', 'relyable', 'realible', 'realiable', 'relable', 'relaible', 'reliabe', 'relaiable']
# diff_reliable = [(reliable_vector - get_vector(vec, s)).unsqueeze(0) for s in reliable_misspellings]

# misspelling_vector = torch.cat(diff_reliable, dim=0).mean(dim=0)
# closest_words(vec, get_vector(vec, 'becuase') + misspelling_vector)[0][0]

In [11]:
# closest_words(vec, get_vector(vec, 'yuo') + misspelling_vector)[0][0]
# #get_vector(vec,"Eye")

In [12]:
import torchtext
vec = torchtext.vocab.Vectors('glove.840B.300d/glove.840B.300d.txt', cache='./cache/')

In [13]:
TEXT.build_vocab(train_dataset, max_size=50000, vectors=vec)
LABEL.build_vocab(train_dataset)

# TEXT_IN.build_vocab(ince_dataset, max_size=50000, vectors=vec)
# TEXT_SIN.build_vocab(since_dataset, max_size=50000, vectors=vec)

In [14]:
# # TEXT.vocab.vectors.shape
# sincere = TEXT_SIN.vocab.freqs.most_common(500)
# insincere = TEXT_IN.vocab.freqs.most_common(500)

# sincere = [s for s,c in sincere]
# insincere = [s for s,c in insincere]

# common_in = []
# common_sin = []
# for s in insincere:
#     if not(s in sincere):
#         common_in.append(s)
# for s in sincere:
#     if not(s in insincere):
#         common_sin.append(s)

In [15]:
# from textblob import TextBlob

# def isin(embeddings, word):
#     if word in embeddings.stoi:
#         return True
#     else:
#         return False


# # b = TextBlob("hodl")
# # # print(b.correct())
# # # print(vars(ince_dataset[0])["text"])
# # for t in ince_dataset:
# #     text = vars(t)["text"]
# #     label = int(vars(t)["label"])
# #     for word in text:
# #         if not isin(vec,word):
# # #             w = TextBlob(word)
# # #             c = w.correct()
# #             if c in common_in:
# #                 print(c)

## SPLIT DATA TO TRAINiNG AND VALIDATION SETS

In [16]:
train_dataset, test_dataset = train_dataset.split(random_state=random.seed(SEED))
train_dataset, valid_dataset = train_dataset.split(random_state=random.seed(SEED))

In [17]:
BATCH_SIZE = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.BucketIterator(
    train_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text), 
    device=device)
test_iterator = data.BucketIterator(
    test_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text), 
    device=device)
valid_iterator = data.BucketIterator(
    valid_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text), 
    device=device)

In [18]:
TEXT.vocab.vectors.shape

torch.Size([50002, 300])

In [19]:
import torch
import torch.nn as nn


class Attention(nn.Module):
    """ Applies attention mechanism on the `context` using the `query`.
    **Thank you** to IBM for their initial implementation of :class:`Attention`. Here is
    their `License
    <https://github.com/IBM/pytorch-seq2seq/blob/master/LICENSE>`__.
    Args:
        dimensions (int): Dimensionality of the query and context.
        attention_type (str, optional): How to compute the attention score:
            * dot: :math:`score(H_j,q) = H_j^T q`
            * general: :math:`score(H_j, q) = H_j^T W_a q`
    Example:
         >>> attention = Attention(256)
         >>> query = torch.randn(5, 1, 256)
         >>> context = torch.randn(5, 5, 256)
         >>> output, weights = attention(query, context)
         >>> output.size()
         torch.Size([5, 1, 256])
         >>> weights.size()
         torch.Size([5, 1, 5])
    """

    def __init__(self, dimensions, attention_type='general'):
        super(Attention, self).__init__()

        if attention_type not in ['dot', 'general']:
            raise ValueError('Invalid attention type selected.')

        self.attention_type = attention_type
        if self.attention_type == 'general':
            self.linear_in = nn.Linear(dimensions, dimensions, bias=False)

        self.linear_out = nn.Linear(dimensions * 2, dimensions, bias=False)
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()

    def forward(self, query, context):
        """
        Args:
            query (:class:`torch.FloatTensor` [batch size, output length, dimensions]): Sequence of
                queries to query the context.
            context (:class:`torch.FloatTensor` [batch size, query length, dimensions]): Data
                overwhich to apply the attention mechanism.
        Returns:
            :class:`tuple` with `output` and `weights`:
            * **output** (:class:`torch.LongTensor` [batch size, output length, dimensions]):
              Tensor containing the attended features.
            * **weights** (:class:`torch.FloatTensor` [batch size, output length, query length]):
              Tensor containing attention weights.
        """
        batch_size, output_len, dimensions = query.size()
        query_len = context.size(1)

        if self.attention_type == "general":
            query = query.view(batch_size * output_len, dimensions)
            query = self.linear_in(query)
            query = query.view(batch_size, output_len, dimensions)

        # TODO: Include mask on PADDING_INDEX?

        # (batch_size, output_len, dimensions) * (batch_size, query_len, dimensions) ->
        # (batch_size, output_len, query_len)
        attention_scores = torch.bmm(query, context.transpose(1, 2).contiguous())

        # Compute weights across every context sequence
        attention_scores = attention_scores.view(batch_size * output_len, query_len)
        attention_weights = self.softmax(attention_scores)
        attention_weights = attention_weights.view(batch_size, output_len, query_len)

        # (batch_size, output_len, query_len) * (batch_size, query_len, dimensions) ->
        # (batch_size, output_len, dimensions)
        mix = torch.bmm(attention_weights, context)

        # concat -> (batch_size * output_len, 2*dimensions)
        combined = torch.cat((mix, query), dim=2)
        combined = combined.view(batch_size * output_len, 2 * dimensions)

        # Apply linear_out on every 2nd dimension of concat
        # output -> (batch_size, output_len, dimensions)
        output = self.linear_out(combined).view(batch_size, output_len, dimensions)
        output = self.tanh(output)

        return output, attention_weights

In [32]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        x = x.permute(1, 0)
        #x = [batch size, sent len]
        embedded = self.embedding(x)
        #embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim=1))
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)
    
class BiLSTM(nn.Module):
    def __init__(self, pretrained_lm, padding_idx, static=True, hidden_dim=128, lstm_layer=2, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding.from_pretrained(pretrained_lm)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=lstm_layer, 
                            dropout = dropout,
                            bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*lstm_layer*2, 1)
    
    def forward(self, sents):
        x = self.embedding(sents)
        x = torch.transpose(x, dim0=1, dim1=0)
        lstm_out, (h_n, c_n) = self.lstm(x)
        y = self.hidden2label(self.dropout(torch.cat([c_n[i,:, :] for i in range(c_n.shape[0])], dim=1)))
        return y    


In [21]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5

# model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
model = BiLSTM(TEXT.vocab.vectors, lstm_layer=2, padding_idx=TEXT.vocab.stoi[TEXT.pad_token], hidden_dim=128).cuda()

In [22]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0869,  0.1916,  0.1091,  ..., -0.0152,  0.1111,  0.2065],
        ...,
        [-0.7941, -0.6765,  0.3061,  ...,  0.0766,  0.4567,  0.1807],
        [ 0.0925, -0.3773, -0.1634,  ..., -0.3725,  0.1818, -0.4889],
        [-0.2422, -0.4782, -0.0560,  ...,  0.1407,  0.4678,  0.0412]],
       device='cuda:0')

### Training

In [23]:
import torch.optim as optim
from sklearn.metrics import f1_score

# optimizer = optim.Adam(model.parameters())
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y, th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def f1_score_model(preds, y,th = 0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded = torch.sigmoid(preds).cpu().apply_(lambda x: 1 if x>=th else 0)
    rounded_preds = rounded

    return sklearn.metrics.f1_score(y.cpu().numpy(),rounded_preds.cpu().numpy())


In [34]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluateFinal(model, iterator):
       
    model.eval()
    all_predictions = []
    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            all_predictions = all_predictions + list(predictions)
    return all_predictions

def evaluate_f1(model, iterator, criterion,th=0.5):
    
    f1_scores = []
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            f1 = f1_score_model(predictions, batch.label,th=th)
            f1_scores.append(f1)
        
    return np.array(f1_scores).mean()


In [35]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

ValueError: Target size (torch.Size([512])) must be the same as input size (torch.Size([100]))

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

In [None]:
score = evaluate_f1(model, test_iterator, criterion,th = 0.31)
print(f'| f1 score: {score:.3f}')

## PREDICTION

In [None]:
# final_test_iterator = data.BucketIterator(
#     final_test_dataset, 
#     batch_size=1024,sort_key=lambda x: len(x.text), 
#     device=device)


# m = evaluateFinal(model,final_test_iterator)

In [None]:
# df_t = pd.read_csv("test.csv")
# out_df = pd.DataFrame({"qid":df_t["qid"].values})
# out_df["prediction"] = m
# out_df.head()

In [None]:
# out_df["prediction"] = out_df["prediction"].apply(lambda x: int((torch.round(torch.sigmoid(x))).item()))
# out_df.to_csv("submission.csv",index=False)

In [None]:
# out_df.head()
# TEXT.tokenize()

In [None]:
df_t = pd.read_csv("test.csv")

### The prediction function

The function accepts a minimum length argument. If the tokenized input text is less than the minimum length 
we append the tokens with `<pad>` to make it the minimum length.

In [None]:
import spacy
nlp = spacy.load('en')

def predict(sentence, min_len=5):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    tokenized = tokenized[:100]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction =0
    if torch.sigmoid(model(tensor)) >= 0.31:
        prediction = 1
    else:
        prediction = 0
#     prediction = torch.round(torch.sigmoid(model(tensor)))
    return prediction

In [None]:
test_meta = np.zeros(test_X.shape[0])


In [None]:
out_df = pd.DataFrame({"qid":df_t["qid"].values})
out_df["prediction"] = df_t["question_text"].apply(lambda x: predict(x))
out_df["prediction"] = out_df["prediction"] 

In [None]:
out_df.to_csv("submission.csv",index=False)

In [None]:
(len(out_df[out_df["prediction"]==1])/len(out_df))*100

In [None]:
out_df.head()