## IMPORTS 

In [1]:
import random
import pandas as pd
import numpy as np
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')
from collections import Counter

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example

def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

TEXT = data.Field(tokenize='spacy', fix_length=100)#preprocessing=generate_bigrams)
LABEL = data.LabelField(dtype=torch.float)


SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# load spacy tokenizer
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

## LOAD PROCESSED TRAINING DATA FROM DISK

In [2]:
# Uncomment to reload the Data frame
# df = pd.read_pickle('lastDFTWOCOLUMNS.pkl') #to load back to the dataframe df
# df.head()

# Uncomment to save to pickle
# new.to_pickle('lastDFTWOCOLUMNS.pkl')
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)


df_train["question_text"] = df_train["question_text"].fillna("_na_").values
df_test["question_text"] = df_test["question_text"].fillna("_na_").values

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: x.split())
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: x.split())

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: clean_text(x))
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: clean_text(x))

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: clean_numbers(x))
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: clean_numbers(x))

# df_train["question_text"] = df_train["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
# df_test["question_text"] = df_test["question_text"].progress_apply(lambda x: replace_typical_misspell(x))

In [4]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# insincere questions: {:,}({:.2f}%) and # sincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# insincere questions: 1,225,312(93.81%) and # sincere questions: 80,810(6.19%)
# Test samples: 56,370(0.043% of train samples)


## Rename columns and remove id column

In [5]:
df_train = df_train[["question_text","target"]]
df_train["text"] = df_train["question_text"]
df_train["label"] = df_train["target"]
df_train = df_train.drop(["question_text","target"],axis=1)
df_train.head()

Unnamed: 0,text,label
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


In [6]:
df_test = df_test[["question_text"]]
df_test["text"] = df_test["question_text"]
df_test["label"] = 0
df_test = df_test.drop(["question_text"],axis=1)
df_test.head()

Unnamed: 0,text,label
0,My voice range is A2-C5. My chest voice goes u...,0
1,How much does a tutor earn in Bangalore?,0
2,What are the best made pocket knives under $20...,0
3,Why would they add a hypothetical scenario tha...,0
4,What is the dresscode for Techmahindra freshers?,0


## Create a pytorch dataset from the train samples and build a vocabulary using embedding vectors

In [7]:
# # load dataframe to csv
# new_test.to_csv("test.csv",index=False)
df_train.to_csv("trainnew.csv",index=False)
df_test.to_csv("testnew.csv",index=False)

#processed data to Pytorch data set
# test_dataset = data.TabularDataset("test.csv", "csv", fields=[('text', TEXT), ('label', LABEL)],skip_header=True)
train_dataset = data.TabularDataset("trainnew.csv", "csv", fields=[('text', TEXT), ('label', LABEL)],skip_header=True)
final_test_dataset = data.TabularDataset("testnew.csv", "csv", fields=[('text', TEXT), ('label', LABEL)],skip_header=True)

In [8]:
import torchtext
vec = torchtext.vocab.Vectors('glove.840B.300d/glove.840B.300d.txt', cache='./cache/')

In [9]:
TEXT.build_vocab(train_dataset, max_size=50000, vectors=vec)

In [10]:
LABEL.build_vocab(train_dataset)

In [12]:
TEXT.vocab.vectors.shape

torch.Size([50002, 300])

## SPLIT DATA TO TRAINiNG AND VALIDATION SETS

In [13]:
train_dataset, test_dataset = train_dataset.split(random_state=random.seed(SEED))
train_dataset, valid_dataset = train_dataset.split(random_state=random.seed(SEED))

In [14]:
BATCH_SIZE = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator = data.BucketIterator(
    train_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text), 
    device=device)
test_iterator = data.BucketIterator(
    test_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text), 
    device=device)
valid_iterator = data.BucketIterator(
    valid_dataset, 
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text), 
    device=device)

In [15]:
LABEL.vocab

<torchtext.vocab.Vocab at 0x7fcc606fb710>

In [16]:
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.embedding(x)
                
        #embedded = [sent len, batch size, emb dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [batch size, sent len, emb dim]
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        
        #pooled = [batch size, embedding_dim]
                
        return self.fc(pooled)

In [17]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
OUTPUT_DIM = 1

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)

In [18]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0869,  0.1916,  0.1091,  ..., -0.0152,  0.1111,  0.2065],
        ...,
        [-0.7941, -0.6765,  0.3061,  ...,  0.0766,  0.4567,  0.1807],
        [ 0.0925, -0.3773, -0.1634,  ..., -0.3725,  0.1818, -0.4889],
        [-0.2422, -0.4782, -0.0560,  ...,  0.1407,  0.4678,  0.0412]])

### Training

In [19]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [20]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluateFinal(model, iterator):
       
    model.eval()
    all_predictions = []
    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            all_predictions = all_predictions + list(predictions)
    return all_predictions




In [21]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')


| Epoch: 01 | Train Loss: 0.191 | Train Acc: 94.13% | Val. Loss: 0.141 | Val. Acc: 94.71% |
| Epoch: 02 | Train Loss: 0.129 | Train Acc: 95.06% | Val. Loss: 0.128 | Val. Acc: 95.05% |
| Epoch: 03 | Train Loss: 0.118 | Train Acc: 95.45% | Val. Loss: 0.124 | Val. Acc: 95.22% |
| Epoch: 04 | Train Loss: 0.112 | Train Acc: 95.64% | Val. Loss: 0.124 | Val. Acc: 95.33% |
| Epoch: 05 | Train Loss: 0.108 | Train Acc: 95.81% | Val. Loss: 0.124 | Val. Acc: 95.29% |


In [22]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.124 | Test Acc: 95.28% |


## PREDICTION

In [23]:
# final_test_iterator = data.BucketIterator(
#     final_test_dataset, 
#     batch_size=1024,sort_key=lambda x: len(x.text), 
#     device=device)


# m = evaluateFinal(model,final_test_iterator)

In [24]:
# df_t = pd.read_csv("test.csv")
# out_df = pd.DataFrame({"qid":df_t["qid"].values})
# out_df["prediction"] = m
# out_df.head()

In [25]:
# out_df["prediction"] = out_df["prediction"].apply(lambda x: int((torch.round(torch.sigmoid(x))).item()))
# out_df.to_csv("submission.csv",index=False)

In [26]:
# out_df.head()

In [34]:
df_t = pd.read_csv("test.csv")

### The prediction function

The function accepts a minimum length argument. If the tokenized input text is less than the minimum length 
we append the tokens with `<pad>` to make it the minimum length.

In [32]:
import spacy
nlp = spacy.load('en')

def predict(sentence, min_len=5):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    tokenized = tokenized[:100]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - 100)
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.round(torch.sigmoid(model(tensor)))
    return int(prediction.item())

In [35]:
out_df = pd.DataFrame({"qid":df_t["qid"].values})
out_df["prediction"] = df_t["question_text"].apply(lambda x: predict(x))

In [None]:
out_df.to_csv("submission.csv",index=False)