# Question Classifier using BERT
------

In [1]:
!pip install -q kaggle
!pip install transformers==3.0.0



In [None]:
from google.colab import files

files.upload()

In [3]:
!mkdir ~/.kaggle

!cp kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets list

mkdir: cannot create directory ‘/root/.kaggle’: File exists
ref                                                         title                                              size  lastUpdated          downloadCount  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              237KB  2021-11-10 18:11:20          15343  
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-04-28 17:03:01           9331  
imsparsh/musicnet-dataset                                   MusicNet Dataset                                   22GB  2021-02-18 14:12:19           4171  
dhruvildave/wikibooks-dataset                               Wikibooks Dataset                                   2GB  2021-10-22 10:48:21           3416  
promptcloud/care

In [4]:
!kaggle competitions download -c quora-question-pairs --force

Downloading test.csv.zip to /content
 94% 107M/114M [00:00<00:00, 136MB/s] 
100% 114M/114M [00:00<00:00, 130MB/s]
Downloading test.csv.zip to /content
 94% 163M/173M [00:03<00:00, 18.5MB/s]
100% 173M/173M [00:03<00:00, 48.0MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/4.95M [00:00<?, ?B/s]
100% 4.95M/4.95M [00:00<00:00, 44.8MB/s]
Downloading train.csv.zip to /content
 94% 20.0M/21.2M [00:00<00:00, 21.4MB/s]
100% 21.2M/21.2M [00:00<00:00, 47.7MB/s]


In [5]:
!unzip test.csv.zip
!unzip train.csv.zip

Archive:  test.csv.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                y

Archive:  train.csv.zip
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


In [6]:
!wc -l train.csv

404302 train.csv


In [7]:
!head -n 10 train.csv

"id","qid1","qid2","question1","question2","is_duplicate"
"0","1","2","What is the step by step guide to invest in share market in india?","What is the step by step guide to invest in share market?","0"
"1","3","4","What is the story of Kohinoor (Koh-i-Noor) Diamond?","What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?","0"
"2","5","6","How can I increase the speed of my internet connection while using a VPN?","How can Internet speed be increased by hacking through DNS?","0"
"3","7","8","Why am I mentally very lonely? How can I solve it?","Find the remainder when [math]23^{24}[/math] is divided by 24,23?","0"
"4","9","10","Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?","Which fish would survive in salt water?","0"
"5","11","12","Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?","I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?","1"


In [8]:
!wc -l test.csv

3563490 test.csv


In [9]:
!head -n 10 test.csv

"test_id","question1","question2"
0,"How does the Surface Pro himself 4 compare with iPad Pro?","Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?"
1,"Should I have a hair transplant at age 24? How much would it cost?","How much cost does hair transplant require?"
2,"What but is the best way to send money from China to the US?","What you send money to China?"
3,"Which food not emulsifiers?","What foods fibre?"
4,"How ""aberystwyth"" start reading?","How their can I start reading?"
5,"How are the two wheeler insurance from Bharti Axa insurance?","I admire I am considering of buying insurance from them"
6,"How can I reduce my belly fat through a diet?","How can I reduce my lower belly fat in one month?"
7,"By scrapping the 500 and 1000 rupee notes, how is RBI planning to fight against issue black money?","How will the recent move to declare 500 and 1000 denomination lewin illegal will curb black money?"
8,"What are the how best books of all time?","What are some of the

In [10]:
import os, sys, random
import itertools

import numpy as np
import pandas as pd

import torch
from torch import optim
import torch.nn.functional as F

from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

## Dataloader and utils function

In [11]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    NUM_LABELS = 2
    
    def __init__(self, dataset_path, tokenizer, split, row_indexes=None):
        if (split == 'train' or split == 'dev'):
          df = pd.read_csv(dataset_path, sep=",")
          df['is_duplicate'] = pd.to_numeric(df['is_duplicate'], errors='ignore')
        else:
          df = pd.read_csv(dataset_path, sep=",")
          df['is_duplicate'] = 0

        if(row_indexes != None):
          df = df.iloc[row_indexes,:]
          df.reset_index(drop=True, inplace=True) 

        df['question1'] = df['question1'].str.lower()
        df['question2'] = df['question2'].str.lower()

        self.data = df
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        data = self.data.loc[index,:]
        text1, text2, label = data['question1'], data['question2'], data['is_duplicate']
        subwords = self.tokenizer(text1, text2, padding='max_length', truncation=True, max_length=360)
        item = {key: torch.tensor(val) for key, val in subwords.items()}
        item['labels'] = torch.tensor(label)
        return item
    
    def __len__(self):
        return len(self.data)    

In [12]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
set_seed(42)

## Load model

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = CustomDataset.NUM_LABELS

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Load data

In [14]:
train_dataset_path = 'train.csv'
test_dataset_path = 'test.csv'

In [28]:
randomed_indexes = random.sample(range(0, 400000), 24000)

train_indexes = randomed_indexes[:20000]
dev_indexes = randomed_indexes[20000:]
test_indexes = random.sample(range(0, 100000), 4000)

In [29]:
train_dataset = CustomDataset(train_dataset_path, tokenizer, 'train', train_indexes)
dev_dataset = CustomDataset(train_dataset_path, tokenizer, 'dev', dev_indexes)
test_dataset = CustomDataset(test_dataset_path, tokenizer, 'test', test_indexes)

train_loader = DataLoader(dataset=train_dataset,  batch_size=16, shuffle=True)
dev_loader = DataLoader(dataset=dev_dataset,  batch_size=16, shuffle=False) 
test_loader = DataLoader(dataset=test_dataset,  batch_size=16,  shuffle=False) 

  if self.run_code(code, result):


In [30]:
print(train_dataset[0])
print(len(train_dataset))
print(len(dev_dataset))
print(len(test_dataset))

{'input_ids': tensor([  101,  2129,  2079,  1045,  3556,  2062,  2084,  6109,  1003,  1999,
         1996,  6131,  2604,  5940,  1029,   102,  2129,  2064,  1045,  3556,
         2062,  2084,  3938,  1003,  6017,  1999,  5940,  6568,  2063,  2966,
         1029,  2747,  1045,  2572, 11828,  2026,  6252,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

## Train

In [31]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [32]:
optimizer = optim.Adam(model.parameters(), lr=3e-5)
model = model.cuda()

In [33]:
device = 'cuda'
n_epochs = 4
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch in enumerate(train_pbar):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f}".format((epoch+1),
            total_train_loss/(i+1)))
        
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(dev_loader, leave=True, total=len(dev_loader))
    for i, batch in enumerate(pbar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]

        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        pbar.set_description("DEV LOSS:{:.4f} ".format(total_loss/(i+1)))

        logits = outputs[1]
        batch_hyp = torch.argmax(logits, dim=-1)
        list_hyp += batch_hyp.cpu().numpy().tolist()
        list_label += labels.cpu().numpy().tolist()
    
    acc = accuracy_score(list_label, list_hyp)
    f1 = f1_score(list_label, list_hyp, average='macro')
    rec = recall_score(list_label, list_hyp, average='macro')
    prec = precision_score(list_label, list_hyp, average='macro')

    print(f"epoch: {epoch}")
    print("Acc: ", acc)
    print("F1: ", f1)
    print("recall: ", rec)
    print("precision: ", prec)

(Epoch 1) TRAIN LOSS:0.4270: 100%|██████████| 1250/1250 [41:16<00:00,  1.98s/it]
DEV LOSS:0.3551 : 100%|██████████| 250/250 [03:11<00:00,  1.31it/s]


epoch: 0
Acc:  0.83775
F1:  0.8312507820739707
recall:  0.846050563754856
precision:  0.8261173808479176


(Epoch 2) TRAIN LOSS:0.2679: 100%|██████████| 1250/1250 [41:11<00:00,  1.98s/it]
DEV LOSS:0.3698 : 100%|██████████| 250/250 [03:10<00:00,  1.31it/s]


epoch: 1
Acc:  0.82475
F1:  0.8023346366041428
recall:  0.791385377479604
precision:  0.8234292293429828


(Epoch 3) TRAIN LOSS:0.1527: 100%|██████████| 1250/1250 [40:59<00:00,  1.97s/it]
DEV LOSS:0.4431 : 100%|██████████| 250/250 [03:10<00:00,  1.31it/s]


epoch: 2
Acc:  0.8465
F1:  0.8365696012537827
recall:  0.8409400072367108
precision:  0.8331518094860493


(Epoch 4) TRAIN LOSS:0.0906: 100%|██████████| 1250/1250 [41:02<00:00,  1.97s/it]
DEV LOSS:0.5025 : 100%|██████████| 250/250 [03:11<00:00,  1.30it/s]

epoch: 3
Acc:  0.85075
F1:  0.8409899534016665
recall:  0.8451585030947312
precision:  0.837675321649265





In [34]:
model.eval()
torch.save(model.state_dict(),'question_pair_model.bin')

## Evaluate on test

In [39]:
loaded_model = BertForSequenceClassification.from_pretrained('question_pair_model.bin', config=config)

In [37]:
# test_indexes = random.sample(range(0, 100000), 200)
# test_dataset = CustomDataset(test_dataset_path, tokenizer, 'test', test_indexes)
# test_loader = DataLoader(dataset=test_dataset,  batch_size=16,  shuffle=False) 

  if self.run_code(code, result):


In [40]:
model = loaded_model
model.eval()
torch.set_grad_enabled(False)
device = 'cpu'
total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch in enumerate(pbar):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]

    logits = outputs[1]
    batch_hyp = torch.argmax(logits, dim=-1)
    list_hyp += batch_hyp.cpu().numpy().tolist()
    list_label += labels.cpu().numpy().tolist()
    
print(list_label)
print(list_hyp)

df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('qp_pred.csv', index=False)

print(df)

100%|██████████| 13/13 [03:45<00:00, 17.32s/it]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,




In [41]:
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = 2

model = BertForSequenceClassification.from_pretrained('question_pair_model.bin', config=config)
print(model.device)

def predict(text1, text2):
  subwords = tokenizer.encode(text1.lower(), text2.lower())
  subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

  logits = model(subwords)[0]
  label = torch.argmax(logits, dim=-1).item()
  return label

cpu


In [42]:
predict("when the sun rises?", "when the sun sets?")

1

In [43]:
predict("when benjamin franklin died?", "when adolf hitler died?")

0

In [44]:
predict("when i wake up today?", "when i brush my teeth today?")

0

In [45]:
predict("when i wake up today?", "when i sleep today?")

1