# Install required libraries


In [1]:
!pip3 install transformers

import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel, AutoTokenizer, DistilBertForSequenceClassification
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive/') # mount your drive
%cd '/content/drive/My Drive/Notability/AI6127 Deep Neural Networks For Natural Language Processing/AI6127_Project-main' 


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.3 transformers-4.27.4
Mounted at /content/drive/
/content/drive/My Driv

# Read csv file and remove duplicates

In [2]:
data_df = pd.read_csv("qna_sampled.csv")
display(data_df)
data_df2 = data_df.drop_duplicates(subset=['QuestionText'])
print(f'Number of unique questions: {len(data_df2)}')

Unnamed: 0,QuestionID,AnswerText,AnswererID,AnswerTime,AnswerType,AnswerScore,QuestionType,Category,AskerID,QuestionTime,QuestionText
0,C1Q4,It will work on obd 1 pre 95 and obd2 after 95,A2O27R8PDI3COS,"July 21, 2014",,,yes/no,Automotive,A367QVRWPWFTLT,"September 3, 2013",will this work on 95 bonneville and 94 camaro
1,C1Q4,"Hi Dave:Sorry, but it won't; it claims to be a...",A7JW7QYDB0OUA,"July 22, 2014",,,yes/no,Automotive,A367QVRWPWFTLT,"September 3, 2013",will this work on 95 bonneville and 94 camaro
2,C1Q4,this unit comes with a variety of connectors. ...,A7EX1INTXN1WC,"July 23, 2014",,,yes/no,Automotive,A367QVRWPWFTLT,"September 3, 2013",will this work on 95 bonneville and 94 camaro
3,C1Q4,it is obd 1 and obd 2 compatable the kit has a...,A2O27R8PDI3COS,"May 2, 2014",?,0.9005,yes/no,Automotive,A367QVRWPWFTLT,"September 3, 2013",will this work on 95 bonneville and 94 camaro
4,C1Q4,Yes it will.,A3QPOMZXVYZXU2,"May 2, 2014",Y,0.9928,yes/no,Automotive,A367QVRWPWFTLT,"September 3, 2013",will this work on 95 bonneville and 94 camaro
...,...,...,...,...,...,...,...,...,...,...,...
45092,C17Q1063,I am running Puzzle Master 4 on my Windows 8 P...,APU42APYPYSIS,"July 19, 2014",,,yes/no,Video Games,A1IF5KV75EPYNU,"June 5, 2014",will this work on microsoft vista?
45093,C17Q1063,Yes it will wook on windows 7,AB6XQXOXUBP2Y,"June 19, 2014",,,yes/no,Video Games,A1IF5KV75EPYNU,"June 5, 2014",will this work on microsoft vista?
45094,C17Q1063,"I have it on windows 7, it works perfectly;I h...",A8MIP0LCX3XO5,"June 19, 2014",,,yes/no,Video Games,A1IF5KV75EPYNU,"June 5, 2014",will this work on microsoft vista?
45095,C17Q1063,"i'm using Microsoft 7, no idea about vista, mi...",A5N3BEUCQR6UA,"June 6, 2014",?,0.3766,yes/no,Video Games,A1IF5KV75EPYNU,"June 5, 2014",will this work on microsoft vista?


Number of unique questions: 1956


# Tokenizer for Dataset

In [33]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Original is cased
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

labels = {'yes/no':0,
          'open-ended':1
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['QuestionType']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 256, truncation=True,
                                return_tensors="pt") for text in df['QuestionText']]
        self.qn_ids = [qn_id for qn_id in df["QuestionID"]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def get_batch_qn_ids(self, idx):
        # Fetch a batch of inputs
        return self.qn_ids[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx) # encoded question text
        batch_y = self.get_batch_labels(idx) # question type label [yes/no, open-ended]
        batch_qn_ids = self.get_batch_qn_ids(idx) # question id

        return batch_texts, batch_y, batch_qn_ids

# train/val/test split

In [25]:
np.random.seed(112)
# train_data, val_data, test_data = np.split(data_df.sample(frac=1, random_state=42), 
#                                      [int(0.7*len(data_df)), int(0.85*len(data_df))])

train_data, val_data, test_data = np.split(data_df2.sample(frac=1, random_state=42), 
                                     [int(0.7*len(data_df2)), int(0.85*len(data_df2))])


print(f'Number of train data: {len(train_data)}')
print(f'Number of val data: {len(val_data)}')
print(f'Number of test data: {len(test_data)}')

Number of train data: 1369
Number of val data: 293
Number of test data: 294


# Define Bert classifier model

In [26]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        # self.bert = BertModel.from_pretrained('bert-base-uncased') # original is cased
        self.bert = DistilBertModel.from_pretrained('distilbert-base-cased')
        # self.bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        # For BERT
        # _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False) # [batch_size, hidden_size]
        # dropout_output = self.dropout(pooled_output)

        # For DistilBERT
        pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False) # [[batch_size, sequence_length, hidden_size]]
        dropout_output = self.dropout(pooled_output[0][:,0])

        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

# Train model

In [27]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.to(device)
            criterion = criterion.to(device)

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label, _ in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label, _ in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
            
    # Save model weights
    torch.save(model.state_dict(), 'QuestionType_Classifier.pt')
                  
EPOCHS = 20 # 5 for original BERT
model = BertClassifier()
LR = 1e-6 #1e-6 for original BERT
              
train(model, train_data, val_data, LR, EPOCHS)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 43/43 [00:28<00:00,  1.51it/s]


Epochs: 1 | Train Loss:  0.022                 | Train Accuracy:  0.491                 | Val Loss:  0.024                 | Val Accuracy:  0.549


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 2 | Train Loss:  0.022                 | Train Accuracy:  0.503                 | Val Loss:  0.023                 | Val Accuracy:  0.560


100%|██████████| 43/43 [00:26<00:00,  1.61it/s]


Epochs: 3 | Train Loss:  0.022                 | Train Accuracy:  0.495                 | Val Loss:  0.023                 | Val Accuracy:  0.495


100%|██████████| 43/43 [00:27<00:00,  1.59it/s]


Epochs: 4 | Train Loss:  0.022                 | Train Accuracy:  0.493                 | Val Loss:  0.023                 | Val Accuracy:  0.522


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 5 | Train Loss:  0.021                 | Train Accuracy:  0.499                 | Val Loss:  0.023                 | Val Accuracy:  0.570


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 6 | Train Loss:  0.021                 | Train Accuracy:  0.523                 | Val Loss:  0.022                 | Val Accuracy:  0.601


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 7 | Train Loss:  0.020                 | Train Accuracy:  0.603                 | Val Loss:  0.021                 | Val Accuracy:  0.727


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 8 | Train Loss:  0.016                 | Train Accuracy:  0.840                 | Val Loss:  0.015                 | Val Accuracy:  0.846


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 9 | Train Loss:  0.010                 | Train Accuracy:  0.898                 | Val Loss:  0.010                 | Val Accuracy:  0.887


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 10 | Train Loss:  0.007                 | Train Accuracy:  0.929                 | Val Loss:  0.008                 | Val Accuracy:  0.904


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 11 | Train Loss:  0.005                 | Train Accuracy:  0.952                 | Val Loss:  0.007                 | Val Accuracy:  0.915


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 12 | Train Loss:  0.004                 | Train Accuracy:  0.962                 | Val Loss:  0.006                 | Val Accuracy:  0.942


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 13 | Train Loss:  0.003                 | Train Accuracy:  0.971                 | Val Loss:  0.006                 | Val Accuracy:  0.939


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 14 | Train Loss:  0.003                 | Train Accuracy:  0.974                 | Val Loss:  0.006                 | Val Accuracy:  0.939


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 15 | Train Loss:  0.002                 | Train Accuracy:  0.980                 | Val Loss:  0.006                 | Val Accuracy:  0.945


100%|██████████| 43/43 [00:26<00:00,  1.59it/s]


Epochs: 16 | Train Loss:  0.002                 | Train Accuracy:  0.980                 | Val Loss:  0.005                 | Val Accuracy:  0.939


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 17 | Train Loss:  0.002                 | Train Accuracy:  0.985                 | Val Loss:  0.005                 | Val Accuracy:  0.942


100%|██████████| 43/43 [00:26<00:00,  1.59it/s]


Epochs: 18 | Train Loss:  0.002                 | Train Accuracy:  0.986                 | Val Loss:  0.005                 | Val Accuracy:  0.942


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 19 | Train Loss:  0.001                 | Train Accuracy:  0.988                 | Val Loss:  0.005                 | Val Accuracy:  0.952


100%|██████████| 43/43 [00:26<00:00,  1.60it/s]


Epochs: 20 | Train Loss:  0.001                 | Train Accuracy:  0.991                 | Val Loss:  0.005                 | Val Accuracy:  0.949


# Evaluate against test set

In [34]:
def evaluate(model, test_data):

    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0

    qn_ids = []
    qn_encoded_text = torch.zeros(0,256).to(device) # 256 is the max length of tokenizer
    qn_type = torch.zeros(0).to(device)
    qn_predicted_type = torch.zeros(0).to(device)

    with torch.no_grad():

        for test_input, test_label, test_qn_id in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device) # Tokenized sentences

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc

              qn_ids = np.append(qn_ids, test_qn_id, axis=0)
              qn_encoded_text = torch.cat((qn_encoded_text, input_id), dim=0)
              qn_type = torch.cat((qn_type, test_label), dim=0)
              qn_predicted_type = torch.cat((qn_predicted_type, output.argmax(dim=1)), dim=0)
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .4f}')

    return qn_ids, qn_encoded_text, qn_type, qn_predicted_type

# # Comment this out if NOT testing other models
# model = BertClassifier()
# model.load_state_dict(torch.load('QuestionType_Classifier.pt'))


# evaluate(model, test_data)
qn_ids, qn_encoded_text, qn_type, qn_predicted_type = evaluate(model, test_data) # save output
'''
qn_ids -> QuestionID
qn_encoded_text -> Encoded QuestionText
qn_type -> QuestionType
qn_predicted_type -> Prediction of QuestionType
'''

print("Test_qn_id | QuestionType | Predicted QuestionType")
for j in range(10):
    idx = np.random.randint(0, len(test_data))
    print(f"{qn_ids[idx]} | {qn_type[idx]} | {qn_predicted_type[idx]}")

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Test Accuracy:  0.9728
Test_qn_id | QuestionType | Predicted QuestionType
C1Q7903 | 1.0 | 1.0
C14Q12883 | 1.0 | 1.0
C9Q5778 | 1.0 | 1.0
C14Q12431 | 0.0 | 1.0
C15Q9712 | 1.0 | 1.0
C6Q3258 | 1.0 | 1.0
C5Q582 | 1.0 | 1.0
C1Q9962 | 0.0 | 0.0
C3Q3047 | 1.0 | 1.0
C6Q15996 | 1.0 | 1.0


# Test random question

In [51]:
d = {'QuestionText': ['what is the production quality'],
     'QuestionType': ['open-ended'], #open-ended or yes/no
     'QuestionID': ['rgrd']}
minitest = pd.DataFrame(data=d)
display(minitest)
test2 = Dataset(minitest)
test2_loader = torch.utils.data.DataLoader(test2, batch_size=16)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = BertClassifier()
model.load_state_dict(torch.load(model_name))


if use_cuda:

    model = model.cuda()

total_acc_test = 0
with torch.no_grad():
    for test_input, test_label, test_qn_id in test2_loader:

        test_label = test_label.to(device)
        mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        acc = (output.argmax(dim=1) == test_label).sum().item()
        total_acc_test += acc

print("Test_qn_id | Test_label | output from model")
for j in range(len(test_label)):
    print(f"{test_qn_id[j]} | {test_label[j]} | {output.argmax(dim=1)[j]}")


Unnamed: 0,QuestionText,QuestionType,QuestionID
0,what is the production quality,open-ended,rgrd


Test_qn_id | Test_label | output from model
rgrd | 1 | 1
