In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel,BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [2]:
!pip install simpletransformers streamlit



In [3]:
kb_df = pd.read_excel('/content/ClientABC _ ATB Financial_Knowledge Base.xlsx',"Data Sheet")
q_df = pd.read_excel('/content/Industry Standard Questionnaires.xlsx',header=None)

In [4]:
def preprocess(df):
    cols = list(df)[:2]
    df[cols] = df[cols].ffill(axis=0)
    df = df.iloc[4:]
    #df = df.dropna().reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    return df

In [5]:
kb_df = preprocess(kb_df)
q_df = q_df.rename(columns={0: 'question'})
q_df.head()

Unnamed: 0,question
0,Do you restrict access to cardholder data by b...
1,Do you identify and authenticate access to sys...
2,Do you restrict physical access to cardholders...
3,Does your business use network segmentation to...
4,Do you install and maintain a firewall configu...


In [6]:
kb_df['Answer'] = kb_df['Answer'].fillna('unanswerable')
kb_df.head()

4,Section Heading,Control Heading,Original ID,Question Text,Answer,Notes/Comment,identifier
5,Service Overview,Service Scope Question,3.3,What technology languages/platforms/stacks/com...,"""ClientABC"" database, Java, JavaScript, GO. Cl...",,527c246f-dc2e-4873-a1d3-64a8ce1dda1b
6,Service Overview,Service Hosting,4.1,Is your service run from your own:\n- A. Data ...,B. The Cloud,,909e4666-14a8-4a3f-8122-3f978c32caab
7,Service Overview,Service Hosting,4.2,Data Centre Location(s) (relative to services ...,"""ClientABC"" is available within the Amazon We...",,9fd246fc-5ece-45e2-a0f8-9c34e6f3a8a9
8,Service Overview,Service Hosting,4.3,Which cloud providers do you rely on?,Our data centers are hosted on the cloud via A...,,9e7ff978-612d-40d2-a18d-059389e820bb
9,Service Overview,Service Hosting,4.4,Have you researched your cloud providers best ...,Yes,,a0aa89b4-8bd4-4b8d-a887-8d3d68da63c3


In [7]:
section_label_encoder = LabelEncoder()
control_label_encoder = LabelEncoder()

kb_df['Section Heading Encoded'] = section_label_encoder.fit_transform(kb_df['Section Heading'])
kb_df['Control Heading Encoded'] = control_label_encoder.fit_transform(kb_df['Control Heading'])

reverse_section_label_mapping = dict(enumerate(section_label_encoder.classes_))
reverse_control_label_mapping = dict(enumerate(control_label_encoder.classes_))

class CustomDataset(Dataset):
    def __init__(self, questions, section_labels, control_labels, tokenizer, max_len):
        self.questions = questions
        self.section_labels = section_labels
        self.control_labels = control_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        section_label = self.section_labels[idx]
        control_label = self.control_labels[idx]

        encoding = self.tokenizer.encode_plus(
            question,
            max_length=self.max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'section_labels': torch.tensor(section_label, dtype=torch.long),
            'control_labels': torch.tensor(control_label, dtype=torch.long),
        }

train_df, test_df = train_test_split(kb_df, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(section_label_encoder.classes_) + len(control_label_encoder.classes_))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_dataset = CustomDataset(
    train_df['Question Text'].values,
    train_df['Section Heading Encoded'].values,
    train_df['Control Heading Encoded'].values,
    tokenizer,
    max_len=128
)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 25

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        section_labels = batch['section_labels'].to(device)
        control_labels = batch['control_labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        section_logits = logits[:, :len(section_label_encoder.classes_)]
        control_logits = logits[:, len(section_label_encoder.classes_):]

        loss_fn = torch.nn.CrossEntropyLoss()
        section_loss = loss_fn(section_logits, section_labels)
        control_loss = loss_fn(control_logits, control_labels)
        loss = (section_loss + control_loss) / 2

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Loss: {total_loss / len(train_loader)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/25
Loss: 2.8853137898952403
Epoch 2/25
Loss: 2.71793210760076
Epoch 3/25
Loss: 2.5385308240322355
Epoch 4/25
Loss: 2.306800479584552
Epoch 5/25
Loss: 2.066044754170357
Epoch 6/25
Loss: 1.8382386755436024
Epoch 7/25
Loss: 1.6354548347757218
Epoch 8/25
Loss: 1.4652294605336291
Epoch 9/25
Loss: 1.2921198923537072
Epoch 10/25
Loss: 1.143654744675819
Epoch 11/25
Loss: 1.0272834675109133
Epoch 12/25
Loss: 0.9242236221090276
Epoch 13/25
Loss: 0.8379020887486478
Epoch 14/25
Loss: 0.7895138149565839
Epoch 15/25
Loss: 0.7120596290902889
Epoch 16/25
Loss: 0.6256768906370123
Epoch 17/25
Loss: 0.6009265000515795
Epoch 18/25
Loss: 0.5417224359005055
Epoch 19/25
Loss: 0.5039685130753415
Epoch 20/25
Loss: 0.4814125327987874
Epoch 21/25
Loss: 0.4281958969349557
Epoch 22/25
Loss: 0.3913832107122908
Epoch 23/25
Loss: 0.3757855283453109
Epoch 24/25
Loss: 0.3332609894110801
Epoch 25/25
Loss: 0.31463202041514376


In [8]:
def predict_section_control(model, tokenizer, question, threshold=0.5, max_len=128):
    model.eval()
    encoding = tokenizer.encode_plus(
        question,
        max_length=max_len,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    section_logits = logits[:, :len(section_label_encoder.classes_)]
    control_logits = logits[:, len(control_label_encoder.classes_):]

    section_probs = F.softmax(section_logits, dim=1)
    control_probs = F.softmax(control_logits, dim=1)

    section_confidence, section_pred = torch.max(section_probs, dim=1)
    control_confidence, control_pred = torch.max(control_probs, dim=1)

    if section_confidence.item() < threshold:
        section_label = "Unanswerable"
    else:
        section_label = reverse_section_label_mapping[section_pred.item()]

    if control_confidence.item() < threshold:
        control_label = "Unanswerable"
    else:
        control_label = reverse_control_label_mapping[control_pred.item()]

    return section_label, control_label

def process_question(question, model, tokenizer):
  section_pred, control_pred = predict_section_control(model, tokenizer, question)
  return section_pred, control_pred

q_df[['section_pred', 'control_pred']] = q_df['question'].apply(process_question, args=(model, tokenizer)).tolist()


In [9]:
q_df.head()

Unnamed: 0,question,section_pred,control_pred
0,Do you restrict access to cardholder data by b...,Data Protection & Access Control,Background Checks
1,Do you identify and authenticate access to sys...,Proactive Security,Confidentiality
2,Do you restrict physical access to cardholders...,Data Protection & Access Control,Background Checks
3,Does your business use network segmentation to...,Proactive Security,Confidentiality
4,Do you install and maintain a firewall configu...,Proactive Security,Confidentiality


In [10]:
model.save_pretrained("./sectioncontrol/model")

tokenizer.save_pretrained("./sectioncontrol/tokenizer")

('./sectioncontrol/tokenizer/tokenizer_config.json',
 './sectioncontrol/tokenizer/special_tokens_map.json',
 './sectioncontrol/tokenizer/vocab.txt',
 './sectioncontrol/tokenizer/added_tokens.json')

In [11]:
train_questions_answers = []
train_contexts = []

for index, row in kb_df.iterrows():
    if pd.isna(row['Notes/Comment']):
        context = f"{row['Answer']}"
    else:
        context = f"{row['Notes/Comment']} | {row['Answer']}"

    question_answer = {
        "context": context,
        "question": row['Question Text'],
        "answer": row['Answer']
    }

    train_questions_answers.append(question_answer)
    if context not in train_contexts:
        train_contexts.append(context)

train_data = []

for context in train_contexts:
    qas = []
    for qa in train_questions_answers:
        if qa["context"] == context:
            answer_start = context.find(qa["answer"])
            if answer_start != -1:
                qas.append({
                    "id": str(len(qas) + 1).zfill(5),
                    "is_impossible": False,
                    "question": qa["question"],
                    "answers": [
                        {
                            "text": qa["answer"],
                            "answer_start": answer_start,
                        }
                    ],
                })
    train_data.append({
        "context": context,
        "qas": qas,
    })

print(train_data[0])

{'context': '"ClientABC" database, Java, JavaScript, GO. Cloud infrastructure is provided by AWS, Azure, and GCP.  Cluster nodes run CentOS and host the "ClientABC" database. "ClientABC"  is written in Java, JavaScript, and Go.\n', 'qas': [{'id': '00001', 'is_impossible': False, 'question': 'What technology languages/platforms/stacks/components are utilized in the scope of the application? (AWS? MySQL? Ruby on Rails? Go? Javascript?)', 'answers': [{'text': '"ClientABC" database, Java, JavaScript, GO. Cloud infrastructure is provided by AWS, Azure, and GCP.  Cluster nodes run CentOS and host the "ClientABC" database. "ClientABC"  is written in Java, JavaScript, and Go.\n', 'answer_start': 0}]}]}


In [12]:
import json

with open('train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

In [13]:
train_args = {
    'overwrite_output_dir': True,
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 50,
    "evaluate_during_training_steps": 500,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size":16,
    "train_batch_size": 16,
    "eval_batch_size": 16
}

In [14]:
model_qa = QuestionAnsweringModel("bert",
                               "bert-base-uncased",
                               args = train_args,
                               use_cuda=True)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"finetune-BERT-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=25,
    weight_decay=0.01,
)



In [16]:
traindata, valdata = train_test_split(train_data, test_size=0.15, random_state=42)

In [17]:
model_qa.train_model(traindata, eval_data=valdata)

  self.pid = os.fork()
convert squad examples to features: 100%|██████████| 74/74 [00:00<00:00, 373.79it/s]
add example index and unique id: 100%|██████████| 74/74 [00:00<00:00, 197945.47it/s]


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 1 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 964.19it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 289959.92it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 2 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 952.62it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 297615.63it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 3 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 430.92it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 260628.72it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 4 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 463.95it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 27039.74it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 5 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features:   0%|          | 0/43 [00:00<?, ?it/s][A
convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 373.33it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 295180.15it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 6 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 743.98it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 274095.85it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 7 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 671.38it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 310421.81it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 8 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 958.47it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 302609.18it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 9 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 829.18it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 311494.08it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 10 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 896.48it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 301093.61it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 11 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 728.91it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 280490.00it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 12 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 796.07it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 303117.77it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 13 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 912.77it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 94229.40it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 14 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 782.18it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 368824.28it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 15 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 890.55it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 316968.49it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 16 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 470.75it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 252244.86it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 17 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 565.94it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 290895.28it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 18 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 529.41it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 287647.64it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 19 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 735.09it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 267589.13it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 20 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 903.58it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 215735.73it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 21 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 640.58it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 259130.85it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 22 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 774.15it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 282245.81it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 23 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 959.70it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 151431.63it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 24 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 763.40it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 347504.96it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 25 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 950.85it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 248422.96it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 26 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 740.80it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 209471.63it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 27 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 881.68it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 89152.28it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 28 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 961.01it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 303628.07it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 29 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 775.83it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 303628.07it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 30 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 472.82it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 251892.56it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 31 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 560.58it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 244715.16it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 32 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 758.14it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 304140.09it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 33 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 869.52it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 227147.45it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 34 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 836.25it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 223765.60it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 35 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 985.33it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 302609.18it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 36 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 792.67it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 276194.60it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 37 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 762.91it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 301093.61it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 38 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 852.15it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 284921.12it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 39 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 923.12it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 261763.53it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 40 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 847.52it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 218877.51it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 41 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 770.48it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 313116.44it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 42 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 777.12it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 276194.60it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 43 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 912.30it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 115834.99it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 44 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 540.01it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 296636.63it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 45 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 504.05it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 149796.57it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 46 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 475.61it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 232416.33it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 47 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 833.74it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 224601.58it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 48 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 981.35it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 287647.64it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 49 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 994.56it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 303117.77it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 50 of 50:   0%|          | 0/5 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 756.13it/s]

add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 292309.68it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

(250,
 {'global_step': [5,
   10,
   15,
   20,
   25,
   30,
   35,
   40,
   45,
   50,
   55,
   60,
   65,
   70,
   75,
   80,
   85,
   90,
   95,
   100,
   105,
   110,
   115,
   120,
   125,
   130,
   135,
   140,
   145,
   150,
   155,
   160,
   165,
   170,
   175,
   180,
   185,
   190,
   195,
   200,
   205,
   210,
   215,
   220,
   225,
   230,
   235,
   240,
   245,
   250],
  'correct': [29,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33,
   33],
  'similar': [4,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [18]:
result, texts = model_qa.eval_model(valdata)
print(result)

convert squad examples to features: 100%|██████████| 43/43 [00:00<00:00, 826.64it/s]
add example index and unique id: 100%|██████████| 43/43 [00:00<00:00, 299096.31it/s]


Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

{'correct': 33, 'similar': 0, 'incorrect': 0, 'eval_loss': -6.54296875}


In [27]:
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

model = QuestionAnsweringModel("bert", "/content/outputs/best_model")

classifications = []
unanswerable_questions = []

In [26]:
def get_context(section_heading, control_heading, df):
    filtered_df = df[(df['Section Heading'] == section_heading) | (df['Control Heading'] == control_heading)]
    notes = filtered_df['Notes/Comment'].dropna().tolist()

    a= ' '.join(notes) if notes else ''
    return a

def classify_question(question, section_heading, control_heading, df):
    context = get_context(section_heading, control_heading, df)
    if not context.strip():
        return 'unanswerable'
    to_predict = [
        {
            "context": context,
            "qas": [
                {
                    "question": question,
                    "id": "0",
                }
            ],
        }
    ]

    answers, probabilities = model.predict(to_predict, n_best_size=2)
    print(answers)
    print(probabilities)
    if answers[0]['answer'] == '':
        return 'unanswerable'
    elif probabilities[0]['probability'][0] > 0.6:
        return 'answerable'
    else:
        return 'ambiguous'


In [28]:
classifications = []
unanswerable_questions = []

for _, row in q_df.iterrows():
    question = row['question']
    section_heading = row['section_pred']
    control_heading = row['control_pred']

    classification = classify_question(question, section_heading, control_heading, kb_df)
    classifications.append(classification)

    if classification == 'unanswerable':
        unanswerable_questions.append(question)



convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 46.64it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 7781.64it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All']}]
[{'id': '0', 'probability': [0.838117441299773]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 11.27it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 6831.11it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All', 'Our']}]
[{'id': '0', 'probability': [0.4703343931810777, 0.4657636383365959]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 26.94it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10058.28it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Paper', 'All data in transit is encrypted using TLS 1.2 or higher and all data at rest is encrypted using AES-256. There is a small range of paper-based information assets in use within "ClientABC", including "ClientABC" contracts, legal and procurement forms, general working information (printed documents, e-mails etc.). Paper']}]
[{'id': '0', 'probability': [0.4629010720260712, 0.46244924085451833]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00,  7.88it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 9341.43it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Our', "Our Customer's data will reside in the cloud and not onsite. The data our customers store in is arbitrary and unstructured. Also, we do not inspect this data. Therefore"]}]
[{'id': '0', 'probability': [0.4692758480592847, 0.4557269711520748]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 19.31it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 11244.78it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All', 'Therefore']}]
[{'id': '0', 'probability': [0.47412564764376, 0.4622388463738294]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 36.78it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 8192.00it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Paper', 'We']}]
[{'id': '0', 'probability': [0.7339245479682784, 0.2255898951563501]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 90.06it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10866.07it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['When a customer terminates an Cluster, the following happens: it will become unavailable immediately; "ClientABC", Inc. may retain a copy of the data for up to 5 days; the backup associated with the managed cluster is also terminated. If', 'When']}]
[{'id': '0', 'probability': [0.495626379332873, 0.4548123313988001]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 23.28it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10699.76it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All data at rest is encrypted using AES-256. User credentials for the "ClientABC" UI are stored using industry standard and audited one-way hashes. "ClientABC"', 'All data is fully encrypted from a filesystem perspective In transit, our data is encrypted with TLS 1.2. All data at rest is encrypted using AES-256. User credentials for the "ClientABC" UI are stored using industry standard and audited one-way hashes. "ClientABC"']}]
[{'id': '0', 'probability': [0.4861140348609743, 0.4593454819893264]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 61.67it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 11554.56it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['In']}]
[{'id': '0', 'probability': [0.8370879625470882]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 22.61it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 9404.27it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All', "Our Customer's data will reside in the cloud and not onsite. The data our customers store in is arbitrary and unstructured. Also, we do not inspect this data. Therefore"]}]
[{'id': '0', 'probability': [0.4698517075670312, 0.4630192785719352]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 39.13it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10951.19it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All data in transit is encrypted using TLS 1.2 or higher and all data at rest is encrypted using AES-256. There is a small range of paper-based information assets in use within "ClientABC", including "ClientABC" contracts, legal and procurement forms, general working information (printed documents, e-mails etc.). Paper']}]
[{'id': '0', 'probability': [0.8397339676722393]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00,  8.92it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10180.35it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All', 'We']}]
[{'id': '0', 'probability': [0.4524129555240598, 0.4462697126630327]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 55.41it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 11748.75it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All']}]
[{'id': '0', 'probability': [0.8378522729193348]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 36.65it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 8867.45it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All']}]
[{'id': '0', 'probability': [0.8453369372549742]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 19.71it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 8128.50it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Our', 'All']}]
[{'id': '0', 'probability': [0.4746532300761897, 0.4641109119232209]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 24.32it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10255.02it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Our', 'All']}]
[{'id': '0', 'probability': [0.495640680824592, 0.44646412184370843]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 12.92it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 12595.51it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Therefore', 'All data at rest is encrypted using AES-256. User']}]
[{'id': '0', 'probability': [0.49285272089245313, 0.45983809094919303]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 24.73it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 9157.87it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['All new features and changes made to the product and its platform undergo a formal design and scoping process. As', 'We follow NIST and OWASP guidelines for our software development. Engineers test for vulnerabilities, including dependencies and third-party packages, before they release code. All software projects undergo security review annually or more often by "ClientABC"\'s dedicated security team or third-party security service, using fuzz testing and static analysis. All new features and changes made to the product and its platform undergo a formal design and scoping process. As']}]
[{'id': '0', 'probability': [0.45365102407562463, 0.4479281860304597]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00,  9.66it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 11715.93it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ["Our Customer's data will reside in the cloud and not onsite. The data our customers store in is arbitrary and unstructured. Also, we do not inspect this data. Therefore", 'Also, we do not inspect this data. Therefore']}]
[{'id': '0', 'probability': [0.4800297475659502, 0.4562618266919518]}]


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 38.40it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 11428.62it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Officer. In']}]
[{'id': '0', 'probability': [0.8017030553199359]}]


In [29]:
total_questions = len(classifications)
answerable_questions = classifications.count('answerable')
completion_percentage = (answerable_questions / total_questions) * 100

print(f"Completion Percentage: {completion_percentage:.2f}%")
print(f"\nTotal Questions: {total_questions}")
print(f"Answerable Questions: {answerable_questions}")
print(f"Unanswerable Questions: {classifications.count('unanswerable')}")
print(f"Ambiguous Questions: {classifications.count('ambiguous')}")

print("\nUnanswerable Questions:")
for q in unanswerable_questions:
    print(f"- {q}")

Completion Percentage: 26.92%

Total Questions: 26
Answerable Questions: 7
Unanswerable Questions: 6
Ambiguous Questions: 13

Unanswerable Questions:
- Do you track and monitor all access to network resources and cardholder data?
- Which types of facilities, such as retail outlets, corporate offices, data centers, call centers, and so on are included in the Payment Card Industry Data Security Standard (PCI DSS) review, and could you provide a summary of their locations?
- Can you list services that were included in the scope of the Payment Card Industry Data Security Standard (PCI DSS) Assessment?
- Can you list services that are provided by the service provider but were not included in the scope of the Payment Card Industry Data Security Standard (PCI DSS) Assessment?
- Does your company have a relationship with a Qualified Integrator and Reseller (QIR) for the purpose of the services being validated? If yes, can you provide the name of the Qualified Integrator and Reseller (QIR) comp

In [23]:
!npm install localtunnel

[K[?25h
added 22 packages, and audited 23 packages in 2s

3 packages are looking for funding
  run `npm fund` for details

2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues, run:
  npm audit fix

Run `npm audit` for details.


In [24]:
!streamlit run app.py &>/content/logs.txt &

In [25]:
!npx localtunnel --port 8501

your url is: https://quiet-laws-sort.loca.lt
^C
