In [None]:
# Mount Google Drive inside the Colab VM at /content/drive (Colab-only; requires interactive authorisation).
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pwd

'/content'

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue May 21 05:03:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Below 20 GB the runtime is reported as a standard (non high-RAM) one.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
cd /content/drive/MyDrive/Chatbot_InsuranceQA

/content/drive/.shortcut-targets-by-id/1z6j7eyH00Do8s4ncqmY2gDD8bpCZSNGQ/Chatbot_InsuranceQA


In [None]:
pwd

'/content/drive/.shortcut-targets-by-id/1z6j7eyH00Do8s4ncqmY2gDD8bpCZSNGQ/Chatbot_InsuranceQA'

In [None]:
# Pin transformers to 4.38.2 (the version reported by the `pip list` cell at the end of the notebook).
# NOTE(review): a later cell installs `transformers[torch]==4.38.2`, which appears to make this install redundant.
!pip install transformers==4.38.2

In [None]:
# Document parsers: PyPDF2 provides PdfReader (used by read_pdf), python-docx provides `docx` (used by read_word).
# NOTE(review): neither package is version-pinned, so the installed versions can change between runs.
!pip install -U PyPDF2
!pip install python-docx



In [None]:
# Same transformers pin as the earlier install cell, this time requesting the `torch` extra.
!pip install transformers[torch]==4.38.2

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
from PyPDF2 import PdfReader
import os
import docx
import random
import torch
from tqdm import tqdm
import shutil
from torch.autograd import Variable
from tqdm.notebook import tqdm_notebook as tqdmnb
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import EarlyStoppingCallback, TrainerCallback
from transformers import get_cosine_schedule_with_warmup
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TrainerCallback

In [None]:
# Functions to read different file types
def read_pdf(file_path):
    """Return the extracted text of every page of the PDF at ``file_path``.

    Pages are concatenated in page order with no separator between them.
    """
    with open(file_path, "rb") as pdf_handle:
        reader = PdfReader(pdf_handle)
        # Extract while the file is still open; the reader pulls page data from the handle.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

def read_word(file_path):
    """Return the text of the Word document at ``file_path``.

    Every paragraph contributes its text followed by a newline, in document order.
    """
    document = docx.Document(file_path)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_txt(file_path):
    """Return the full contents of the text file at ``file_path`` (platform default encoding)."""
    with open(file_path, "r") as handle:
        return handle.read()

def read_documents_from_directory(directory, type = "train"):
    """Concatenate the text of every ``<type>_QA*`` document found in ``directory``.

    Args:
        directory: Folder to scan (top level only).
        type: Split prefix; only files whose name starts with ``f"{type}_QA"`` are read.
            The parameter name shadows the builtin ``type`` but is kept so existing
            keyword callers keep working.

    Returns:
        A single string with the contents of all matching ``.pdf``, ``.docx`` and
        ``.txt`` files, in sorted file-name order. Matching files with any other
        extension are ignored.
    """
    prefix = f"{type}_QA"
    parts = []
    # os.listdir() returns entries in arbitrary order; sorting makes the combined
    # text identical from run to run and from machine to machine.
    for filename in sorted(os.listdir(directory)):
        if not filename.startswith(prefix):
            continue
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            parts.append(read_pdf(file_path))
        elif filename.endswith(".docx"):
            parts.append(read_word(file_path))
        elif filename.endswith(".txt"):
            parts.append(read_txt(file_path))
    return "".join(parts)


In [None]:
ls

AkashUI.ipynb                    [0m[01;34mlogs[0m/                      train_dataset.csv
app.ipynb                        [01;34mmodel[0m/                     validation_dataset.csv
[01;34mdata[0m/                            project_architecture.gdoc  vectorstore.index
fine_tuned_distilbert_model.pth  RAJ_Copy_of_app.ipynb      [01;34mwandb[0m/
htmlTemplates.py                 test_dataset.csv


# Data Processing & Augmentation

In [None]:


"""
The candidate answer pool is generated by SOLR.
File names that include "raw" contain the text in its original version.
File names that include "token" contain the text tokenized with the Stanford Tokenizer.
The whole corpus is split into three parts: train/valid/test. File names that include train/valid/test correspond to each part.

For all tokens starting with idx_ , please refer to the vocabulary file for the corresponding word.

For all train/valid/test files, format is same, with various answer pool size:
 <Domain><TAB><QUESTION><TAB><Groundtruth><TAB><Pool>

For InsuranceQA.question.anslabel.*:
<Domain><TAB><QUESTION><TAB><Groundtruth>

For InsuranceQA.label2answer.*
<Answer Label><TAB><Answer Text>

For vocabulary file:
<word index><TAB><original word>
"""


def load_vocabulary(vocab_path, label_path):
    """Load the token vocabulary and the label-to-answer tables.

    Args:
        vocab_path: File with one ``<word index><TAB><original word>`` entry per line.
        label_path: File with one ``<answer label><TAB><encoded answer>`` entry per line,
            where the encoded answer is a space-separated list of word indices.

    Returns:
        ``(id_to_word, label_to_ans, label_to_ans_text)``:
        word index -> word, answer label -> encoded answer string, and
        answer label -> list of decoded words. When an index or label occurs
        more than once, the first occurrence is kept.
    """
    with open(vocab_path) as vocab_file:
        vocab_lines = vocab_file.readlines()

    id_to_word = {}
    for raw_line in tqdm(vocab_lines, desc='Loading Vocabulary'):
        fields = raw_line.rstrip().split('\t')
        word_id = fields[0]
        if word_id in id_to_word:
            continue  # keep the first mapping for a repeated index
        id_to_word[word_id] = fields[1]

    with open(label_path) as label_file:
        label_lines = label_file.readlines()

    label_to_ans, label_to_ans_text = {}, {}
    for raw_line in tqdm(label_lines, desc='Loading Labels'):
        label, answer = raw_line.rstrip().split('\t')
        if label in label_to_ans:
            continue  # keep the first answer for a repeated label
        label_to_ans[label] = answer
        # Decode the space-separated word indices back into words.
        label_to_ans_text[label] = [id_to_word[token] for token in answer.split(' ')]

    return id_to_word, label_to_ans, label_to_ans_text


def data_load(fpath, id_to_word, label_to_ans_text, type = "Training", label = False):
    """Parse an encoded InsuranceQA split file into ``(question, ground truth, candidates)`` tuples.

    Each line is split on tabs and read as ``<Domain> <Question> <Groundtruth> <Pool>``;
    the question is a space-separated list of word indices, ground truth and pool
    are space-separated lists of answer labels.

    Args:
        fpath: Path of the split file.
        id_to_word: Mapping word index -> word, used to decode the question.
        label_to_ans_text: Mapping answer label -> list of words.
        type: Name shown in the progress-bar description only.
        label: When truthy, ground truth and candidates are returned as raw labels
            (the pool is returned unfiltered). Otherwise they are decoded to word
            lists and ground-truth answers are removed from the candidate pool.

    Returns:
        List of ``(question_words, ground_truth, candidates)`` tuples, one per line.
    """
    with open(fpath) as f:
        lines = f.readlines()

    data = []
    for line in tqdm(lines, desc = f'Loading {type} Data'):
        fields = line.rstrip().split('\t')
        question = [id_to_word[t] for t in fields[1].split(' ')]
        truth_labels = fields[2].split(' ')
        pool_labels = fields[3].split(' ')
        if label:
            poss = truth_labels
            cands = pool_labels
        else:
            # Compare whole labels. Testing against the raw ground-truth string was a
            # substring match, which also dropped candidate "1" when the ground truth was "12".
            truth_set = set(truth_labels)
            poss = [label_to_ans_text[t] for t in truth_labels]
            cands = [label_to_ans_text[t] for t in pool_labels if t not in truth_set]
        data.append((question, poss, cands))
    return data


# data load main: locations of the encoded InsuranceQA V2 files (relative to the project folder)
paths = dict(
    vocab_path='data/V2/vocabulary',
    label_path='data/V2/InsuranceQA.label2answer.token.encoded',
    train_path='data/V2/InsuranceQA.question.anslabel.token.500.pool.solr.train.encoded',
)

# The label -> encoded-answer map (second return value) is not used in this cell.
id_to_word, _, label_to_ans_text = load_vocabulary(
    vocab_path=paths['vocab_path'],
    label_path=paths['label_path'],
)
train = data_load(
    fpath=paths['train_path'],
    id_to_word=id_to_word,
    label_to_ans_text=label_to_ans_text,
)
# One row per training question; every cell holds token lists rather than plain strings.
train_df = pd.DataFrame(train, columns=['Question', 'Groundtruth', 'Candidate_Pool'])

Loading Vocabulary: 100%|██████████| 68580/68580 [00:00<00:00, 475970.85it/s]
Loading Labels: 100%|██████████| 27413/27413 [00:02<00:00, 9883.47it/s] 
Loading Training Data: 100%|██████████| 12889/12889 [00:10<00:00, 1284.96it/s]


In [None]:
# Peek at the first two rows: Question is a token list, Groundtruth and Candidate_Pool are lists of token lists.
train_df.head(2)

Unnamed: 0,Question,Groundtruth,Candidate_Pool
0,"[Is, Disability, Insurance, Required, By, Law, ?]","[[Not, generally, ., There, are, five, states,...","[[Life, insurance, unlike, health, insurance, ..."
1,"[Can, Creditors, Take, Life, Insurance, After,...","[[If, the, person, who, passed, away, was, the...","[[Creditors, can, not, go, after, the, life, i..."


In [None]:
"""
Get Complete Training QA data in txt format
"""

os.makedirs('data/final', exist_ok=True)

# Path to the output text file
train_file_path = 'data/final/train_complete_QA.txt'

# Mode 'w' creates the file when it is missing and truncates it otherwise, so no
# separate existence check / exclusive-create step is needed.
with open(train_file_path, 'w') as file:
    # One "[Q] ...\n[A] ...\n\n" record per row; only the first ground-truth answer is exported.
    for question_tokens, truth_answers in zip(train_df['Question'], train_df['Groundtruth']):
        file.write("[Q] " + " ".join(question_tokens) + "\n[A] " + " ".join(truth_answers[0]) + "\n\n")

# Report the actual number of rows written instead of a hard-coded count.
print(f"Sample Train file has been written successfully for {len(train_df)} records.")

Sample Train file has been written successfully for 12889 records.


In [None]:
"""
Get Subset of Training QA data in txt format for Valid and Test Set
"""

train_file_path = 'data/final/train_6050_QA.txt'
# First 6050 training rows; their questions are the ones split into chunks for paraphrasing further down.
train_subset_df = train_df.iloc[:6050]

# Mode 'w' already creates/truncates the file, so no separate exclusive-create step is needed.
with open(train_file_path, 'w') as file:
    # Same "[Q] ...\n[A] ...\n\n" record layout as the complete training file (first ground-truth answer only).
    for question_tokens, truth_answers in zip(train_subset_df['Question'], train_subset_df['Groundtruth']):
        file.write("[Q] " + " ".join(question_tokens) + "\n[A] " + " ".join(truth_answers[0]) + "\n\n")

# Report the actual number of rows written (fewer than 6050 if train_df is shorter).
print(f"Sample Train file has been written successfully for {len(train_subset_df)} records.")

Sample Train file has been written successfully for 6050 records.


In [None]:
"""
Splitting the training data into multiple chunks that contain only the questions, which are later used for data augmentation
by converting each original question into a paraphrased version.
"""

os.makedirs('data/only_questions', exist_ok=True)

# One file per block of 50 questions, named after the index of its first row.
for i in range(0,6050,50):
    train_file_path = f'data/only_questions/train_Q_{i}.txt'
    # Slice the rows of this chunk directly instead of re-scanning the DataFrame from
    # row 0 for every chunk. Positional slicing matches the original label comparison
    # because train_df carries the default 0..n-1 index.
    chunk_questions = train_df['Question'].iloc[i:i+50]
    # Mode 'w' already creates/truncates the file, so no exclusive-create step is needed.
    with open(train_file_path, 'w') as file:
        for question_tokens in chunk_questions:
            file.write("[Q] "+" ".join(question_tokens)+"\n")
    print("Train "+str(i)+" file has been written successfully.")

In [None]:
"""
The question files generated above were paraphrased with the OpenAI API (done locally, outside this notebook); the paraphrased files are merged below.
"""
# step 1: Merge all these files into a single file holding every paraphrased question
# 50 paraphrased questions per file
directory = 'data/final_questions'
# Keep only "<number>.txt" files and order them numerically (0, 50, 100, ...) rather than
# lexicographically, so the merged questions stay in original row order.
txt_files = sorted(
    (f for f in os.listdir(directory) if f.endswith('.txt') and f.split('.')[0].isdigit()),
    key=lambda x: int(x.split('.')[0]),
)

print(txt_files)
with open('data/final/merged_questions.txt', 'w') as outfile: # file with all Paraphrased Question #Dimen: 6050 Q
    for txt_file in txt_files:
        with open(os.path.join(directory, txt_file), 'r') as infile:
            # A newline is appended after each chunk so consecutive chunks do not run together.
            outfile.write(infile.read() + '\n')

['0.txt', '50.txt', '100.txt', '150.txt', '200.txt', '250.txt', '300.txt', '350.txt', '400.txt', '450.txt', '500.txt', '550.txt', '600.txt', '650.txt', '700.txt', '750.txt', '800.txt', '850.txt', '900.txt', '950.txt', '1000.txt', '1050.txt', '1100.txt', '1150.txt', '1200.txt', '1250.txt', '1300.txt', '1350.txt', '1400.txt', '1450.txt', '1500.txt', '1550.txt', '1600.txt', '1650.txt', '1700.txt', '1750.txt', '1800.txt', '1850.txt', '1900.txt', '1950.txt', '2000.txt', '2050.txt', '2100.txt', '2150.txt', '2200.txt', '2250.txt', '2300.txt', '2350.txt', '2400.txt', '2450.txt', '2500.txt', '2550.txt', '2600.txt', '2650.txt', '2700.txt', '2750.txt', '2800.txt', '2850.txt', '2900.txt', '2950.txt', '3000.txt', '3050.txt', '3100.txt', '3150.txt', '3200.txt', '3250.txt', '3300.txt', '3350.txt', '3400.txt', '3450.txt', '3500.txt', '3550.txt', '3600.txt', '3650.txt', '3700.txt', '3750.txt', '3800.txt', '3850.txt', '3900.txt', '3950.txt', '4000.txt', '4050.txt', '4100.txt', '4150.txt', '4200.txt', '4

In [None]:
"""
Step 2: Attach the original answers to the paraphrased questions, using train_6050_QA.txt (original questions and answers) and merged_questions.txt (paraphrased questions)
"""

def read_questions_paraphrased(file_path):
    """Return the paraphrased questions stored one per line in ``file_path``.

    Each line is stripped of surrounding whitespace. Blank lines are skipped:
    the merged file gets an extra newline after every chunk, and an empty entry
    would shift every following question against its answer when the lists are
    zipped together.
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [question for question in stripped_lines if question]

def read_questions_and_answers(file_path):
    """Read a ``[Q]``/``[A]`` formatted file and return ``(questions, answers)``.

    Lines starting with ``[Q]`` go to the first list and lines starting with
    ``[A]`` to the second, each stripped and with its 4-character tag prefix
    ("[Q] " / "[A] ") removed. All other lines are ignored.
    """
    questions, answers = [], []
    with open(file_path, 'r') as file:
        for line in file.readlines():
            if line.startswith('[Q]'):
                questions.append(line.strip()[4:])
            elif line.startswith('[A]'):
                answers.append(line.strip()[4:])
    return questions, answers

def write_main_test_file(questions_paraphrased, questions, answers, output_file):
    """Write paraphrased questions paired with the original answers to ``output_file``.

    Args:
        questions_paraphrased: Paraphrased question strings.
        questions: Original question strings; used only to check alignment,
            they are not written to the file.
        answers: Answer strings, position-aligned with the questions.
        output_file: Destination path; overwritten if it exists.

    Records are written as ``[Q] <paraphrased>\\n[A] <answer>\\n\\n``. If the three
    inputs differ in length, only the common prefix is written (as before) and a
    ``UserWarning`` is emitted, because a length mismatch usually means questions
    and answers are no longer aligned.
    """
    import warnings  # local import so this cell does not depend on the import cell

    # Materialise so generators are supported and lengths can be compared.
    questions_paraphrased = list(questions_paraphrased)
    questions = list(questions)
    answers = list(answers)

    lengths = (len(questions_paraphrased), len(questions), len(answers))
    if len(set(lengths)) > 1:
        warnings.warn(
            "write_main_test_file: input lengths differ "
            f"(paraphrased={lengths[0]}, questions={lengths[1]}, answers={lengths[2]}); "
            f"only the first {min(lengths)} records are written and pairs may be misaligned."
        )

    with open(output_file, 'w') as file:
        for paraphrased, _question, answer in zip(questions_paraphrased, questions, answers):
            file.write(f'[Q] {paraphrased}\n[A] {answer}\n\n')

# Inputs and output of the merge step.
paraphrased_file = 'data/final/merged_questions.txt' #Q-Paraphrased #6050
qa_file = 'data/final/train_6050_QA.txt' #Q-Question_Actual.  A-Answer
output_file = 'data/final/main_test.txt' #Q-Paraphrased.  A-Answer

# Pair the i-th paraphrased question with the i-th original answer and write the result.
questions_paraphrased = read_questions_paraphrased(file_path=paraphrased_file)
questions, answers = read_questions_and_answers(file_path=qa_file)
write_main_test_file(
    questions_paraphrased=questions_paraphrased,
    questions=questions,
    answers=answers,
    output_file=output_file,
)
print(f"Generated {output_file} file")

Generated data/final/main_test.txt file


In [None]:
ls data/final

main_test.txt  merged_questions.txt  train_6050_QA.txt  train_complete_QA.txt


In [None]:
# Preview the start of the original-question training file.
# The whole file is read and then sliced; each line keeps its own '\n' and print() adds
# another one, which is why the output below is double-spaced.
with open("data/final/train_complete_QA.txt", 'r') as file:
  for line in file.readlines()[:5]: # top 5 lines
    print(line)

[Q] Is Disability Insurance Required By Law ?

[A] Not generally . There are five states that require most all employers carry short term disability insurance on their employees . These states are : California , Hawaii , New Jersey , New York , and Rhode Island . Besides this mandatory short term disability law , there is no other legislative imperative for someone to purchase or be covered by disability insurance .



[Q] Can Creditors Take Life Insurance After Death ?

[A] If the person who passed away was the one with the debt , creditors generally can not take the life insurance proceeds left as long as the beneficiary was a person . The money then belongs to that beneficiary , and as long as creditors do not have a claim against the beneficiary , they can not take life insurance proceeds from them .



In [None]:
# Preview the start of the paraphrased-question file: the [A] lines should match those of
# train_complete_QA.txt while the [Q] lines are the paraphrased wording.
# Each line keeps its own '\n' and print() adds another one, hence the double-spaced output.
with open("data/final/main_test.txt", 'r') as file:
  for line in file.readlines()[:5]: # top 5 lines
    print(line)

[Q] Is it mandatory by law to have disability insurance?

[A] Not generally . There are five states that require most all employers carry short term disability insurance on their employees . These states are : California , Hawaii , New Jersey , New York , and Rhode Island . Besides this mandatory short term disability law , there is no other legislative imperative for someone to purchase or be covered by disability insurance .



[Q] Can life insurance be seized by creditors after someone dies?

[A] If the person who passed away was the one with the debt , creditors generally can not take the life insurance proceeds left as long as the beneficiary was a person . The money then belongs to that beneficiary , and as long as creditors do not have a claim against the beneficiary , they can not take life insurance proceeds from them .



In [None]:
ls data/final

main_test.txt  merged_questions.txt  train_6050_QA.txt  train_complete_QA.txt


In [None]:
def read_main_test_file(file_path):
    """Return all lines of ``file_path`` as a list, line endings included."""
    with open(file_path, 'r') as handle:
        return handle.readlines()

def write_file(file_path, lines):
    """Write the strings in ``lines`` to ``file_path`` back to back, replacing any existing content.

    No separators are inserted; each string must carry its own line ending.
    """
    with open(file_path, 'w') as handle:
        for chunk in lines:
            handle.write(chunk)

def generate_valid_and_test_files(main_test_lines, seed=None):
    """Split the ``[Q]``/``[A]`` records in ``main_test_lines`` into two random halves.

    Args:
        main_test_lines: Lines of a QA file with their line endings, e.g. the
            result of ``readlines()``. A record is a ``[Q]`` line followed by an
            ``[A]`` line; other lines are ignored.
        seed: Optional seed for a reproducible shuffle. With ``None`` (default)
            the module-level ``random`` state is used, as before.

    Returns:
        ``(validation_data, testing_data)``: two lists of equal length whose items
        are ``"[Q] ...\\n[A] ...\\n\\n"`` strings. With an odd number of records the
        single leftover record is dropped so both halves have the same size.
    """
    # Extract question-answer pairs
    qa_pairs = []
    pending_question = None
    for line in main_test_lines:
        if line.startswith("[Q]"):
            pending_question = line
        elif line.startswith("[A]") and pending_question is not None:
            qa_pairs.append(pending_question + line + '\n')
            # Reset so a stray extra "[A]" line is skipped instead of being glued
            # onto the record that was just completed.
            pending_question = None

    # Shuffle the pairs (a private generator when seeded, so global state is untouched)
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(qa_pairs)

    # 50% each for validation and testing
    split_index = len(qa_pairs) // 2
    validation_data = qa_pairs[:split_index]
    testing_data = qa_pairs[split_index:split_index*2]

    return validation_data, testing_data

# Function to count the number of questions and answers in a file
def count_qa_in_file(file_path):
    """Count the question and answer lines in a ``[Q]``/``[A]`` formatted file.

    A line counts as a question when it starts with ``[Q]`` and as an answer when
    it starts with ``[A]``, the same rule the other parsers in this notebook use.
    Matching the tag anywhere in the line would count an answer that merely
    mentions "[Q]" as a question.

    Returns:
        ``(question_count, answer_count)``.
    """
    question_count = 0
    answer_count = 0
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("[Q]"):
                question_count += 1
            elif line.startswith("[A]"):
                answer_count += 1
    return question_count, answer_count

def main():
    """Create the validation and test QA files from ``main_test.txt`` and print a report.

    Reads ``data/final/main_test.txt``, splits its QA records with
    ``generate_valid_and_test_files``, writes the two halves to
    ``data/final/valid_complete_QA.txt`` and ``data/final/test_complete_QA.txt``,
    then prints the first five lines of each file followed by their
    question/answer counts.
    """
    import os

    # File paths
    main_test_file = "data/final/main_test.txt"
    valid_file = "data/final/valid_complete_QA.txt"
    test_file = "data/final/test_complete_QA.txt"

    # Read the main test file and split it into the two halves
    validation_data, testing_data = generate_valid_and_test_files(
        read_main_test_file(main_test_file)
    )

    # Make sure both target files exist, then write the generated data
    for target in (valid_file, test_file):
        if not os.path.exists(target):
            open(target, 'x').close()
    write_file(valid_file, validation_data)
    write_file(test_file, testing_data)

    # Two records span five physical lines: [Q], [A], blank, [Q], [A]
    print("First 2 QA of Validation data\n")
    with open(valid_file, 'r') as handle:
        for text in handle.readlines()[:5]:
            print(text)

    print("\n\nFirst 2 QA of Test data\n")
    with open(test_file, 'r') as handle:
        for text in handle.readlines()[:5]:
            print(text)

    # Count questions and answers in both files before reporting
    valid_q_count, valid_a_count = count_qa_in_file(valid_file)
    test_q_count, test_a_count = count_qa_in_file(test_file)

    print(f"\nValidation File: {valid_file}")
    print(f"Number of Questions: {valid_q_count}")
    print(f"Number of Answers: {valid_a_count}")

    print(f"\nTest File: {test_file}")
    print(f"Number of Questions: {test_q_count}")
    print(f"Number of Answers: {test_a_count}")


# Build the validation/test files and print the summary.
# NOTE(review): the split relies on random.shuffle and no seed is set in this cell, so every run can yield a different split.
main()

First 2 QA of Validation data

[Q] When is it possible to switch auto insurance providers?

[A] That is a great question ! Your life insurance policy through your work place is a group life term policy . It is a group policy because the same conditions and terms generally apply to all members of your workforce , and it is a term policy because it ends one year after enrolling - which is why there is an enrollment period each fall that you re-up the coverage or change it . I hope that helps , thanks for asking !



[Q] Are ED medications covered by health insurance?

[A] If your health insurance plan has it 's pharmacy benefits integrated then it probably will . The type of plan and the scope of it 's coverage will dictate how exactly such a medication is handled . If one has a `` rich '' level of coverage then one should have no problems what-so-ever , however with plans that offer `` thinner '' coverage and benefit options one may have to have their Dr. contact the insurance company t

In [None]:
# Record the installed versions of the core ML packages (transformers, accelerate, torch, tensorflow).
!pip list | grep -E 'transformers|accelerate|torch|tensorflow'

accelerate                       0.30.0
tensorflow                       2.15.0
tensorflow-datasets              4.9.4
tensorflow-estimator             2.15.0
tensorflow-gcs-config            2.15.0
tensorflow-hub                   0.16.1
tensorflow-io-gcs-filesystem     0.37.0
tensorflow-metadata              1.15.0
tensorflow-probability           0.23.0
torch                            2.2.1+cu121
torchaudio                       2.2.1+cu121
torchdata                        0.7.1
torchsummary                     1.5.1
torchtext                        0.17.1
torchvision                      0.17.1+cu121
transformers                     4.38.2
