In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

def parse_xml_files(folder_path):
    qapairs = []
    for xml_root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.xml'):
                file_path = os.path.join(xml_root, file)
                print('Parsing file:', file_path)
                tree = ET.parse(file_path)
                root = tree.getroot()
                # do something with the XML data here
                for qapair in root.findall('QAPairs/QAPair'):
                    pid = qapair.attrib['pid']
                    question = qapair.find('Question').text
                    qid = qapair.find('Question').attrib['qid']
                    qtype = qapair.find('Question').attrib['qtype']
                    answer = qapair.find('Answer').text
                    qapairs.append({'pid': pid, 'question': question, 'qid': qid, 'qtype': qtype, 'answer': answer})
    df = pd.DataFrame(qapairs, columns=['pid', 'question', 'qid', 'qtype', 'answer'])
    df.to_csv('parsed_data.csv', index=False)


In [2]:
parse_xml_files('/content/drive/MyDrive/4090_AI_Biomedical/QA/')

Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000033.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000043.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000044.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000119.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000205.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000093.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000136.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000098.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000171.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000208.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000127.xml
Parsing file: /content/drive/MyDrive/4090_AI_Biomedical/QA/5_NIDDK_QA/0000060.xml
Parsing file: /c

In [3]:
data = pd.read_csv('parsed_data.csv')

In [4]:
data

Unnamed: 0,pid,question,qid,qtype,answer
0,1,What is (are) Diabetic Kidney Disease ?,0000033-1,information,"Diabetic kidney disease, also called diabetic ..."
1,2,What is (are) Diabetic Kidney Disease ?,0000033-2,information,"The kidneys are two bean-shaped organs, each a..."
2,3,What is (are) Diabetic Kidney Disease ?,0000033-3,information,Diabetes is a complex group of diseases with a...
3,6,What are the symptoms of Diabetic Kidney Disea...,0000033-6,symptoms,People with diabetic kidney disease do not hav...
4,7,How to diagnose Diabetic Kidney Disease ?,0000033-7,exams and tests,A health care provider diagnoses diabetic kidn...
...,...,...,...,...,...
11013,6,What is the outlook for Male Breast Cancer ?,0000027_2-6,outlook,Survival for men with breast cancer is similar...
11014,7,What are the stages of Male Breast Cancer ?,0000027_2-7,stages,Key Points\n - After breast...
11015,8,What are the treatments for Male Breast Cancer ?,0000027_2-8,treatment,Key Points\n - There are di...
11016,1,What is (are) Endometrial Cancer ?,0000014_4-1,information,Key Points\n - Endometrial ...


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# Define a list of stop words to remove
stop_words = stopwords.words('english')

# Initialize a stemmer to reduce words to their base form
stemmer = PorterStemmer()

# Preprocess a piece of text
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Tokenize the text into individual words
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words from the text
    # tokens = [word for word in tokens if not word in stop_words]
    
    # Stem each word in the text
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join the tokens back into a string
    text = ' '.join(tokens)
    
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
import pandas as pd

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('parsed_data.csv')

# Apply the preprocess_text() function to the 'question' and 'answer' columns
df['question'] = df['question'].apply(preprocess_text)
df['answer'] = df['answer'].apply(preprocess_text)

# Save the preprocessed data back to a CSV file
df.to_csv('preprocessed_data.csv', index=False)

In [8]:
newData = pd.read_csv('preprocessed_data.csv')

In [9]:
newData

Unnamed: 0,pid,question,qid,qtype,answer
0,1,what is ( are ) diabet kidney diseas ?,0000033-1,information,"diabet kidney diseas , also call diabet nephro..."
1,2,what is ( are ) diabet kidney diseas ?,0000033-2,information,"the kidney are two bean-shap organ , each abou..."
2,3,what is ( are ) diabet kidney diseas ?,0000033-3,information,diabet is a complex group of diseas with a var...
3,6,what are the symptom of diabet kidney diseas ?,0000033-6,symptoms,peopl with diabet kidney diseas do not have sy...
4,7,how to diagnos diabet kidney diseas ?,0000033-7,exams and tests,a health care provid diagnos diabet kidney dis...
...,...,...,...,...,...
11013,6,what is the outlook for male breast cancer ?,0000027_2-6,outlook,surviv for men with breast cancer is similar t...
11014,7,what are the stage of male breast cancer ?,0000027_2-7,stages,key point - after breast cancer ha been diagno...
11015,8,what are the treatment for male breast cancer ?,0000027_2-8,treatment,key point - there are differ type of treatment...
11016,1,what is ( are ) endometri cancer ?,0000014_4-1,information,key point - endometri cancer is a diseas in wh...


In [10]:
import numpy as np

In [11]:
# Split the data into input and target sequences
input_seqs = np.array(data['question'])
target_seqs = np.array(data['answer'])

# Split the data into training, validation, and test sets
train_input = input_seqs[:8000]
train_target = target_seqs[:8000]
val_input = input_seqs[8000:9000]
val_target = target_seqs[8000:9000]
test_input = input_seqs[9000:]
test_target = target_seqs[9000:]

In [14]:
import pandas as pd
import json

# Load the preprocessed data into a pandas DataFrame
df = pd.read_csv('preprocessed_data.csv')

# Create a list of JSON objects in SQuAD format
squad_data = []
for i, row in df.iterrows():
    context = row['qtype']
    question = row['question']
    answer = row['answer']
    squad_data.append({
        'paragraphs': [{
            'context': context,
            'qas': [{
                'question': question,
                'id': str(i),
                'answers': [{
                    'text': answer,
                    'answer_start': context.find(answer)
                }]
            }]
        }]
    })

# Save the list of JSON objects as a JSON file
with open('squad_data.json', 'w') as f:
    json.dump({'data': squad_data}, f)

In [16]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the BERT-base tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Load the BERT-base model for question answering
model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')

# Load the SQuAD-formatted training data
with open('squad_data.json', 'r', encoding='utf-8') as f:
    squad_data = json.load(f)

# Extract the paragraphs list from the SQuAD data
paragraphs = [q['context'] for p in squad_data['data'] for q in p['paragraphs']]

# Tokenize the input data
tokenized_data = tokenizer(paragraphs, padding='longest', truncation=True, max_length=512, return_tensors='pt')

# Get the input IDs, attention masks, and token type IDs from the tokenized data
input_ids = tokenized_data['input_ids']
attention_masks = tokenized_data['attention_mask']
token_type_ids = tokenized_data['token_type_ids']


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [18]:
input_ids.shape, token_type_ids.shape

(torch.Size([11018, 6]), torch.Size([11018, 6]))

In [19]:
import torch.nn as nn

# Define the loss function
loss_fn = nn.CrossEntropyLoss()

start_positions = []
end_positions = []

for data in squad_data['data']:
    for paragraph in data.get('paragraphs', []):
        for qas in paragraph.get('qas', []):
            for answer in qas.get('answers', []):
                start_positions.append(answer['answer_start'])
                end_positions.append(answer['answer_start'] + len(answer['text']))

start_positions = torch.tensor(start_positions, dtype=torch.long)
end_positions = torch.tensor(end_positions, dtype=torch.long)


In [20]:
start_positions, end_positions

(tensor([-1, -1, -1,  ..., -1, -1, -1]),
 tensor([ 589, 1130, 1034,  ..., 7545, 1408,  804]))

In [21]:
# Create the input features
input_features = []
for i in range(len(input_ids)):
    input_feature = {}
    input_feature['input_ids'] = input_ids[i]
    input_feature['attention_mask'] = attention_masks[i]
    input_feature['token_type_ids'] = tokenizer.create_token_type_ids_from_sequences(input_ids[i].tolist())
    input_feature['start_positions'] = start_positions[i]
    input_feature['end_positions'] = end_positions[i]
    input_features.append(input_feature)

In [22]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW

# Convert the input features to PyTorch tensors
input_ids = torch.stack([f['input_ids'] for f in input_features])
attention_masks = torch.stack([f['attention_mask'] for f in input_features])
start_positions = torch.tensor([f['start_positions'] for f in input_features])
end_positions = torch.tensor([f['end_positions'] for f in input_features])

# Set the token type IDs to a fixed value based on the input sequence's position in the batch
token_type_ids = torch.zeros_like(input_ids)
for i in range(input_ids.shape[0]):
    for j in range(input_ids.shape[1]):
        token_type_ids[i][j] = j

In [23]:
input_ids.shape, token_type_ids.shape

(torch.Size([11018, 6]), torch.Size([11018, 6]))

In [24]:
import torch
torch.cuda.init()

In [25]:
torch.cuda.is_available()

True

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [27]:
# Create a TensorDataset from the input tensors
dataset = TensorDataset(input_ids, attention_masks, token_type_ids, start_positions, end_positions)

# Define the batch size
batch_size = 16

# Create a DataLoader to load the input data in batches
dataloader = DataLoader(dataset, batch_size=batch_size)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Set the model to training mode and move to device
model.to(device)
model.train()

# Define the number of training epochs
num_epochs = 10



In [28]:
import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [29]:
import random
import numpy as np
import torch

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [30]:
# Loop over the epochs
for epoch in range(num_epochs):
    # Loop over the batches in the dataloader
    for batch in dataloader:
        # Move the batch to device
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, token_type_ids, start_positions, end_positions = batch

        # Clear the gradients
        optimizer.zero_grad()

        # Forward pass
        start_logits, end_logits = model(input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)

        # Compute the loss
        loss = loss_fn(start_logits, start_positions) + loss_fn(end_logits, end_positions)

        # Backward pass
        loss.backward()

        # Clip the gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update the parameters
        optimizer.step()

RuntimeError: ignored