In [None]:
import pandas as pd
import re

def clean_answer(text):
    """Normalise a free-text answer, or return "" when it is unusable.

    Parameters
    ----------
    text : object
        The raw answer. Anything that is not a ``str`` (e.g. NaN from pandas)
        is treated as unusable.

    Returns
    -------
    str
        The cleaned answer: whitespace collapsed, sentences shorter than 10
        characters dropped, repeated sentences removed, and terminated with
        sentence punctuation. Returns "" when the input is not a string,
        looks like a bullet/numbered list, or has no sentence left after
        filtering.
    """
    if not isinstance(text, str):
        return ""

    # Reject answers that look like bullet/numbered lists.
    # Caveat: r"\d+\." matches ANY digit run followed by a period, so answers
    # containing e.g. "1990." or "3.5" are rejected as well.
    if re.search(r"(•|- |\* |\d+\.)", text):
        return ""

    seen = set()
    cleaned_sentences = []
    for raw_sentence in re.split(r'\.\s*', text):
        # Collapse whitespace BEFORE de-duplicating so that repeats which
        # differ only in spacing/newlines are recognised as the same sentence.
        sent = re.sub(r'\s+', ' ', raw_sentence).strip()
        if len(sent) < 10 or sent in seen:
            continue
        seen.add(sent)
        cleaned_sentences.append(sent)

    # Nothing survived the filters: report "unusable" instead of a lone ".".
    if not cleaned_sentences:
        return ""

    cleaned_text = '. '.join(cleaned_sentences)
    # A hyphen followed by a literal space was rejected above, so this only
    # fires for hyphens that were followed by other whitespace (newline/tab)
    # in the input and became "- " after whitespace collapsing.
    cleaned_text = cleaned_text.replace(" - ", " • ").replace("- ", "• ")

    # Do not stack a period on top of existing terminal punctuation ("?." / "!.").
    if not cleaned_text.endswith(('.', '!', '?')):
        cleaned_text += '.'
    return cleaned_text.strip()

def preprocess_medquad(file_path):
    """Load a Q&A CSV, keep one row per question, and clean every answer.

    Parameters
    ----------
    file_path : str or path-like
        CSV location passed straight to ``pandas.read_csv``; the file must
        contain ``question`` and ``answer`` columns.

    Returns
    -------
    pandas.DataFrame
        All input columns, with ``answer`` replaced by ``clean_answer``'s
        output, rows with an empty cleaned answer removed, and a fresh
        0..n-1 index.
    """
    deduped = (
        pd.read_csv(file_path)
        .drop_duplicates(subset=['question'], keep='first')  # first occurrence wins
        .reset_index(drop=True)
    )

    # Swap the raw answers for their cleaned form; the column keeps its position.
    deduped = deduped.assign(answer=deduped['answer'].apply(clean_answer))

    # clean_answer returns "" for answers it rejects, so filter those rows out.
    has_text = deduped['answer'].str.strip().ne("")
    return deduped.loc[has_text].reset_index(drop=True)

if __name__ == "__main__":
    # Driver: clean the raw Q&A CSV and save the result (without the index column).
    # NOTE(review): the output recorded for this cell names
    # "medquad_clean_unique.csv", and later cells read that file, while this
    # code writes "medquad_cleaned.csv" — confirm which file name is intended.
    input_file = "Dataset/medquad.csv"
    output_file = "medquad_cleaned.csv"
    
    cleaned_df = preprocess_medquad(input_file)
    cleaned_df.to_csv(output_file, index=False)
    
    print(f"Processed {len(cleaned_df)} unique Q&A pairs saved to {output_file}")


Processed 8822 unique Q&A pairs saved to medquad_clean_unique.csv


In [None]:
import json

def csv_to_jsonl_qa_only(input_csv, output_jsonl):
    """Export the question/answer columns of a CSV as a JSON Lines file.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file containing at least ``question`` and ``answer`` columns.
    output_jsonl : str or path-like
        Destination file; it is overwritten. One
        ``{"question": ..., "answer": ...}`` object is written per line, UTF-8
        encoded with non-ASCII characters kept as-is.

    Raises
    ------
    ValueError
        If either required column is missing from the CSV.

    Notes
    -----
    Rows with a missing question or answer are skipped. ``read_csv`` loads
    empty cells as NaN, and ``str(NaN)`` would otherwise put the literal text
    "nan" into the export.
    """
    df = pd.read_csv(input_csv)

    if 'question' not in df.columns or 'answer' not in df.columns:
        raise ValueError("The input CSV must have 'question' and 'answer' columns.")

    # Only the two Q&A columns are exported; drop incomplete pairs.
    qa_pairs = df[['question', 'answer']].dropna()

    with open(output_jsonl, 'w', encoding='utf-8') as f:
        for question, answer in zip(qa_pairs['question'], qa_pairs['answer']):
            qa_pair = {
                "question": str(question),
                "answer": str(answer)
            }
            f.write(json.dumps(qa_pair, ensure_ascii=False) + '\n')

if __name__ == "__main__":
    # Driver: convert the cleaned CSV into a question/answer-only JSONL file.
    input_csv = 'medquad_cleaned.csv'
    output_jsonl = 'medquad_cleaned.jsonl'
    
    csv_to_jsonl_qa_only(input_csv, output_jsonl)
    print(f"JSONL file with only question and answer saved as {output_jsonl}")


JSONL file with only question and answer saved as medquad_cleaned.jsonl


In [4]:
import json

def csv_to_jsonl_qa_only(input_csv, output_jsonl, max_rows=60):
    """Export a small question/answer sample of a CSV as JSON Lines.

    NOTE: this redefines the ``csv_to_jsonl_qa_only`` from the earlier cell;
    once this cell has run, the kernel uses this row-limited version.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file containing at least ``question`` and ``answer`` columns.
    output_jsonl : str or path-like
        Destination file; it is overwritten. One
        ``{"question": ..., "answer": ...}`` object is written per line.
    max_rows : int or None, default 60
        Only the first ``max_rows`` rows of the CSV are considered (a small
        sample for debugging). ``None`` exports every row.

    Raises
    ------
    ValueError
        If either required column is missing from the CSV.

    Notes
    -----
    Rows with a missing question or answer are skipped, so the file may hold
    fewer than ``max_rows`` lines. Without this, ``str(NaN)`` would write the
    literal text "nan".
    """
    df = pd.read_csv(input_csv)

    # Validate the schema before slicing so the error comes first.
    if 'question' not in df.columns or 'answer' not in df.columns:
        raise ValueError("The input CSV must have 'question' and 'answer' columns.")

    # Restrict to the leading rows, then drop incomplete pairs within them.
    df_small = df if max_rows is None else df.head(max_rows)
    qa_pairs = df_small[['question', 'answer']].dropna()

    with open(output_jsonl, 'w', encoding='utf-8') as f:
        for question, answer in zip(qa_pairs['question'], qa_pairs['answer']):
            qa_pair = {
                "question": str(question),
                "answer": str(answer)
            }
            f.write(json.dumps(qa_pair, ensure_ascii=False) + '\n')

if __name__ == "__main__":
    # Driver: write a small JSONL sample of the cleaned CSV. The
    # csv_to_jsonl_qa_only defined in this cell only exports the leading rows.
    input_csv = 'medquad_cleaned.csv'
    output_jsonl = 'medquad_cleaned_small.jsonl'
    
    csv_to_jsonl_qa_only(input_csv, output_jsonl)
    print(f"JSONL file with only question and answer saved as {output_jsonl}")

JSONL file with only question and answer saved as medquad_cleaned_small.jsonl


In [5]:
# Load the CSV and show the number of missing values in each column.
# NOTE(review): this reads 'medquad.csv' from the working directory, whereas
# the cleaning cell reads "Dataset/medquad.csv" — confirm the intended path.
dataset = pd.read_csv('medquad.csv')
dataset.isnull().sum()

question       0
answer         5
source         0
focus_area    14
dtype: int64

In [None]:
import pandas as pd

# NOTE(review): no earlier visible cell creates 'medquad_clean_unique.csv'
# (the cleaning cell saves 'medquad_cleaned.csv') — confirm where it comes from.
df = pd.read_csv('medquad_clean_unique.csv')
df.isnull().sum()  # Per-column count of null values in the DataFrame

question       0
answer         5
source         0
focus_area    13
dtype: int64

In [4]:
df = df.dropna(subset=['answer'])  # Drop only rows whose 'answer' is null; nulls in other columns are kept
df.to_csv('medquad_clean_unique.csv', index=False)  # Save the filtered DataFrame (overwrites this CSV, no index column)

In [None]:
import faiss
import numpy as np
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('medquad_clean_unique.csv')

# read_csv loads empty answer cells as NaN, which TfidfVectorizer rejects as a
# document. Drop those rows and renumber so that FAISS ids (0..n-1, assigned in
# insertion order) line up with df's row positions.
df = df.dropna(subset=['answer']).reset_index(drop=True)

# Corpus: one document per answer.
corpus = df['answer'].astype(str).tolist()

# Fit a TF-IDF bag-of-words model and vectorise the corpus. Producing float32
# directly (the dtype FAISS stores) avoids building a dense float64 matrix and
# then copying it again to float32.
# Queries must later be transformed with this same fitted `vectorizer`.
vectorizer = TfidfVectorizer(dtype=np.float32)
corpus_vec = vectorizer.fit_transform(corpus).toarray()

index = faiss.IndexFlatL2(corpus_vec.shape[1])  # exact L2 index; dimension = vocabulary size
index.add(corpus_vec)  # already a dense float32 matrix

# Save index
faiss.write_index(index, "medquad_faiss.index")

# Later: load index
index_loaded = faiss.read_index("medquad_faiss.index")


In [9]:
from datasets import Dataset
import json

# Parse the JSONL export: one JSON object per non-blank line.
# `data` keeps every parsed record untouched so it can still be inspected.
with open('medquad_qa_only.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# json.loads turns a bare NaN token into a float, so some answers are floats
# rather than strings. A Dataset column needs one consistent type, so build it
# only from records whose question and answer are both strings.
qa_records = [
    record for record in data
    if isinstance(record.get('question'), str) and isinstance(record.get('answer'), str)
]

dataset = Dataset.from_list(qa_records)

In [2]:
# Diagnostic: build a one-record Dataset from each entry and stop at the first
# one that raises, printing the offending row and the error.
# NOTE(review): each record is checked in isolation, so a value type that only
# conflicts *between* records may not surface here — confirm this is sufficient.
for i, record in enumerate(data):
    try:
        Dataset.from_list([record])
    except Exception as e:
        print(f"Error at row {i}: {record}")
        print(e)
        break


In [4]:
from collections import defaultdict

# For every key, collect the set of Python type names seen across all records;
# more than one name under a key means its values have inconsistent types.
types_per_key = defaultdict(set)
for record in data:
    for k, v in record.items():
        types_per_key[k].add(type(v).__name__)

print(types_per_key)


defaultdict(<class 'set'>, {'question': {'str'}, 'answer': {'str', 'float'}})
