In [None]:
import pandas as pd
import json

# Read the three source workbooks into DataFrames.
faq_df = pd.read_excel("Questions_Dataset.xlsx")
symptom_df = pd.read_excel("Symptons-Disease-Treatment.xlsx")
therapy_df = pd.read_excel("Therapy_Chat.xlsx")

# Flatten every sheet into {"source", "input", "output"} records.
# A row is skipped as soon as one of the columns it needs is missing.

# FAQ: question -> answer.
faq_data = []
for _, row in faq_df.iterrows():
    if pd.isnull(row["input"]) or pd.isnull(row["output"]):
        continue
    faq_data.append({"source": "faq", "input": row["input"], "output": row["output"]})

# Symptoms: "<symptom> may indicate <disease>" -> treatment.
symptom_data = []
for _, row in symptom_df.iterrows():
    if pd.isnull(row["Symptom"]) or pd.isnull(row["Disease"]) or pd.isnull(row["Treatment"]):
        continue
    symptom_data.append(
        {
            "source": "symptom",
            "input": f"{row['Symptom']} may indicate {row['Disease']}",
            "output": row["Treatment"],
        }
    )

# Therapy chat: user message -> bot reply.
therapy_data = []
for _, row in therapy_df.iterrows():
    if pd.isnull(row["user_message"]) or pd.isnull(row["bot_message"]):
        continue
    therapy_data.append(
        {"source": "therapy", "input": row["user_message"], "output": row["bot_message"]}
    )

# Save as JSONL
def save_jsonl(filename, data):
    """Write an iterable of JSON-serialisable items as a JSON Lines file.

    Args:
        filename: Destination path; the file is created or truncated.
        data: Iterable of items; each one becomes a single line of JSON.

    The file is encoded as UTF-8 and non-ASCII characters are written
    verbatim (no \\uXXXX escaping).
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

# Write each in-memory dataset to its own JSONL file, in a fixed order.
for jsonl_name, records in (
    ("faq_data.jsonl", faq_data),
    ("symptom_data.jsonl", symptom_data),
    ("therapy_data.jsonl", therapy_data),
):
    save_jsonl(jsonl_name, records)

print("✅ All 3 datasets saved as JSONL!")

✅ All 3 datasets saved as JSONL!


In [None]:
from sentence_transformers import SentenceTransformer
import json
import pickle

# Sentence-embedding model shared by every function in this cell.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Function to load JSONL file
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to ``json.loads``, which would raise.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Function to generate embeddings and save
def generate_embeddings(jsonl_path, output_pkl_path):
    """Embed the "input" field of every record in a JSONL file and pickle the result.

    Args:
        jsonl_path: JSONL file to read (via ``load_jsonl``); every record
            must have an "input" key.
        output_pkl_path: Destination pickle path; created or overwritten.

    The pickle holds a dict with two keys: "entries" (the records exactly as
    loaded) and "embeddings" (whatever the module-level ``model.encode``
    returned for the list of inputs). A confirmation line is printed at the end.
    """
    records = load_jsonl(jsonl_path)
    inputs = [record["input"] for record in records]
    vectors = model.encode(inputs, show_progress_bar=True)

    payload = {"entries": records, "embeddings": vectors}
    with open(output_pkl_path, "wb") as out_file:
        pickle.dump(payload, out_file)

    print(f"✅ Embeddings saved to: {output_pkl_path}")

# Embed the three datasets one after another: <name>_data.jsonl -> <name>_embeddings.pkl
for dataset_name in ("faq", "symptom", "therapy"):
    generate_embeddings(f"{dataset_name}_data.jsonl", f"{dataset_name}_embeddings.pkl")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/980 [00:00<?, ?it/s]

✅ Embeddings saved to: faq_embeddings.pkl


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

✅ Embeddings saved to: symptom_embeddings.pkl


Batches:   0%|          | 0/25415 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
!pip install -U sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
import pickle
import json

# Sentence-embedding model used to encode the therapy inputs below.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to ``json.loads``, which would raise.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

therapy_data = load_jsonl("therapy_data.jsonl")
batch_size = 5000

# Encode the dataset slice by slice and pickle each slice separately, so an
# interrupted run keeps every part that was already written.
for part_num, start in enumerate(range(0, len(therapy_data), batch_size)):
    batch = therapy_data[start:start + batch_size]
    texts = [record["input"] for record in batch]
    embeddings = model.encode(texts, show_progress_bar=True)

    part_file = f"therapy_embeddings_part{part_num}.pkl"
    with open(part_file, "wb") as f:
        pickle.dump({"entries": batch, "embeddings": embeddings}, f)

    print(f"✅ Saved {part_file}")

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part0.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part1.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part2.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part3.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part4.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part5.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part6.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part7.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part8.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part9.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part10.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part11.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part12.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part13.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part14.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part15.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part16.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part17.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part18.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part19.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part20.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part21.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part22.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part23.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part24.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part25.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part26.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part27.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part28.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part29.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part30.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part31.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part32.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part33.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part34.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part35.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part36.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part37.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part38.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part39.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part40.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part41.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part42.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part43.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part44.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part45.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part46.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part47.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part48.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part49.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part50.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part51.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part52.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part53.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part54.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part55.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part56.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part57.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part58.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part59.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part60.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part61.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part62.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part63.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part64.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part65.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part66.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part67.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part68.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part69.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part70.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part71.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part72.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part73.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part74.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part75.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part76.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part77.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part78.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part79.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved therapy_embeddings_part80.pkl


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem at /content/drive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Create the backup folder under MyDrive (no error if it already exists) and
# copy every therapy_embeddings_part*.pkl from the current working directory
# into it. The /content/drive paths assume Drive has been mounted there.
!mkdir -p /content/drive/MyDrive/therapy_embeddings_chunks/
!cp therapy_embeddings_part*.pkl /content/drive/MyDrive/therapy_embeddings_chunks/

In [None]:
import os
import json
import pickle
from sentence_transformers import SentenceTransformer
from google.colab import drive

import shutil

# 🔐 Step 1: Mount Google Drive
drive.mount('/content/drive')

# 📁 Step 2: Setup Drive backup folder
drive_path = "/content/drive/MyDrive/therapy_embeddings_chunks/"
os.makedirs(drive_path, exist_ok=True)

# 🧠 Step 3: Load sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# 📄 Step 4: Load therapy data
# Decode explicitly as UTF-8: the file holds unescaped non-ASCII text, so
# relying on the platform default encoding could corrupt it or fail to decode.
with open("therapy_data.jsonl", "r", encoding="utf-8") as f:
    therapy_data = [json.loads(line) for line in f]

# 🔢 Step 5: Resume from part 80
batch_size = 5000
resume_index = 80 * batch_size  # => 400000

# 🔁 Step 6: Encode from part 80 onward, backing up every chunk to Drive
for i in range(resume_index, len(therapy_data), batch_size):
    batch_data = therapy_data[i:i + batch_size]
    inputs = [item["input"] for item in batch_data]
    embeddings = model.encode(inputs, show_progress_bar=True)

    # Keep the records exactly as loaded (including any extra keys such as
    # "source") so these chunks have the same entry layout as chunks that
    # store the loaded records unchanged.
    save_dict = {
        "entries": batch_data,
        "embeddings": embeddings
    }

    part_num = i // batch_size
    filename = f"therapy_embeddings_part{part_num}.pkl"

    # Save locally
    with open(filename, "wb") as f:
        pickle.dump(save_dict, f)

    # Copy to Drive immediately. shutil.copy raises if the copy fails, so the
    # success message below is only printed when the backup really exists.
    shutil.copy(filename, drive_path)

    print(f"✅ Chunk {part_num} complete — backed up to Drive ✅")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 80 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 81 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 82 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 83 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 84 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 85 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 86 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 87 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 88 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 89 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 90 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 91 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 92 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 93 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 94 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 95 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 96 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 97 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 98 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 99 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 100 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 101 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 102 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 103 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 104 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 105 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 106 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 107 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 108 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 109 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 110 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 111 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 112 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 113 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 114 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 115 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 116 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 117 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 118 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 119 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 120 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 121 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 122 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 123 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 124 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 125 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 126 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 127 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 128 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 129 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 130 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 131 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 132 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 133 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 134 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 135 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 136 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 137 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 138 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 139 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 140 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 141 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 142 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 143 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 144 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 145 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 146 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 147 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 148 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 149 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 150 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 151 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 152 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 153 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 154 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 155 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 156 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 157 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 158 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 159 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 160 complete — backed up to Drive ✅


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Chunk 161 complete — backed up to Drive ✅


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

✅ Chunk 162 complete — backed up to Drive ✅


In [None]:
import os
import pickle
from google.colab import drive

# 🔐 Step 1: Mount Google Drive
drive.mount('/content/drive')

# 📁 Step 2: Directory where the chunk files are stored, and where the merged file goes
chunk_dir = "/content/drive/MyDrive/therapy_embeddings_chunks/"
output_path = "/content/drive/MyDrive/final_merged_therapy_data.pkl"  # ✅ Custom name


def _chunk_number(file_name):
    """Return the integer between "part" and the first following "." in a chunk file name."""
    return int(file_name.split("part")[1].split(".")[0])


# 📦 Step 3: Load and merge all chunk files, in numeric part order
all_entries = []
all_embeddings = []

chunk_files = sorted(
    (
        name
        for name in os.listdir(chunk_dir)
        if name.startswith("therapy_embeddings_part") and name.endswith(".pkl")
    ),
    key=_chunk_number,
)

print(f"📂 Total files to merge: {len(chunk_files)}")

for chunk_name in chunk_files:
    chunk_path = os.path.join(chunk_dir, chunk_name)
    with open(chunk_path, "rb") as f:
        data = pickle.load(f)
    # Entries and embedding rows are appended in the same order, chunk by chunk.
    all_entries.extend(data["entries"])
    all_embeddings.extend(data["embeddings"])
    print(f"✅ Merged {chunk_name}")

# 🧩 Step 4: Save final merged file
final_data = {"entries": all_entries, "embeddings": all_embeddings}
with open(output_path, "wb") as f:
    pickle.dump(final_data, f)

print("\n✅✅ Merged file saved to Drive as: final_merged_therapy_data.pkl")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📂 Total files to merge: 163
✅ Merged therapy_embeddings_part0.pkl
✅ Merged therapy_embeddings_part1.pkl
✅ Merged therapy_embeddings_part2.pkl
✅ Merged therapy_embeddings_part3.pkl
✅ Merged therapy_embeddings_part4.pkl
✅ Merged therapy_embeddings_part5.pkl
✅ Merged therapy_embeddings_part6.pkl
✅ Merged therapy_embeddings_part7.pkl
✅ Merged therapy_embeddings_part8.pkl
✅ Merged therapy_embeddings_part9.pkl
✅ Merged therapy_embeddings_part10.pkl
✅ Merged therapy_embeddings_part11.pkl
✅ Merged therapy_embeddings_part12.pkl
✅ Merged therapy_embeddings_part13.pkl
✅ Merged therapy_embeddings_part14.pkl
✅ Merged therapy_embeddings_part15.pkl
✅ Merged therapy_embeddings_part16.pkl
✅ Merged therapy_embeddings_part17.pkl
✅ Merged therapy_embeddings_part18.pkl
✅ Merged therapy_embeddings_part19.pkl
✅ Merged therapy_embeddings_part20.pkl
✅ Merged therapy_embeddings_part21

In [None]:
!pip install faiss-gpu
!pip install faiss-cpu

In [None]:
import faiss
import pickle
import numpy as np
from google.colab import drive

# 🔐 Step 1: Mount Google Drive
drive.mount('/content/drive')

# Input and output locations on Drive.
merged_path = "/content/drive/MyDrive/final_merged_therapy_data.pkl"
index_path = "/content/drive/MyDrive/therapy_faiss_index.index"
entry_map_path = "/content/drive/MyDrive/therapy_entry_map.pkl"

# 📂 Step 2: Load merged embedding file
with open(merged_path, "rb") as f:
    data = pickle.load(f)

entries = data["entries"]
# FAISS works on float32 arrays, so convert the stored embeddings up front.
embeddings = np.array(data["embeddings"]).astype("float32")

print(f"📦 Total entries: {len(entries)}")
print(f"📐 Embedding shape: {embeddings.shape}")

# 🔍 Step 3: L2-normalise the rows in place so that inner product == cosine similarity
faiss.normalize_L2(embeddings)

# ⚙️ Step 4: Build a flat inner-product index over the normalised vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(f"✅ FAISS index created with {index.ntotal} vectors")

# 💾 Step 5: Persist the index, then the entries list
faiss.write_index(index, index_path)

with open(entry_map_path, "wb") as f:
    pickle.dump(entries, f)

print("✅ FAISS index and entry map saved to Drive")

ModuleNotFoundError: No module named 'faiss'

In [None]:
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer

# Step 1: Load Excel Files (change filenames if needed)
faq_df = pd.read_excel("/content/Questions_Dataset.xlsx")  # Make sure this matches your filename
symptom_df = pd.read_excel("/content/Symptons-Disease-Treatment.xlsx")  # Make sure this matches your filename

# Step 2: Load Embedding Model
model = SentenceTransformer("all-MiniLM-L6-v2")

# ================== FAQ Dataset Processing ==================
# Drop incomplete rows before converting to str: .astype(str) turns a missing
# cell into the literal text "nan", which would then be embedded and stored
# as if it were a real question or answer.
faq_clean = faq_df.dropna(subset=["input", "output"])
faq_questions = faq_clean["input"].astype(str).tolist()
faq_answers = faq_clean["output"].astype(str).tolist()
faq_embeddings = model.encode(faq_questions)

# Parallel lists: questions[i], answers[i] and embeddings[i] belong together.
faq_data = {
    "questions": faq_questions,
    "answers": faq_answers,
    "embeddings": faq_embeddings
}

# Overwrites any existing faq_embeddings.pkl in the working directory.
with open("faq_embeddings.pkl", "wb") as f:
    pickle.dump(faq_data, f)

print("✅ faq_embeddings.pkl created")

# ================== Symptom Dataset Processing ==================
# Same rule as above: a row is only usable when all three columns are present.
symptom_clean = symptom_df.dropna(subset=["Symptom", "Disease", "Treatment"])
symptom_queries = symptom_clean["Symptom"].astype(str).tolist()
symptom_diseases = symptom_clean["Disease"].astype(str).tolist()
symptom_treatments = symptom_clean["Treatment"].astype(str).tolist()
symptom_embeddings = model.encode(symptom_queries)

# Parallel lists keyed by position; only the symptom text is embedded.
symptom_data = {
    "queries": symptom_queries,
    "diseases": symptom_diseases,
    "treatments": symptom_treatments,
    "embeddings": symptom_embeddings
}

# Overwrites any existing symptom_embeddings.pkl in the working directory.
with open("symptom_embeddings.pkl", "wb") as f:
    pickle.dump(symptom_data, f)

print("✅ symptom_embeddings.pkl created")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ faq_embeddings.pkl created
✅ symptom_embeddings.pkl created
