In [2]:
pip install langchain-huggingface -U

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1


In [3]:

!pip install -q langchain langchain-community langchain-google-genai
!pip install faiss-cpu
!pip install gensim
!pip install -q langchain langchain_community langchain_google_genai redis sentence-transformers openai tiktoken langchain-huggingface

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/2.5 MB[0m [31m24.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m45.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resol

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/363.4 MB[0m [31m132.2 MB/s[0m eta [36m0:00:03[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline

DATA_PATH = "/content/drive/MyDrive/clinicalTrials.csv"   # Set your dataset path
FAISS_INDEX_PATH = "vectorstore"


In [3]:
def create_vectorstore():
    print("Loading and prepping clinical trials dataset...")
    df = pd.read_csv(DATA_PATH)
    text_fields = ['Study Title', 'Conditions', 'Interventions', 'Brief Summary']
    df['combined_text'] = df[text_fields].astype(str).agg(' '.join, axis=1)
    loader = DataFrameLoader(df, page_content_column="combined_text")
    documents = loader.load()
    print(f"Loaded {len(documents)} documents.")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("Building FAISS vectorstore...")
    vectorstore = FAISS.from_documents(documents, embedding_model)
    vectorstore.save_local(FAISS_INDEX_PATH)
    print(f"Vectorstore saved at '{FAISS_INDEX_PATH}'.")


In [4]:
def load_retriever():
    print("Loading retriever from FAISS index...")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embedding_model, allow_dangerous_deserialization=True)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
    print("Retriever loaded.")
    return retriever

def load_dataframe():
    df = pd.read_csv(DATA_PATH)
    return df.fillna("")


In [5]:
def get_trial_metadata(page_content, df):
    for idx, row in df.iterrows():
        concat = f"{row.get('Study Title','')} {row.get('Conditions','')} {row.get('Interventions','')} {row.get('Brief Summary','')}".strip()
        if page_content.strip()[:100] in concat[:200]:
            return {
                "Study Title": row.get('Study Title', ''),
                "NCT Number": row.get('NCT Number', ''),
                "Study Design": row.get('Study Design', ''),
                "Interventions": row.get('Interventions', ''),
                "Brief Summary": row.get('Brief Summary', ''),
            }
    return {}


In [None]:
# Initialize your text generation pipeline ONCE
pipe = pipeline("text-generation", model="distilgpt2")

def main():
    if not os.path.exists(os.path.join(FAISS_INDEX_PATH, "index.faiss")):
        print("FAISS index not found; creating vectorstore...")
        create_vectorstore()

    retriever = load_retriever()
    df = load_dataframe()

    print("\nClinical Trials Text Generator is ready! Type 'exit' to quit.\n")
    while True:
        query = input("You: ").strip()
        if query.lower() == "exit":
            print("Goodbye!")
            break

        docs = retriever.get_relevant_documents(query)
        if not docs:
            print("No matching trials found. Try rephrasing your query.\n")
            continue

        best_doc = docs[0]
        meta = get_trial_metadata(best_doc.page_content, df)
        if not meta:
            print("Matching trial found but details extraction failed.\n")
            continue

        # Build the prompt with context and user question
        prompt = (
            f"User question: {query}\n\n"
            f"Using this clinical trial info:\n"
            f"Title: {meta['Study Title']}\n"
            f"Study Design: {meta['Study Design']}\n"
            f"Interventions: {meta['Interventions']}\n"
            f"Brief Summary: {meta['Brief Summary']}\n\n"
            "Provide a summary of the trial procedure relevant to the question."
        )
        output = pipe(prompt, max_new_tokens=256)
        generated_text = output[0]['generated_text']

        print(f"\nTop matching trial: {meta['Study Title']} (NCT: {meta['NCT Number']})")
        print("Generated summary:")
        print(generated_text, "\n")

main()


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


FAISS index not found; creating vectorstore...
Loading and prepping clinical trials dataset...
Loaded 117980 documents.


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Building FAISS vectorstore...


  return forward_call(*args, **kwargs)


Vectorstore saved at 'vectorstore'.
Loading retriever from FAISS index...
Retriever loaded.

Clinical Trials Text Generator is ready! Type 'exit' to quit.

You: type 2 diabetes 


  docs = retriever.get_relevant_documents(query)
  return forward_call(*args, **kwargs)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Top matching trial: A Study of the Current Medical Practice and Outcomes in the Treatment of Type 2 Diabetes Mellitus in an Office Setting (MK-0431-199) (NCT: NCT01409213)
Generated summary:
User question: type 2 diabetes

Using this clinical trial info:
Title: A Study of the Current Medical Practice and Outcomes in the Treatment of Type 2 Diabetes Mellitus in an Office Setting (MK-0431-199)
Study Design: Observational Model: |Time Perspective: p
Interventions: 
Brief Summary: The purpose of this study is to collect information of the risk profile of patients with type 2 diabetes mellitus, their treatment concerning meeting the guidelines for treatment of diabetic patients type 2 published by the Deutsche Diabetes Gesellschaft (DDG) on October 13, 2008

Provide a summary of the trial procedure relevant to the question.
Expect patients to be able to continue treatment if they have a history of diabetes.
The researchers will follow the results of the study.
Milder diabetes mellitus (DM)

  return forward_call(*args, **kwargs)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Top matching trial: A Study of Health Care Use and Costs in Participants With Early Stage Alzheimer's Disease (AD) (NCT: NCT02951598)
Generated summary:
User question: A 40 year old Alzheimer's Disease Patient

Using this clinical trial info:
Title: A Study of Health Care Use and Costs in Participants With Early Stage Alzheimer's Disease (AD)
Study Design: Observational Model: |Time Perspective: p
Interventions: DRUG: Florbetapir F 18 PET Scan
Brief Summary: The purpose of this study is to learn about health care use, costs, and clinical outcomes over time for amyloid positive participants with early stages of AD in the United States. This study is for research purposes only, and is not intended to treat any medical condition. No study therapy(ies) for AD will be administered.

Provide a summary of the trial procedure relevant to the question.
Author Contributions:

Tate was provided by the National Institute of Health and Human Services (NIH) and the University of Maryland Hospitals.

  return forward_call(*args, **kwargs)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Top matching trial: Efficacy and Safety of Sinutab on Subjects in the Setting of a Common Cold (NCT: NCT00378144)
Generated summary:
User question: Headache, cold, sinus

Using this clinical trial info:
Title: Efficacy and Safety of Sinutab on Subjects in the Setting of a Common Cold
Study Design: Allocation: RANDOMIZED|Intervention Model: PARALLEL|Masking: DOUBLE (PARTICIPANT, INVESTIGATOR)|Primary Purpose: TREATMENT
Interventions: DRUG: Pseudoephedrine/Paracetamol
Brief Summary: The purpose of this study is to investigate the efficacy and safety for a marketed sinus allergy product, Sinutab, in the treatment of nasal congestion and headache.

Provide a summary of the trial procedure relevant to the question.
Provide a summary of the study procedure relevant to the question.
Provide a summary of the trial procedure relevant to the question. The study. Phase 3 of the study to examine the efficacy of Sinutab.
Provide a summary of the study procedure relevant to the question. The study.