In [1]:
from langchain.document_loaders import PyPDFLoader

def documents_loader(path: str):
    """
    Read the PDF at ``path`` and hand back everything the loader produced.

    Args:
        path (str): Location of the PDF file to read.
    Returns:
        list: The documents returned by ``PyPDFLoader(path).load()``.
    """
    # Construct the loader and load in a single expression; nothing else
    # needs the loader object afterwards.
    return PyPDFLoader(path).load()


In [2]:
# Load the encyclopedia PDF through documents_loader (defined in an earlier cell)
# and report how many documents came back; the recorded run printed 637.
documents = documents_loader("./Data/PDF_Files/the-gale-encyclopedia-of-medicine_compress.pdf")
print(len(documents))

637


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break documents into smaller, overlapping chunks.

    Args:
        documents (list): Documents to be split.
        chunk_size (int): Maximum size of a chunk, measured with ``len``.
        chunk_overlap (int): Amount of overlap between neighbouring chunks.
    Returns:
        list: The chunks produced by the splitter.
    """
    # Gather the splitter configuration in one place before constructing it.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [4]:
# Split the loaded documents into chunks of size 1000 with an overlap of 200
# and report the chunk count; the recorded run printed 3426.
chunks = split_documents(documents, chunk_size=1000, chunk_overlap=200)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 3426


In [12]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by later cells (it is read again when the embedding
# length is checked further down the notebook).
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build a FAISS vector store directly from the chunks using that embedding model.
db = FAISS.from_documents(chunks, embedding_model)

In [15]:
# Query the store built above. No k is passed to similarity_search here;
# the recorded run returned 4 documents.
query = "breast cancer"
docs = db.similarity_search(query)
print(f"Number of documents found: {len(docs)}")

Number of documents found: 4


In [37]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_huggingface import HuggingFaceEmbeddings
import uuid
import os


def store_documents_in_faiss(chunks, document_name, persist_dir='Data/vector_db',
                             model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Embed document chunks, index them in a FAISS vector store and persist it.

    Args:
        chunks (iterable): Document chunks to be stored. Any iterable is
            accepted; it is materialized into a list once because the chunks
            are walked twice (once to generate ids, once to add them).
        document_name (str): Name of the sub-directory (inside ``persist_dir``)
            the vector store is saved to.
        persist_dir (str): Parent directory for saved vector stores.
        model_name (str): Hugging Face model used to embed the chunks.
    Returns:
        FAISS: The populated vector store that was written to disk.
    Raises:
        ValueError: If ``chunks`` is ``None`` or contains no documents.
    """
    # Materialize first: a generator would otherwise be exhausted by the id
    # generation below and add_documents would receive nothing.
    chunks = list(chunks) if chunks is not None else []
    if not chunks:
        # Fail fast with a clear message instead of building/saving an empty index.
        raise ValueError("chunks must contain at least one document")

    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Embed a throwaway string once to discover the embedding dimension the
    # L2 index has to be created with.
    dimension = len(embedding_model.embed_query("hello world"))
    index = faiss.IndexFlatL2(dimension)

    vector_store = FAISS(
        embedding_function=embedding_model,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={}
    )

    # One random UUID per chunk serves as its docstore id.
    ids = [str(uuid.uuid4()) for _ in chunks]
    vector_store.add_documents(documents=chunks, ids=ids)

    vector_store.save_local(os.path.join(persist_dir, document_name))

    # The in-memory store is returned as-is. Reading it back with
    # FAISS.load_local(..., allow_dangerous_deserialization=True) would only
    # add a disk round trip and an unpickle step for data already held here.
    return vector_store


In [39]:
def query_faiss_vector_store(vector_store, query, k=3):
    """
    Look up the documents most similar to a query in a FAISS vector store.

    Args:
        vector_store (FAISS): Vector store instance to search.
        query (str): Text to search for.
        k (int): How many similar documents to return.
    Returns:
        list: Whatever ``vector_store.similarity_search`` returns for the query.
    """
    # Delegate straight to the store; both arguments are passed by keyword.
    search = vector_store.similarity_search
    return search(query=query, k=k)


In [40]:
# Index the chunks and persist them under the name "medical_documents".
vector_store = store_documents_in_faiss(chunks, "medical_documents")

# Retrieve the 3 most similar chunks and print each one's text followed by
# its metadata dict.
results = query_faiss_vector_store(vector_store, query="sex life love", k=3)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]\n\n\n")

* bruises , burns, poisoning , broken bones, and internal
hemorrhages. Physical assault against an adult primarily
occurs with women, usually in the form of domestic vio-
lence. It is estimated that approximately three million
children witness domestic violence every year.
Sexual abuse of a child refers to sexual behavior
between an adult and child or between two children, one
of whom is dominant or significantly older. The sexual
behaviors can include touching breasts, genitals, and but-
tocks; either dressed or undressed. The behavior can also
include exhibitionism, cunnilingus, fellatio, or penetra-
tion of the vagina or anus with sexual organs or objects.
Pornographic photography is also used in sexual
abuse with children. Reported sex offenders are 97%
male. Females are more often perpetrators in child-care
settings, since children may confuse sexual abuse by a
female with normal hygiene care. Sexual abuse by step-
fathers is five times more common than with biological [{'producer

In [22]:
# Same lookup as above but calling similarity_search on the store directly;
# relies on `vector_store` having been created by an earlier cell.
results = vector_store.similarity_search(query="cancer",k=3)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]\n\n\n")

* cancerous cells. Cancer is commonly defined as the
uncontrolled growth of cells, with loss of differentiation
and commonly, with metastasis, spread of the cancer to
other tissues and organs. Cancers are malignant growths.
In contrast, benign growths remain encapsulated and grow
within a well-defined area. Although benign tumors may
be fatal if untreated, due to pressure on essential organs, as
in the case of a benign brain tumor, surgery or radiation
are the preferred methods of treating growths which have a
well defined location. Drug therapy is used when the
tumor has spread, or may spread, to all areas of the body.
Description
Several classes of drugs may be used in cancer treat-
ment, depending on the nature of the organ involved. For
example, breast cancers are commonly stimulated by
estrogens, and may be treated with drugs which inactive
the sex hormones. Similarly, prostate cancer may be
treated with drugs that inactivate androgens, the male sex [{'producer': 'PDFlib+PDI 5.0.0

In [25]:
# Check the length of a single query embedding; relies on `embedding_model`
# from an earlier cell. The recorded run printed 384.
embedding = embedding_model.embed_query("hello world")
print(len(embedding))

384


In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


# Use the first CUDA device with half precision when a GPU is available,
# otherwise fall back to CPU with float32 (the recorded run used cpu).
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text model weights and move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor that
# the pipeline below is wired up with.
processor = AutoProcessor.from_pretrained(model_id)

# `pipe` is the object later cells call to transcribe audio files.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


In [5]:
# Transcribe a local audio file with the speech-recognition pipeline built in
# an earlier cell and print the recognized text (read from the "text" key).
result = pipe('guess_age_gender.wav')
print(result["text"])




 I heard that you can understand what people say and even know their age and gender. So can you guess my age and gender from my voice?


In [7]:
import os
from groq import Groq

# One-off text-to-speech request against the Groq API; the API key is read
# from the GORQ_API environment variable.
client = Groq(api_key=os.environ.get("GORQ_API"))

speech_file_path = "speech.wav"
# Named `tts_model` rather than `model` so this cell does not rebind the
# `model` variable that holds the Whisper model loaded in an earlier cell.
tts_model = "playai-tts"
voice = "Fritz-PlayAI"
text = "I love building and shipping new features for our users!"
response_format = "wav"

response = client.audio.speech.create(
    model=tts_model,
    voice=voice,
    input=text,
    response_format=response_format
)
# Write the returned audio to speech_file_path (relative to the working directory).
response.write_to_file(speech_file_path)

In [8]:
def text_to_speech(text, speech_file_path="speech.wav", model="playai-tts", voice="Fritz-PlayAI"):
    """
    Convert text to speech using the Groq API and save it as a WAV file.

    Args:
        text (str): The text to synthesize.
        speech_file_path (str): File name (or relative path) of the audio file,
            resolved inside the ``Data/audio_files`` directory.
        model (str): Text-to-speech model to request.
        voice (str): Voice to request.
    Returns:
        str: Path of the audio file that was written.
    """
    output_path = os.path.join('Data/audio_files', speech_file_path)
    # Make sure the target directory exists before anything is written to it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # The API key is read from the GORQ_API environment variable.
    client = Groq(api_key=os.environ.get("GORQ_API"))
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
        response_format="wav"
    )
    response.write_to_file(output_path)
    # Return the path itself: the result of write_to_file is not the file
    # location (the recorded notebook output shows it as None).
    return output_path

# Example usage
# Synthesizes a short test sentence; `audio_file` holds whatever
# text_to_speech returns for the "test_speech.wav" output name.
text = "Hello, this is a test of the text-to-speech conversion."
audio_file = text_to_speech(text, speech_file_path="test_speech.wav")

Audio file saved at: None


In [9]:
from groq import Groq
import base64
import os

# Function to encode the image
def encode_image(image_path):
    """
    Return the contents of the file at ``image_path`` as a base64 string.

    Args:
        image_path (str): Path of the file to read (opened in binary mode).
    Returns:
        str: Base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Path to your image
image_path = "./Data/Images/OIP.webp"

# Getting the base64 string
base64_image = encode_image(image_path)

# The API key is read from the GORQ_API environment variable.
client = Groq(api_key=os.environ.get("GORQ_API"))

# Send a single user message made of a text question plus the image, which is
# embedded inline as a base64 data URL.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Which desease this image shows?"},
                {
                    "type": "image_url",
                    "image_url": {
                        # NOTE(review): the data URL declares image/jpeg while
                        # image_path ends in .webp - confirm the actual file
                        # format and whether the label should match it.
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        }
    ],
    model="meta-llama/llama-4-scout-17b-16e-instruct",
)

# Print the text content of the first returned choice.
print(chat_completion.choices[0].message.content)

The image shows a person with white flakes on their scalp, which is a common symptom of a skin condition known as **dandruff** or **seborrheic dermatitis**. However, the presence of small, white flakes that resemble tiny scales or patches, particularly on the scalp, is more indicative of **dandruff** or **seborrheic dermatitis**, but most specifically **dandruff**.

A more severe and specific condition characterized by a similar appearance is **head lice** infestation, but the image does not show any moving objects (lice or nits close to the scalp) but rather flakes. 

However, the flakes are also a symptom of **psoriasis**. Considering the typical appearance of the condition (white scales), I'd say the image most likely shows **dandruff**.


In [None]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

# Build a local text-generation pipeline for the Llama 3.1 8B Instruct model.
# NOTE(review): the recorded output shows the file fetch still at 0/4 and the
# cell has no execution count, so this cell may not have run to completion.
llm = HuggingFacePipeline.from_model_id(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
    pipeline_kwargs=dict(
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.03,
    ),
)

# Wrap the pipeline so it can be invoked with chat messages in later cells.
chat_model = ChatHuggingFace(llm=llm)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files:   0%|          | 0/4 [02:53<?, ?it/s]


In [None]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

# Conversation to send: a system instruction followed by one user question.
messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(
        content="What happens when an unstoppable force meets an immovable object?"
    ),
]

# Relies on `chat_model` created in the previous cell; the reply is stored in
# `ai_msg` and not displayed by this cell.
ai_msg = chat_model.invoke(messages)