In [1]:
import requests
import pandas as pd
import time

# Base URLs of the services this notebook talks to (all on localhost).
LOCAL_HOST = "http://localhost"
BACKEND_URL = f"{LOCAL_HOST}:8000"  # FastAPI backend (Docker exposes this)
ES_URL = f"{LOCAL_HOST}:9200"       # probed as "Elasticsearch" below
QDRANT_URL = f"{LOCAL_HOST}:6333"   # probed as "Qdrant" below

#### Using a local model (for example, Llama 3.1)

Run the following after `docker compose build` has finished (the `ollama` container must be running):

```bash
docker exec -it ollama ollama pull llama3.1
```

Then run:

```bash
docker exec -it ollama ollama list
```

You should see something like:  
`llama3.1:latest    46e0c10c039e    4.9 GB    3 seconds ago`

Finally, run:

```bash
docker exec -it ollama ollama run llama3.1 "Say hello from inside Docker."
```

If it returns valid text, you have enough GPU VRAM to run the local model; otherwise it prints an error.  
Common error:  
`Error: 500 Internal Server Error: model requires more system memory than is currently available unable to load full model on GPU`




In [2]:
def wait_for_service(
    url: str,
    name: str,
    timeout: float = 120,
    poll_interval: float = 3,
    request_timeout: float = 5,
) -> bool:
    """Poll ``url`` until the service answers, or give up after ``timeout`` seconds.

    Args:
        url: Endpoint probed with an HTTP GET.
        name: Human-readable service name used in the progress messages.
        timeout: Maximum total time to keep polling, in seconds.
        poll_interval: Pause between two probes, in seconds.
        request_timeout: Per-request timeout in seconds, so that a connection
            which never answers cannot block the loop indefinitely.

    Returns:
        True as soon as a probe returns HTTP status 200 or 400.

    Raises:
        TimeoutError: If no accepted response arrived within ``timeout`` seconds.
    """
    print(f"⏳ Waiting for {name} to start...")
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    start = time.monotonic()
    while time.monotonic() - start < timeout:
        try:
            r = requests.get(url, timeout=request_timeout)
            # 400 counts as "up" as well — presumably a reachable server that
            # merely rejects the bare GET.
            if r.status_code in (200, 400):
                print(f"✅ {name} is up!")
                return True
        except requests.RequestException:
            # Connection refused / timed out: the service is not listening yet.
            pass
        # Back off after EVERY unsuccessful probe. Previously the sleep only ran
        # when the request raised, so a service answering e.g. 503 while booting
        # was hammered in a tight loop until the deadline.
        time.sleep(poll_interval)
    raise TimeoutError(f"{name} did not start within {timeout}s")

# Probe each service in turn; wait_for_service raises TimeoutError if one
# never answers, which stops the cell before later cells try to use it.
wait_for_service(f"{BACKEND_URL}/docs", "Backend")
wait_for_service(f"{ES_URL}", "Elasticsearch")
# Last expression of the cell, so its return value (True) is shown as the cell output.
wait_for_service(f"{QDRANT_URL}/collections", "Qdrant")


⏳ Waiting for Backend to start...
✅ Backend is up!
⏳ Waiting for Elasticsearch to start...
✅ Elasticsearch is up!
⏳ Waiting for Qdrant to start...
✅ Qdrant is up!


True

In [3]:
# -------------------------------------------------------------
# STEP 1: Build small in-memory tables that stand in for real inputs
# -------------------------------------------------------------

# First reference table: three fruits with their known colors, one 'id' per row.
# It plays the role of data coming from one database table or source file.
table_1 = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        fruit=["apple", "banana", "mango"],
        color=["red", "yellow", "orange"],
    )
)

# Second reference table with the same three columns; it is appended to the
# same index later on. Note that ids 1 and 2 also occur in 'table_1', so an
# 'id' is only unique within its own table.
table_2 = pd.DataFrame(
    dict(
        id=[1, 2, 4],
        fruit=["grape", "pineapple", "kiwi"],
        color=["purple", "brown", "green"],
    )
)

# Table to be repaired. It has no 'id' column and its 'color' column is
# deliberately imperfect:
#   - apple, avocado and lime have no color (None)
#   - banana carries a wrong color ("blue")
# This is the input later sent to the repair endpoint.
test_repair = pd.DataFrame(
    dict(
        fruit=["apple", "banana", "avocado", "lime"],
        color=[None, "blue", None, None],
    )
)

# -------------------------------------------------------------
# STEP 2: Persist the tables as CSV files for upload
# -------------------------------------------------------------

# The later cells upload / read these files from the working directory, so
# each frame is written to disk without its pandas index column.
for csv_name, frame in (
    ("table_1.csv", table_1),
    ("table_2.csv", table_2),
    ("test_repair.csv", test_repair),
):
    frame.to_csv(csv_name, index=False)


In [4]:
import requests

# -------------------------------------------------------------
# STEP 0: Backend connection setup
# -------------------------------------------------------------

# Root address of the running FastAPI backend that serves the indexing
# and repair endpoints used in this notebook.
BACKEND_URL = "http://localhost:8000"

# Name of the combined index. The uploaded tables are stored under this name;
# per the project notes the backend keeps semantic and syntactic
# representations of them in Qdrant and Elasticsearch.
INDEX_NAME = "fruit_index_combined_final"

# -------------------------------------------------------------
# STEP 1: CREATE a new index and upload the first CSV
# -------------------------------------------------------------

# A POST to `/index/` creates the index and fills it with the uploaded
# CSV file(s). We begin with `table_1.csv` only.
create_url = BACKEND_URL + "/index/"

# The file is opened in binary mode because it is sent as a multipart
# form-data upload; the `with` block closes it once the request returns.
with open("table_1.csv", mode="rb") as table_1_file:
    form_fields = {"index_name": INDEX_NAME}   # plain form field
    upload = {"files": table_1_file}           # multipart file part
    resp = requests.post(create_url, data=form_fields, files=upload)

# Show status code and body so the outcome of the creation is visible.
# Expected per the original notes: HTTP 200 OK (or 400 if the index already exists).
print("CREATE:", resp.status_code, resp.text)


# -------------------------------------------------------------
# STEP 2: UPDATE (append) the same index with another CSV
# -------------------------------------------------------------

# A PUT to the same `/index/` endpoint *appends* new tables to an existing
# index, so several datasets can be combined into one searchable index.
update_url = f"{BACKEND_URL}/index/"

# Additional CSVs to merge into the index; add more file names here as needed.
files_to_add = ["table_2.csv"]

# (field_name, file_object) tuples. The list-of-tuples form lets `requests`
# send several files under the same form field name.
files = []

try:
    # Open the files INSIDE the `try`: if a later file is missing, the
    # `finally` clause still closes every handle opened so far. (Opening them
    # in a comprehension before the `try` would leak those handles.)
    for fname in files_to_add:
        files.append(("files", open(fname, "rb")))

    # Append the files to the existing index. Expected backend behaviour
    # (per the project notes, not visible from this notebook):
    #   1. Load and embed the CSV data using the SentenceTransformer model.
    #   2. Store textual data in Elasticsearch for syntactic search.
    #   3. Store vector embeddings in Qdrant for semantic search.
    resp = requests.put(
        update_url,
        data={"index_name": INDEX_NAME},
        files=files
    )

    # Show status code and body; HTTP 200 OK is expected on success.
    print("UPDATE:", resp.status_code, resp.text)

finally:
    # Always close open file handles to avoid resource leaks.
    for _, fh in files:
        fh.close()


CREATE: 200 {"status":"success"}
UPDATE: 200 {"status":"success"}


In [5]:
# Echo the form field used in the index requests to confirm which index is in use.
print(dict(index_name=INDEX_NAME))


{'index_name': 'fruit_index_combined_final'}


In [6]:
import pandas as pd
import requests

# -------------------------------------------------------------
# STEP 0: Backend setup
# -------------------------------------------------------------

# Base URL for the FastAPI backend.
# This backend provides the `/repair/` endpoint for performing automatic
# data imputation (repairing missing or incorrect attribute values).
BACKEND_URL = "http://localhost:8000"


# -------------------------------------------------------------
# STEP 1: Load and preprocess the test (incomplete) dataset
# -------------------------------------------------------------

# Load the CSV containing incomplete or inconsistent data.
# Example: "test_repair.csv" has a 'fruit' column and a 'color' column,
# but some color values are missing or incorrect.
df = pd.read_csv("test_repair.csv")

# Replace every missing value (NaN) with Python None so it is serialized as
# JSON `null`, then convert the frame into one dict per row.
# The cast to `object` matters: on numeric (or other non-object) columns
# `where(..., None)` leaves NaN in place, and NaN is not valid JSON.
records = df.astype(object).where(df.notna(), None).to_dict(orient="records")


# -------------------------------------------------------------
# STEP 2: Construct the repair request payload
# -------------------------------------------------------------

# `/repair/` takes one JSON object that says what to repair and which
# context to use. Row ids are simply the positions of the rows in `records`.
payload = dict(
    # Free-text description of the dataset, giving the reasoning model
    # (especially an LLM) some semantic context.
    entity_description="A small table of fruits and their colors",

    # Column whose values are missing or wrong and should be repaired.
    target_name="color",

    # One entry per row of the target column:
    #   - 'id'    : row identifier
    #   - 'value' : current value (None when missing, possibly incorrect)
    target_data=[
        dict(id=row_id, value=row["color"]) for row_id, row in enumerate(records)
    ],

    # Context ("pivot") columns used to infer the target; here only 'fruit'.
    pivot_names=["fruit"],

    # Pivot values per row, using the same ids as 'target_data'.
    # 'values' holds one item per name listed in 'pivot_names'.
    pivot_data=[
        dict(id=row_id, values=[row["fruit"]]) for row_id, row in enumerate(records)
    ],

    # Reasoning model to use. Per the original notes it must be one of the
    # models initialized by the backend:
    #   ['GPT-4-AzureOpenAI', 'GPT-4-OpenAI', 'Llama 3.1']
    reasoner_name="GPT-4-AzureOpenAI",

    # Index used for context retrieval: the one created earlier via /index/.
    index_name=INDEX_NAME,

    # Retrieval engine(s):
    #   - "semantic"  uses Qdrant (vector search)
    #   - "syntactic" uses Elasticsearch (keyword/fuzzy search)
    #   - "both"      uses a combination of both
    index_type="semantic",

    # Optional reranker model (None disables reranking).
    # Could later support models like "cross-encoder/ms-marco-MiniLM-L-6-v2"
    reranker_type=None,
)


# -------------------------------------------------------------
# STEP 3: Send the repair request to the backend
# -------------------------------------------------------------

# The payload goes to `/repair/` as a JSON body. Expected backend behaviour
# (per the original notes):
#   1. Retrieve relevant context rows from Qdrant/Elasticsearch.
#   2. Pass the context and missing entries to the reasoning model.
#   3. Return imputed (repaired) values for the missing attributes.
repair_url = f"{BACKEND_URL}/repair/"
resp = requests.post(repair_url, json=payload)


# -------------------------------------------------------------
# STEP 4: Display the results
# -------------------------------------------------------------

# Status code first: 200 OK is expected when the repair succeeded.
print("REPAIR:", resp.status_code)

# Then the raw JSON body for inspection. Its shape is:
# {
#   "status": "success",
#   "results": [
#       {"value": "red", "table_name": ..., "row_number": ..., "tuple": ...},
#       {"value": "yellow", ...}, ...
#   ]
# }
# 'table_name', 'row_number' and 'tuple' are null when a result is not
# tied to a retrieved row.
print(resp.text)


REPAIR: 200
{"status":"success","results":[{"value":"red","table_name":"table_1.csv","row_number":"0","tuple":{"id":1,"fruit":"apple","color":"red"}},{"value":"yellow","table_name":"table_1.csv","row_number":"1","tuple":{"id":2,"fruit":"banana","color":"yellow"}},{"value":null,"table_name":null,"row_number":null,"tuple":null},{"value":"green","table_name":null,"row_number":null,"tuple":null}]}


In [7]:
### How to parse the model response
# Decode the body with the JSON parser rather than eval(): eval() would execute
# arbitrary code returned by the server, and the textual 'null' -> 'None'
# replacement corrupts any string containing "null" and fails on true/false.
# JSON null is mapped to Python None automatically.
resp.json()['results']

[{'value': 'red',
  'table_name': 'table_1.csv',
  'row_number': '0',
  'tuple': {'id': 1, 'fruit': 'apple', 'color': 'red'}},
 {'value': 'yellow',
  'table_name': 'table_1.csv',
  'row_number': '1',
  'tuple': {'id': 2, 'fruit': 'banana', 'color': 'yellow'}},
 {'value': None, 'table_name': None, 'row_number': None, 'tuple': None},
 {'value': 'green', 'table_name': None, 'row_number': None, 'tuple': None}]

In [8]:
# Show the original input rows (as sent for repair) for comparison with the repaired values.
records

[{'fruit': 'apple', 'color': None},
 {'fruit': 'banana', 'color': 'blue'},
 {'fruit': 'avocado', 'color': None},
 {'fruit': 'lime', 'color': None}]