Testin the tapas-large-finetuned-wtq model on the data

In [10]:
import pandas as pd
from transformers import pipeline

# Load the CSV file
file_path = "../data/PARAMETROS_TJ2_ORDENADOS.csv"  # Replace with your file path
data = pd.read_csv(file_path, delimiter=";", encoding="latin1")

# Handle missing values and convert all columns to strings
data.fillna("N/A", inplace=True)
data = data.astype(str)

# Reduce table size to relevant columns if needed
columns_to_keep = ["N_DESCARGA", "fecha", "hora", "comentarioDesc"]
data = data[columns_to_keep]

# Load the TAPAS pipeline with the base model
pipe = pipeline("table-question-answering", model="google/tapas-base-finetuned-wtq", device=-1)

# Define questions
questions = [
    "CUAL ES LA FECHA PARA EL 4?",
    "What is the hora for N_DESCARGA 5?",
    "What is the comentarioDesc for N_DESCARGA 12?"
]

# Process each question
print("\nResults:")
for question in questions:
    # Dynamically filter rows relevant to the question (if possible)
    try:
        # Filter rows dynamically based on the query
        query_value = 4  # Example value extracted from the question
        filtered_table = data[data["N_DESCARGA"] == str(query_value)]  # Reduce the table to relevant rows
    except ValueError:
        filtered_table = data  # Default to the full table if no filtering is possible

    # Run the pipeline with the filtered table
    answer = pipe(table=filtered_table, query=question)
    print(f"Question: {question}")
    print(f"Answer: {answer['answer']}\n")

  data = pd.read_csv(file_path, delimiter=";", encoding="latin1")
  data.fillna("N/A", inplace=True)
Device set to use cpu



Results:
Question: CUAL ES LA FECHA PARA EL 4?
Answer: 19971126.0

Question: What is the hora for N_DESCARGA 5?
Answer: 00:00

Question: What is the comentarioDesc for N_DESCARGA 12?
Answer: N/A



  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Now let´s see if it works if we ask for a bigger number. 

Now we will try using a binary search algorithm to efficiently locate the N_DESCARGA value in the table, and try to reduce the search time. 

I am going to try a different approach: use a model to convert the question into SQL and query the dataset directly.

Loggin using huggingface-cli login, and put your access token.

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pickle

# Load the model and tokenizer from Hugging Face
model_name = "meta-llama/Llama-2-70b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text generation pipeline
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Save the pipeline as a pickle file
with open("llm_pipeline.pkl", "wb") as f:
    pickle.dump(llm_pipeline, f)

print("Pipeline saved as llm_pipeline.pkl")

Downloading shards:  13%|█▎        | 2/15 [54:25<5:53:47, 1632.89s/it]


KeyboardInterrupt: 

In [18]:
import pandas as pd
from langchain_community.llms import Replicate
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Set up Replicate for LLaMA-2
os.environ["REPLICATE_API_TOKEN"] = os.getenv("REPLICATE_API_TOKEN")
llama2_13b_chat = "meta/llama-2-7b-chat"

llm = Replicate(
    model=llama2_13b_chat,
    model_kwargs={"temperature": 0.7, "max_new_tokens": 100}
)

# Load the CSV file
file_path = "../data/PARAMETROS_TJ2_ORDENADOS.csv"  # Replace with your file path
data = pd.read_csv(file_path, delimiter=";", encoding="latin1", low_memory=False)

# Ensure missing values are replaced properly
for column in data.columns:
    if data[column].dtype == "float64":
        data[column] = data[column].fillna(-1)
        if column == "N_DESCARGA":
            data[column] = data[column].astype(int)
    else:
        data[column] = data[column].fillna("N/A")

# Convert the entire DataFrame to strings
data = data.astype(str)

# Define the column names and script as context
script_context = (
    "The table is named 'data' and contains the following important columns:\n"
    "N_DESCARGA, fecha, hora, comentarioDesc, comentarioExp, configuracion, "
    "potencia_radiada, energia_diamagnetica.\n"
    "You must use these column names exactly as they are when writing SQL queries.\n"
    "Always use 'N_DESCARGA' as the column for filtering by number.\n"
    "Respond with only the SQL query, without any explanation or additional text.\n"
)

# Define questions
questions = [
    "What is the fecha for N_DESCARGA 4?",
    "What is the hora for N_DESCARGA 26458?",
    "What is the comentarioDesc for N_DESCARGA 12?"
]

# Helper function to execute SQL queries on the DataFrame
def execute_sql_query(data, sql_query):
    try:
        # Use pandasql to execute SQL queries
        import pandasql as ps
        result = ps.sqldf(sql_query, locals())
        return result
    except Exception as e:
        return f"SQL Execution Error: {e}"

# Helper function to validate SQL query
def is_valid_sql(sql_query):
    return sql_query.strip().upper().startswith("SELECT")

# Process each question
print("\nResults:")
for question in questions:
    try:
        # Provide the script and the question to the LLM
        llm_input = f"{script_context}\nConvert the following question into an SQL query: {question}"
        
        # Get the generated SQL query from LLaMA-2
        response = llm.invoke(input=llm_input).strip()  # Extract and clean the response

        # Debugging: Print the generated SQL query
        print(f"Generated SQL Query: {response}")

        # Validate the SQL query
        if not is_valid_sql(response):
            print(f"Invalid SQL query generated for question: {question}")
            continue

        # Execute the SQL query on the DataFrame
        result = execute_sql_query(data, response)

        # Output the result
        print(f"Question: {question}")
        print(f"Answer: {result}\n")

    except Exception as e:
        print(f"Error during processing for question '{question}': {e}\n")


Results:
Generated SQL Query: SELECT fecha FROM data WHERE N_DESCARGA = 4;
Question: What is the fecha for N_DESCARGA 4?
Answer:         fecha
0  19971126.0

Generated SQL Query: SELECT hora FROM data WHERE N_DESCARGA = 26458;
Question: What is the hora for N_DESCARGA 26458?
Answer:     hora
0  14:14

Generated SQL Query: SELECT comentarioDesc FROM data WHERE N_DESCARGA = 12;
Question: What is the comentarioDesc for N_DESCARGA 12?
Answer:   comentarioDesc
0            N/A



In [2]:
import pandas as pd
from langchain_community.llms import Replicate
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Set up Replicate for LLaMA-2
os.environ["REPLICATE_API_TOKEN"] = os.getenv("REPLICATE_API_TOKEN")
llama2_13b_chat = "meta/llama-2-7b-chat"

llm = Replicate(
    model=llama2_13b_chat,
    model_kwargs={"temperature": 0.7, "max_new_tokens": 100}
)

# Load the CSV file
file_path = "../data/PARAMETROS_TJ2_ORDENADOS.csv"  # Replace with your file path
data = pd.read_csv(file_path, delimiter=";", encoding="latin1", low_memory=False)

# Ensure missing values are replaced properly
for column in data.columns:
    if data[column].dtype == "float64":
        data[column] = data[column].fillna(-1)
        if column == "N_DESCARGA":
            data[column] = data[column].astype(int)
    else:
        data[column] = data[column].fillna("N/A")

# Convert the entire DataFrame to strings
data = data.astype(str)

# Define the column names and script as context
script_context = (
    "The table is named 'data' and contains the following important columns:\n"
    "N_DESCARGA, fecha, hora, comentarioDesc, comentarioExp, configuracion, "
    "potencia_radiada, energia_diamagnetica.\n"
    "You must use these column names exactly as they are when writing SQL queries.\n"
    "Always use 'N_DESCARGA' as the column for filtering by number.\n"
    "Respond with only the SQL query, without any explanation or additional text.\n"
)

# Define questions
questions = [
    "Cual es la fecha para el numero de descarga 4?",
    "cual es la hora para el numero de descarga 26458?",
    "cual es el comentario para el numero de descarga 12?"
]

# Helper function to execute SQL queries on the DataFrame
def execute_sql_query(data, sql_query):
    try:
        # Use pandasql to execute SQL queries
        import pandasql as ps
        result = ps.sqldf(sql_query, locals())
        return result
    except Exception as e:
        return f"SQL Execution Error: {e}"

# Helper function to validate SQL query
def is_valid_sql(sql_query):
    return sql_query.strip().upper().startswith("SELECT")

# Process each question
print("\nResults:")
for question in questions:
    try:
        # Provide the script and the question to the LLM
        llm_input = f"{script_context}\nConvert the following question into an SQL query: {question}"
        
        # Get the generated SQL query from LLaMA-2
        response = llm.invoke(input=llm_input).strip()  # Extract and clean the response

        # Debugging: Print the generated SQL query
        print(f"Generated SQL Query: {response}")

        # Validate the SQL query
        if not is_valid_sql(response):
            print(f"Invalid SQL query generated for question: {question}")
            continue

        # Execute the SQL query on the DataFrame
        result = execute_sql_query(data, response)

        # Output the result
        print(f"Question: {question}")
        print(f"Answer: {result}\n")

    except Exception as e:
        print(f"Error during processing for question '{question}': {e}\n")


Results:
Error during processing for question 'Cual es la fecha para el numero de descarga 4?': ReplicateError Details:
title: Free time limit reached
status: 402
detail: You have reached the free time limit. To continue using Replicate, set up billing at https://replicate.com/account/billing#billing.

Error during processing for question 'cual es la hora para el numero de descarga 26458?': ReplicateError Details:
title: Free time limit reached
status: 402
detail: You have reached the free time limit. To continue using Replicate, set up billing at https://replicate.com/account/billing#billing.

Error during processing for question 'cual es el comentario para el numero de descarga 12?': ReplicateError Details:
title: Free time limit reached
status: 402
detail: You have reached the free time limit. To continue using Replicate, set up billing at https://replicate.com/account/billing#billing.



Testing with different quesitons

In [None]:
import pandas as pd
from langchain_community.llms import Replicate
from dotenv import load_dotenv
from googletrans import Translator
import asyncio
import os
import nest_asyncio
nest_asyncio.apply()

# Load environment variables
load_dotenv()

# Set up Replicate for LLaMA-2
os.environ["REPLICATE_API_TOKEN"] = os.getenv("REPLICATE_API_TOKEN")
llama2_13b_chat = "meta/llama-2-7b-chat"

llm = Replicate(
    model=llama2_13b_chat,
    model_kwargs={"temperature": 0.7, "max_new_tokens": 100}
)

# Load the CSV file
file_path = "../data/PARAMETROS_TJ2_ORDENADOS.csv"  # Replace with your file path
data = pd.read_csv(file_path, delimiter=";", encoding="latin1", low_memory=False)

# Ensure missing values are replaced properly
for column in data.columns:
    if data[column].dtype == "float64":
        data[column] = data[column].fillna(-1)
        if column == "N_DESCARGA":
            data[column] = data[column].astype(int)
    else:
        data[column] = data[column].fillna("N/A")

# Convert the entire DataFrame to strings
data = data.astype(str)

# Define the column names and script as context
script_context = (
    "The table is named 'data' and contains the following important columns:\n"
    "N_DESCARGA, fecha, hora, comentarioDesc, comentarioExp, configuracion, "
    "potencia_radiada, energia_diamagnetica.\n"
    "You must use these column names exactly as they are when writing SQL queries.\n"
    "Always use 'N_DESCARGA' as the column for filtering by number.\n"
    "Questions may be in Spanish or English, but the output must always be a valid SQL query.\n"
    "Do not include any explanations, greetings, or additional text in your response. Only output the SQL query."
)

# Spanish questions
questions = [
    "Cual es la fecha para el numero de descarga 4?",
    "cual es la hora para el numero de descarga 26458?",
    "cual es el comentario para el numero de descarga 12?"
]

# Helper function to execute SQL queries on the DataFrame
def execute_sql_query(data, sql_query):
    try:
        # Use pandasql to execute SQL queries
        import pandasql as ps
        result = ps.sqldf(sql_query, locals())
        return result
    except Exception as e:
        return f"SQL Execution Error: {e}"

# Helper function to validate SQL query
def is_valid_sql(sql_query):
    return sql_query.strip().upper().startswith("SELECT")

# Translate questions and process them
async def process_questions():
    translator = Translator()

    print("\nResults:")
    for question in questions:
        try:
            # Translate the question to English
            translated_question = await translator.translate(question, src='es', dest='en')

            # Provide the script and the translated question to the LLM
            llm_input = f"{script_context}\nConvert the following question into an SQL query: {translated_question.text}"

            # Get the generated SQL query from LLaMA-2
            response = llm.invoke(input=llm_input).strip()  # Extract and clean the response

            # Debugging: Print the generated SQL query
            print(f"Generated SQL Query: {response}")

            # Validate the SQL query
            if not is_valid_sql(response):
                print(f"Invalid SQL query generated for question: {question}")
                continue

            # Execute the SQL query on the DataFrame
            result = execute_sql_query(data, response)

            # Output the result
            print(f"Question (Original): {question}")
            print(f"Question (Translated): {translated_question.text}")
            print(f"Answer: {result}\n")

        except Exception as e:
            print(f"Error during processing for question '{question}': {e}\n")

# Run the processing function in an existing event loop
try:
    loop = asyncio.get_running_loop()
    task = loop.create_task(process_questions())
    loop.run_until_complete(task)
except RuntimeError:
    asyncio.run(process_questions())


Results:
Generated SQL Query: SELECT fecha FROM data WHERE N_DESCARGA = 4;
Question (Original): Cual es la fecha para el numero de descarga 4?
Question (Translated): What is the date for download number 4?
Answer:         fecha
0  19971126.0

Generated SQL Query: Sure, I'd be happy to help! Here is the SQL query to answer the question:

SELECT hora FROM data WHERE N_DESCARGA = 26458;
Invalid SQL query generated for question: cual es la hora para el numero de descarga 26458?
Generated SQL Query: SELECT comentarioDesc FROM data WHERE N_DESCARGA = 12;
Question (Original): cual es el comentario para el numero de descarga 12?que numero de descarga tiene este comentario: RAYOS - X
Question (Translated): What is the comment for download number 12? What download number does this comment have: RAYOS - X
Answer:   comentarioDesc
0            N/A



In [4]:
import pandas as pd
from langchain_community.llms import Replicate
from dotenv import load_dotenv
from googletrans import Translator
import asyncio
import os
import nest_asyncio
nest_asyncio.apply()

# Load environment variables
load_dotenv()

# Set up Replicate for LLaMA-2
os.environ["REPLICATE_API_TOKEN"] = os.getenv("REPLICATE_API_TOKEN")
llama2_13b_chat = "meta/llama-2-7b-chat"

llm = Replicate(
    model=llama2_13b_chat,
    model_kwargs={"temperature": 0.7, "max_new_tokens": 100}
)

# Load the CSV file
file_path = "../data/PARAMETROS_TJ2_ORDENADOS.csv"  # Replace with your file path
data = pd.read_csv(file_path, delimiter=";", encoding="latin1", low_memory=False)

# Ensure missing values are replaced properly
for column in data.columns:
    if data[column].dtype == "float64":
        data[column] = data[column].fillna(-1)
        if column == "N_DESCARGA":
            data[column] = data[column].astype(int)
    else:
        data[column] = data[column].fillna("N/A")

# Convert the entire DataFrame to strings
data = data.astype(str)

# Define the column names and script as context
script_context = (
    "The table is named 'data' and contains the following important columns:\n"
    "N_DESCARGA, fecha, hora, comentarioDesc, comentarioExp, configuracion, "
    "potencia_radiada, energia_diamagnetica.\n"
    "You must use these column names exactly as they are when writing SQL queries.\n"
    "Always use 'N_DESCARGA' as the column for filtering by number.\n"
    "Questions may be in Spanish or English, but the output must always be a valid SQL query.\n"
    "Do not include any explanations, greetings, or additional text in your response. Only output the SQL query."
)

# Spanish questions
questions = [
    "Cual es la fecha para el numero de descarga 4?",
    "cual es la hora para el numero de descarga 26458?",
    "cual es el comentario para el numero de descarga 8621?"
]

# Helper function to execute SQL queries on the DataFrame
def execute_sql_query(data, sql_query):
    try:
        # Use pandasql to execute SQL queries
        import pandasql as ps
        result = ps.sqldf(sql_query, locals())
        return result
    except Exception as e:
        return f"SQL Execution Error: {e}"

# Translate questions and process them
async def process_questions():
    translator = Translator()

    print("\nResults:")
    for question in questions:
        try:
            # Translate the question to English
            translated_question = await translator.translate(question, src='es', dest='en')

            # Provide the script and the translated question to the LLM
            llm_input = f"{script_context}\nConvert the following question into an SQL query: {translated_question.text}"

            # Get the generated SQL query from LLaMA-2
            response = llm.invoke(input=llm_input).strip()  # Extract and clean the response

            # Validate the SQL query
            if not response.strip().upper().startswith("SELECT"):
                print(f"Invalid SQL query generated for question: {question}")
                continue

            # Execute the SQL query on the DataFrame
            result = execute_sql_query(data, response)

            # Output only the question and the result
            print(f"Question (Original): {question}")
            print(f"Question (Translated): {translated_question.text}")
            print(f"Answer: {result}\n")

        except Exception as e:
            print(f"Error during processing for question '{question}': {e}\n")

# Run the processing function in an existing event loop
try:
    loop = asyncio.get_running_loop()
    task = loop.create_task(process_questions())
    loop.run_until_complete(task)
except RuntimeError:
    asyncio.run(process_questions())


Results:
Error during processing for question 'Cual es la fecha para el numero de descarga 4?': ReplicateError Details:
title: Free time limit reached
status: 402
detail: You have reached the free time limit. To continue using Replicate, set up billing at https://replicate.com/account/billing#billing.

Error during processing for question 'cual es la hora para el numero de descarga 26458?': ReplicateError Details:
title: Free time limit reached
status: 402
detail: You have reached the free time limit. To continue using Replicate, set up billing at https://replicate.com/account/billing#billing.

Error during processing for question 'cual es el comentario para el numero de descarga 8621?': ReplicateError Details:
title: Free time limit reached
status: 402
detail: You have reached the free time limit. To continue using Replicate, set up billing at https://replicate.com/account/billing#billing.



Testing with more questions

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

In [3]:
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq

# Path to the CSV file
csv_file_path = "/Users/sofiamorenolasa/Desktop/TFG/Anomaly detection/data/PARAMETROS_TJ2_ORDENADOS.csv"

# Initialize ChatGroq LLM
groq_api = "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"
llm = ChatGroq(temperature=0, model="llama3-70b-8192", api_key=groq_api)

# Fix delimiter issue by passing additional pandas kwargs
agent = create_csv_agent(
    llm,
    csv_file_path,
    verbose=True,
    allow_dangerous_code=True,
    pandas_kwargs={"sep": ";", "on_bad_lines": "skip"}  # Specify delimiter and skip bad lines
)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 57571: invalid continuation byte

QUERY PIPELINE OVER PANDAS DATAFRAME

In [1]:
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.experimental.query_engine.pandas import (
    PandasInstructionParser,
)
from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate

In [6]:
import pandas as pd

file_path = "../data/PARAMETROS_TJ2.csv"

# Use latin1 encoding
df = pd.read_csv(file_path, delimiter=";", encoding="latin1")

  df = pd.read_csv(file_path, delimiter=";", encoding="latin1")


In [7]:
instruction_str = (
    "1. Convert the query to executable Python code using Pandas.\n"
    "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
    "3. The code should represent a solution to the query.\n"
    "4. PRINT ONLY THE EXPRESSION.\n"
    "5. Do not quote the expression.\n"
)

pandas_prompt_str = (
    "You are working with a pandas dataframe in Python.\n"
    "The name of the dataframe is `df`.\n"
    "This is the result of `print(df.head())`:\n"
    "{df_str}\n\n"
    "Follow these instructions:\n"
    "{instruction_str}\n"
    "Query: {query_str}\n\n"
    "Expression:"
)
response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: "
)

pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    instruction_str=instruction_str, df_str=df.head(5)
)
pandas_output_parser = PandasInstructionParser(df)
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)
llm = OpenAI(model="gpt-3.5-turbo")

In [8]:
qp = QP(
    modules={
        "input": InputComponent(),
        "pandas_prompt": pandas_prompt,
        "llm1": llm,
        "pandas_output_parser": pandas_output_parser,
        "response_synthesis_prompt": response_synthesis_prompt,
        "llm2": llm,
    },
    verbose=True,
)
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])
qp.add_links(
    [
        Link("input", "response_synthesis_prompt", dest_key="query_str"),
        Link(
            "llm1", "response_synthesis_prompt", dest_key="pandas_instructions"
        ),
        Link(
            "pandas_output_parser",
            "response_synthesis_prompt",
            dest_key="pandas_output",
        ),
    ]
)
# add link from response synthesis prompt to llm2
qp.add_link("response_synthesis_prompt", "llm2")

In [None]:
response = qp.run(
    query_str="cual es la hora para el numero de descarga 26458?",
)

In [11]:
# Required imports
import pandas as pd
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.experimental.query_engine.pandas import (
    PandasInstructionParser,
)
from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate

# File path
file_path = "../data/PARAMETROS_TJ2.csv"

# Load the dataset with latin1 encoding and semicolon delimiter
df = pd.read_csv(file_path, delimiter=";", encoding="latin1")

# Display the first 5 rows of the dataframe to validate loading
print("DataFrame Loaded:\n", df.head())

# Define the instruction and prompts
instruction_str = (
    "1. Convert the query to executable Python code using Pandas.\n"
    "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
    "3. The code should represent a solution to the query.\n"
    "4. PRINT ONLY THE EXPRESSION.\n"
    "5. Do not quote the expression.\n"
)

pandas_prompt_str = (
    "You are working with a pandas dataframe in Python.\n"
    "The name of the dataframe is `df`.\n"
    "This is the result of `print(df.head())`:\n"
    "{df_str}\n\n"
    "Follow these instructions:\n"
    "{instruction_str}\n"
    "Query: {query_str}\n\n"
    "Expression:"
)

response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: "
)

# Instantiate PromptTemplate objects
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    instruction_str=instruction_str, df_str=df.head(5)
)
pandas_output_parser = PandasInstructionParser(df)
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)

# Initialize LLM
os.environ["REPLICATE_API_TOKEN"] = os.getenv("REPLICATE_API_TOKEN")
llm = "meta/llama-2-7b-chat"

# Build the query pipeline
qp = QP(
    modules={
        "input": InputComponent(),
        "pandas_prompt": pandas_prompt,
        "llm1": llm,
        "pandas_output_parser": pandas_output_parser,
        "response_synthesis_prompt": response_synthesis_prompt,
        "llm2": llm,
    },
    verbose=True,
)

qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])
qp.add_links(
    [
        Link("input", "response_synthesis_prompt", dest_key="query_str"),
        Link(
            "llm1", "response_synthesis_prompt", dest_key="pandas_instructions"
        ),
        Link(
            "pandas_output_parser",
            "response_synthesis_prompt",
            dest_key="pandas_output",
        ),
    ]
)
qp.add_link("response_synthesis_prompt", "llm2")

# Run a query
response = qp.run(
    query_str="What is the correlation between ColumnA and ColumnB?"
)

# Print the final response
print(response.message.content)

  df = pd.read_csv(file_path, delimiter=";", encoding="latin1")


DataFrame Loaded:
    N_DESCARGA       fecha   hora comentarioDesc comentarioExp configuracion  \
0         112  19971217.0  19:05            NaN           NaN           NaN   
1         113  19971217.0  19:09            NaN           NaN           NaN   
2         114  19971217.0  19:55            NaN           NaN           NaN   
3         115  19971218.0  11:08            NaN           NaN           NaN   
4         116  19971218.0  11:28            NaN           NaN           NaN   

   potencia_radiada  energia_diamagnetica  retraso_densidad_girotron  zeff  \
0               NaN                   NaN                        NaN   NaN   
1               NaN                   NaN                        NaN   NaN   
2               NaN                   NaN                        NaN   NaN   
3               NaN                   NaN                        NaN   NaN   
4               NaN                   NaN                        NaN   NaN   

   ...  IAccel_nominal_NBI2  tini_NBI

APIConnectionError: Connection error.

In [None]:
import os
import pandas as pd
from llama_index.core.query_pipeline import QueryPipeline as QP, Link, InputComponent
from llama_index.experimental.query_engine.pandas import PandasInstructionParser
from llama_index.core import PromptTemplate
from replicate import run as replicate_run

# Set up Llama 2 using Replicate API
os.environ["REPLICATE_API_TOKEN"] = os.getenv("REPLICATE_API_TOKEN")
llama2_13b_chat = "meta/llama-2-7b-chat"

# Path to your CSV file
file_path = "/Users/sofiamorenolasa/Desktop/TFG/Anomaly detection/data/PARAMETROS_TJ2_ORDENADOS.csv"

# Load the CSV with appropriate delimiter and encoding
df = pd.read_csv(file_path, delimiter=";", encoding="latin1", on_bad_lines="skip")

# Instruction for Pandas code generation
instruction_str = (
    "1. Convert the query to executable Python code using Pandas.\n"
    "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
    "3. The code should represent a solution to the query.\n"
    "4. PRINT ONLY THE EXPRESSION.\n"
    "5. Do not quote the expression.\n"
)

# Prompt for generating Pandas code
pandas_prompt_str = (
    "You are working with a pandas dataframe in Python.\n"
    "The name of the dataframe is `df`.\n"
    "This is the result of `print(df.head())`:\n"
    "{df_str}\n\n"
    "Follow these instructions:\n"
    "{instruction_str}\n"
    "Query: {query_str}\n\n"
    "Expression:"
)

# Prompt for synthesizing a final response
response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: "
)

# Initialize prompts
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    instruction_str=instruction_str, df_str=df.head(5)
)
pandas_output_parser = PandasInstructionParser(df)
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)


# Define a function to use Llama 2 via Replicate
def llama2_query(prompt):
    """Query Llama 2 via Replicate API."""
    response = replicate_run(
        llama2_13b_chat,
        input={"prompt": prompt, "max_length": 300, "temperature": 0.7},
    )
    return response


# Use Llama 2 for querying and response generation
qp = QP(
    modules={
        "input": InputComponent(),
        "pandas_prompt": pandas_prompt,
        "llm1": llama2_query,  # First Llama 2 query for Pandas instructions
        "pandas_output_parser": pandas_output_parser,
        "response_synthesis_prompt": response_synthesis_prompt,
        "llm2": llama2_query,  # Second Llama 2 query for final response
    },
    verbose=True,
)

# Define chains and links
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])
qp.add_links(
    [
        Link("input", "response_synthesis_prompt", dest_key="query_str"),
        Link("llm1", "response_synthesis_prompt", dest_key="pandas_instructions"),
        Link("pandas_output_parser", "response_synthesis_prompt", dest_key="pandas_output"),
    ]
)
qp.add_link("response_synthesis_prompt", "llm2")  # Final response link


# --- Test the Query Pipeline ---
if __name__ == "__main__":
    # Example query
    query = "How many rows are there?"
    try:
        response = qp.run({"query_str": query})
        print(f"Query: {query}\nResponse: {response}")
    except Exception as e:
        print(f"Error: {e}")

    # Another query example
    query = "What is the value of 'energia_diamagnetica' in row 10?"
    try:
        response = qp.run({"query_str": query})
        print(f"Query: {query}\nResponse: {response}")
    except Exception as e:
        print(f"Error: {e}")

  df = pd.read_csv(file_path, delimiter=";", encoding="latin1", on_bad_lines="skip")


AttributeError: 'function' object has no attribute 'sub_query_components'