In [5]:
import pandas as pd

df = pd.read_csv("energy_consumption_queries.csv")

df.to_excel("energy_temp.xlsx")

In [1]:
from agent.configuration import Configuration
from agent.utils import load_chat_model
import pandas as pd
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
model = load_chat_model("ollama/gemma3-v3")

In [3]:
from agent.state import QueryOutput

In [10]:
response=model.invoke("""You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.
You must output the postgres SQL query that answers the question

DATABASE SCHEMA:
CREATE TABLE smart_buildings.building ( cups TEXT PRIMARY KEY, name TEXT NOT NULL, address TEXT, type TEXT );
CREATE TABLE smart_buildings.energy_consumption_monthly_metrics ( cups TEXT NOT NULL, year_month DATE NOT NULL, total_consumption_kwh DOUBLE PRECISION, avg_daily_consumption_kwh DOUBLE PRECISION, total_consumption_prev_month_kwh DOUBLE PRECISION, diff_pct_consumption_prev_month DOUBLE PRECISION, std_daily_consumption_kwh DOUBLE PRECISION, ytd_consumption_kwh NUMERIC, ytd_prev_year_consumption_kwh NUMERIC, total_consumption_prev_year_same_month_kwh NUMERIC, date_insert TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (cups, year_month), FOREIGN KEY (cups) REFERENCES smart_buildings.building(cups) );
CREATE TABLE smart_buildings.energy_consumption_weekly_metrics ( cups TEXT NOT NULL, week_start DATE NOT NULL, total_consumption_kwh DOUBLE PRECISION, daily_consumption_kwh DOUBLE PRECISION, total_consumption_prev_week_kwh DOUBLE PRECISION, diff_pct_consumption_prev_week DOUBLE PRECISION, std_daily_consumption_kwh DOUBLE PRECISION, date_insert TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (cups, week_start), FOREIGN KEY (cups) REFERENCES smart_buildings.building(cups) );

Some example building names that might be useful for the query:
['Línea De Socorro Ceip Federico García Lorca', 'La Torre']

Building types:
['Administración', 'Educación', 'Comercio', 'Punto Limpio', 'Casal/Centro Cívico', 'Cultura y Ocio', 'Restauración', 'Salud y Servicios Sociales', 'Bienestar Social', 'Mercado', 'Parque', 'Industrial', 'Centros Deportivos', 'Parking', 'Policia', 'Cementerio', 'Protección Civil']

CURRENT DATE: 2025-05-21
                                                                                                                    
Return **only** the SQL query—no additional explanation or formatting.
             
¿Podrías darme un resumen del consumo anual del edificio 'Torre Alta'?
""")

In [11]:
print(response.content)

```sql
SELECT b.name AS building_name,
       EXTRACT(YEAR
               FROM e.year_month) AS YEAR,
       SUM(e.total_consumption_kwh) AS total_consumption_kwh,
       ROUND(AVG(e.total_consumption_kwh), 2) AS avg_monthly_consumption_kwh
FROM smart_buildings.building b
JOIN smart_buildings.energy_consumption_monthly_metrics e ON b.cups = e.cups
WHERE b.name = 'La Torre'
  AND EXTRACT(YEAR
              FROM e.year_month) = EXTRACT(YEAR
                                           FROM CURRENT_DATE)
GROUP BY b.name,
         EXTRACT(YEAR
                 FROM e.year_month);
```


In [17]:
print(response.query)

SELECT b.name AS building_name, 
 m.year_month, 
 m.total_consumption_kwh, 
 m.avg_daily_consumption_kwh, 
 m.total_consumption_prev_month_kwh, 
 m.diff_pct_consumption_prev_month, 
 m.std_daily_consumption_kwh, 
 m.ytd_consumption_kwh 
FROM smart_buildings.building b 
JOIN smart_buildings.energy_consumption_monthly_metrics m ON b.cups = m.cups 
WHERE b.name = 'Torres Norte' 
AND EXTRACT(YEAR  FROM m.year_month) = EXTRACT(YEAR FROM CURRENT_DATE) 
ORDER BY m.year_month DESC 
LIMIT 1;


In [12]:
import pandas as pd
import sqlparse

# Parámetros de entrada
excel_file = 'training.csv'

# Lee el Excel
df = pd.read_csv(excel_file,encoding='latin1',sep=";")

def pretty_sql(sql_text):
    if pd.isnull(sql_text):
        return sql_text
    return sqlparse.format(sql_text, reindent=True, keyword_case='upper')

# Aplica el formateo a la columna 'sql'
df['sql_parsed'] = df['sql'].apply(pretty_sql)
df.sql_parsed = df.sql_parsed.str.replace('"','')
df.sql_parsed = "```sql\n"+df.sql_parsed+"\n```"
# Guarda el resultado en un nuevo archivo
output_file = 'training_v2.csv'
df.to_csv(output_file, index=False)

print(f"Archivo guardado como {output_file}")


Archivo guardado como training_v2.csv


In [None]:
from agent.utils import execute_sql_query
configuration = Configuration()

engine = configuration
for index, row in df.iterrows():
    sql = row['sql_parsed']
    print(execute_sql_query(sql,"smart_buildings",engine))

```sql
SELECT b.name AS building_name,
       EXTRACT(YEAR
               FROM e.year_month) AS YEAR,
       SUM(e.total_consumption_kwh) AS total_consumption_kwh_year,
       ROUND(AVG(e.total_consumption_kwh)::numeric, 2) AS avg_monthly_consumption_kwh
FROM smart_buildings.building b
JOIN smart_buildings.energy_consumption_monthly_metrics e ON b.cups = e.cups
WHERE b.name = 'Torre Blanca'
  AND EXTRACT(YEAR
              FROM e.year_month) = EXTRACT(YEAR
                                           FROM CURRENT_DATE)
GROUP BY b.name,
         EXTRACT(YEAR
                 FROM e.year_month);
```


In [39]:
import os
import pandas as pd
import openai
from copy import deepcopy
from openai import OpenAI
# Configura tu API Key
openai.api_key = os.getenv("OPENAI_API_KEY")  # o pon aquí tu clave directamente: openai.api_key = "tu_api_key"

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def paraphrase_question(question, n=3):
    """
    Usa la API de OpenAI para generar n parafraseos de una pregunta dada.
    """
    prompt = f"Parafrasea la siguiente pregunta de forma natural y diferente, manteniendo el mismo sentido:\n\nPregunta: \"{question}\"\n\nGenera {n} preguntas parafraseadas, cada una en una línea separada."
    
    try:
        response = client.responses.create(
            model="gpt-4.1-nano",
            instructions = "Eres un experto en reformulación de preguntas.",
            input = prompt,
        )
        text = response.output_text
        # Asumimos que el modelo devuelve las preguntas parafraseadas una por línea
        paraphrases = [line.strip("- ").strip() for line in text.split("\n") if line.strip()]
        # En caso de que genere más o menos, tomamos solo n
        return paraphrases[:n]
    except Exception as e:
        print(f"Error al parafrasear: {e}")
        return []

def expand_questions(df, question_col='question', paraphrases_per_question=3):
    """
    Dado un DataFrame, genera nuevas filas con preguntas parafraseadas y mantiene las otras columnas iguales.
    """
    new_rows = []
    for _, row in df.iterrows():
        original_question = row[question_col]
        paraphrases = paraphrase_question(original_question, n=paraphrases_per_question)
        
        # Agrega la pregunta original
        new_rows.append(row.to_dict())
        
        # Agrega las parafraseadas, clonando el resto de columnas
        for pq in paraphrases:
            new_row = deepcopy(row.to_dict())
            new_row[question_col] = pq
            new_rows.append(new_row)
    
    return pd.DataFrame(new_rows)

def main():
    input_csv = "new_questions.csv"  # Cambia aquí el nombre de tu archivo
    output_csv = "new_add_questions.csv"
    
    print("Leyendo CSV...")
    df = pd.read_csv(input_csv,encoding="utf-8",sep=";")
    
    print("Generando parafraseos...")
    df_expanded = expand_questions(df, question_col='question', paraphrases_per_question=3)
    
    print(f"Guardando resultado en {output_csv} ...")
    df_expanded.to_csv(output_csv, index=False)
    print("Proceso completado.")

if __name__ == "__main__":
    main()


Leyendo CSV...
Generando parafraseos...
Guardando resultado en new_add_questions.csv ...
Proceso completado.


In [5]:
for i in range(0, len(df), 10):
    batch = df.iloc[i:i+10]
    print(batch.sql_parsed)
    break

0    ```sql\nSELECT b.name AS building_name,\n     ...
1    ```sql\nSELECT b.name AS building_name,\n     ...
2    ```sql\nSELECT b.name AS building_name,\n     ...
3    ```sql\nSELECT b.name AS building_name,\n     ...
4    ```sql\nSELECT b.name AS building_name,\n     ...
5    ```sql\nSELECT b.name AS building_name,\n     ...
6    ```sql\nSELECT b.name AS building_name,\n     ...
7    ```sql\nSELECT b.name AS building_name,\n     ...
8    ```sql\nSELECT b.name AS building_name,\n     ...
9    ```sql\nSELECT b.name AS building_name,\n     ...
Name: sql_parsed, dtype: object


In [15]:
import os
import pandas as pd
import openai
from copy import deepcopy
from openai import OpenAI
# Configura tu API Key
openai.api_key = os.getenv("OPENAI_API_KEY")  # o pon aquí tu clave directamente: openai.api_key = "tu_api_key"

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def review_question_sql(batch):
    """
    Usa la API de OpenAI para generar n parafraseos de una pregunta dada.
    """
    questions=''
    for idx, row in batch.iterrows():
        questions+=(
            f"{idx+1}. User Question: {row['question']}\n   SQL Query: {row['sql_parsed']}\n"
        )
    prompt = f"""{questions}

Instrucciones:

Valida si la consulta SQL responde correctamente y completamente a la pregunta del usuario.

Indica cualquier error o mejora, tanto en la lógica de la consulta como en la forma en que se extraen los datos.

Señala si faltan columnas relevantes, si se consulta información innecesaria, o si hay errores de sintaxis o incompatibilidad con el esquema.

Si la consulta es correcta y eficiente, responde simplemente: "Correcta".

Si hay problemas, señala exactamente cuáles, justifica tu respuesta y sugiere una versión mejorada del SQL.

Devuelve una lista de evaluaciones en el mismo orden. Si una consulta es correcta, solo pon "Correcta". Si no, da feedback concreto y una versión corregida.

Formato respuesta:
1. Correcta.
2. Correcta.
3. Incorrecta, razon de por que es incorrecta (si es que es incorrecta).
...

"""
    
    try:
        response = client.responses.create(
            model="gpt-4.1-nano",
            instructions = """Eres un experto en bases de datos y procesamiento de lenguaje natural. Tu tarea es revisar la correspondencia entre una pregunta de usuario (user question) y una consulta SQL (sql query) generada automáticamente, asegurándote de lo siguiente:

Corrección sintáctica: La consulta SQL debe estar correctamente escrita y sin errores de sintaxis.

Ajuste al esquema: Debe utilizar únicamente las tablas y columnas que existen en el esquema proporcionado, siguiendo sus restricciones y relaciones.

Relevancia: La consulta SQL debe extraer toda la información necesaria (y sólo la necesaria) para poder responder con precisión a la pregunta del usuario.

Eficiencia: Cuando sea posible, la consulta debe evitar operaciones innecesarias o costosas (por ejemplo, evitar SELECT * si no es necesario, evitar subconsultas redundantes, etc.).

Esquema:

CREATE TABLE smart_buildings.building (
    cups TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    address TEXT,
    type TEXT
);

CREATE TABLE smart_buildings.energy_consumption_monthly_metrics (
    cups TEXT NOT NULL,
    year_month DATE NOT NULL,
    total_consumption_kwh DOUBLE PRECISION,
    avg_daily_consumption_kwh DOUBLE PRECISION,
    total_consumption_prev_month_kwh DOUBLE PRECISION,
    diff_pct_consumption_prev_month DOUBLE PRECISION,
    std_daily_consumption_kwh DOUBLE PRECISION,
    ytd_consumption_kwh NUMERIC,
    ytd_prev_year_consumption_kwh NUMERIC,
    total_consumption_prev_year_same_month_kwh NUMERIC,
    date_insert TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (cups, year_month),
    FOREIGN KEY (cups) REFERENCES smart_buildings.building(cups)
);

CREATE TABLE smart_buildings.energy_consumption_weekly_metrics (
    cups TEXT NOT NULL,
    week_start DATE NOT NULL,
    total_consumption_kwh DOUBLE PRECISION,
    daily_consumption_kwh DOUBLE PRECISION,
    total_consumption_prev_week_kwh DOUBLE PRECISION,
    diff_pct_consumption_prev_week DOUBLE PRECISION,
    std_daily_consumption_kwh DOUBLE PRECISION,
    date_insert TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (cups, week_start),
    FOREIGN KEY (cups) REFERENCES smart_buildings.building(cups)
);""",
            input = prompt,
        )
        text = response.output_text
        # Asumimos que el modelo devuelve las preguntas parafraseadas una por línea
        # En caso de que genere más o menos, tomamos solo n
        return text
    except Exception as e:
        print(f"Error al parafrasear: {e}")
        return ''

results = []
for i in range(0, len(df), 10):
    response = review_question_sql(batch)

    # Divide la respuesta en líneas
    evaluations = response.strip().split('\n')
    # Si hay líneas en blanco, filtra
    evaluations = [line.strip() for line in evaluations if line.strip()]
    # Asume que el orden se mantiene
    results.extend(evaluations)

# Agrega los resultados al dataframe
df['evaluation'] = results
df.to_csv('questions_sql_evaluated.csv', index=False)



ValueError: Length of values (830) does not match length of index (827)

In [18]:
print(results)

['1. Correcta.', '2. Correcta.', '3. Correcta.', '4. Correcta.', '5. Correcta.', '6. Correcta.', '7. Correcta.', '8. Correcta.', '9. Correcta.', '10. Correcta.', '1. Correcta.', '2. Correcta.', '3. Correcta.', '4. Correcta.', '5. Correcta.', '6. Correcta.', '7. Correcta.', '8. Correcta.', '9. Correcta.', '10. Correcta.', '1. Correcta.', '2. Correcta.', '3. Correcta.', '4. Correcta.', '5. Correcta.', '6. Correcta.', '7. Correcta.', '8. Correcta.', '9. Correcta.', '10. Correcta.', '1. Correcta.', '2. Correcta.', '3. Correcta.', '4. Correcta.', '5. Correcta.', '6. Correcta.', '7. Correcta.', '8. Correcta.', '9. Correcta.', '10. Correcta.', '1. Correcta.', '2. Correcta.', '3. Correcta.', '4. Correcta.', '5. Correcta.', '6. Correcta.', '7. Correcta.', '8. Correcta.', '9. Correcta.', '10. Correcta.', '1. Correcta.', '2. Correcta.', '3. Correcta.', '4. Correcta.', '5. Correcta.', '6. Correcta.', '7. Correcta.', '8. Correcta.', '9. Correcta.', '10. Correcta.', '1. Correcta.', '2. Correcta.', '

In [None]:
import pandas as pd
from openai import OpenAI
import time

# Initialize OpenAI client
client = OpenAI()

# Example DataFrame setup
# Replace this with your actual DataFrame loading step

def review_sql(question: str, sql_query: str,schema:str,column_description:str,current_month:str) -> str:
    """
    Sends the question and sql_query to the LLM to get correctness review.
    Returns the model's response as a string.
    """
    prompt = f"""
You are an expert SQL reviewer.

Postgres SQL Schema:
{schema}

Column Descriptions:
{column_description}

Current Month: {current_month}

Question: {question}
SQL Query: {sql_query}

- Check if the SQL syntax is correct.
- Evaluate whether the query is efficient and optimized (e.g., uses appropriate filters, joins, and avoids unnecessary operations).
- Determine if the query accurately answers the user's question.
- Identify any missing conditions, incorrect table or column usage, or logic errors.

Return a JSON with the fields:
- correct: true or false
- comments: brief explanation of any issues or why it is correct.

Example output:
{{"correct": true, "comments": "The query correctly counts all users."}}

"""

    try:
        response = client.responses.create(
            model="gpt-4.1",
            input=prompt.strip()
        )
        # Extract the textual response
        return response.output_text
    except Exception as e:
        return f"Error: {str(e)}"

# Apply the review to each row and append results
reviews = []
for idx, row in df.iterrows():
    review = review_sql(row["question"], row["sql_query"],row["schema"],row["column_description"],row["current_month"]
    reviews.append(review)
    # Optional: To avoid rate limits
    time.sleep(1)

df["review"] = reviews

# Show the DataFrame with appended reviews
print(df)
