### Chat with Data using Agents
Using SQLite/CSV files

In [4]:
import os
import io
import json
from dotenv import load_dotenv
# from langchain.agents import AgentType, initialize_agent, load_tools
from langchain_core.prompts import ChatPromptTemplate, FewShotPromptTemplate, PromptTemplate
from langchain_community.llms import Ollama
from langchain_groq import ChatGroq
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.prompts import \
			SYNTHETIC_FEW_SHOT_PREFIX, SYNTHETIC_FEW_SHOT_SUFFIX
import pandas as pd
from common_functions import ensure_llama_running, datasets_dir, RANDOM_STATE

# Project helper from common_functions; presumably verifies/starts a local
# Llama (Ollama) service -- TODO confirm. Runs before the .env file is loaded.
ensure_llama_running()
load_dotenv()

# Subject of the synthetic dataset and the CSV file it is cached in.
topic = 'sales'
data_save_path = os.path.join(datasets_dir, f'{topic}_synthetic_data.csv')

# Active backend: Groq-hosted chat model.
# Local alternative kept for reference:
#   llm_model = os.getenv('LLM_MODEL')
#   llm = Ollama(model=llm_model)
llm = ChatGroq(model_name='llama3-70b-8192')

In [15]:
# Columns requested from the LLM and how many rows it should produce.
fields = ['product_id', 'sales', 'date']
rows = 5
# Single few-shot example showing the expected shape of one record.
examples = [
    {
        "example": """product_id: 1, sales: 100, date: 2020-01-01""",
    },
]


def _extract_fenced_text(text):
    """Return the content of the ``` fenced block found in *text*.

    The span runs from the first fence to the last one. Compared with a raw
    ``find``/``rfind`` slice this also copes with imperfect replies:

    * no fence at all       -> the whole text is returned (not a mangled slice)
    * only an opening fence -> everything after it is returned
    * a language tag on the opening fence line (```csv) -> the tag is dropped
      so it is not mistaken for the CSV header

    Args:
        text: Raw string returned by the model.

    Returns:
        The extracted text with surrounding whitespace stripped.
    """
    _, opening, after_open = text.partition('```')
    if not opening:
        return text.strip()
    inner, closing, _ = after_open.rpartition('```')
    if not closing:
        inner = after_open
    first_line, newline, remainder = inner.partition('\n')
    # Only well-known tags are removed so a genuine header line is never lost.
    if newline and first_line.strip().lower() in ('csv', 'text', 'plaintext'):
        inner = remainder
    return inner.strip()


OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")
prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    # The prefix/suffix templates are filled with `subject` and `extra`
    # (the arguments passed to generate() below), not with the CSV columns.
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

generator = SyntheticDataGenerator(llm=llm, template=prompt_template)
data = generator.generate(
    subject=topic,
    runs=1,
    extra=f"include columns in csv response in ```: {', '.join(fields)} with row count: {rows}",
)
# One run was requested, so the first item is the model's reply; keep only
# the CSV text inside the ``` fence.
response = _extract_fenced_text(data[0])
print(response)


# Best effort: parse the extracted CSV text and cache it on disk. Any failure
# while parsing or writing is reported and the notebook carries on.
try:
    csv_buffer = io.StringIO(response)
    df = pd.read_csv(csv_buffer)
    df.to_csv(data_save_path, index=False)
except Exception as err:
    print("No data generated")
    print(err)

def generate_synthetic_data(llm, sample_df, topic, rows=1000):
    """Generate synthetic tabular data about *topic* using an LLM.

    This is still a placeholder. It raises instead of silently returning
    ``None`` so that callers fail with a clear message rather than with an
    unrelated ``TypeError`` when they start using the result.

    Args:
        llm: Language model meant to produce the data (currently unused).
        sample_df: Example rows for the model to imitate (currently unused);
            presumably a pandas DataFrame -- TODO confirm once implemented.
        topic: Subject of the dataset (currently unused).
        rows: Number of rows to generate (currently unused).

    Raises:
        NotImplementedError: Always, until the generation logic is written.
    """
    raise NotImplementedError(
        "generate_synthetic_data is not implemented yet"
    )


product_id,sales,date
1,120,2020-01-05
2,80,2020-01-10
3,150,2020-01-15
4,90,2020-01-20
5,110,2020-01-25


In [None]:
# Use the cached CSV when it exists; otherwise build a seed frame, hand it to
# generate_synthetic_data and cache whatever comes back.
if not os.path.exists(data_save_path):
    # 15 seed rows: product ids 1-5 cycled three times, each with a sales value.
    sample_df = pd.DataFrame({
        'product_id': [1, 2, 3, 4, 5] * 3,
        'sales': [
            100, 200, 300, 400, 500,
            600, 700, 600, 900, 500,
            300, 200, 300, 400, 500,
        ],
    })
    # Stack ten copies of the seed rows under a fresh 0..n-1 index.
    sample_df = pd.concat([sample_df for _ in range(10)], ignore_index=True)
    # Add a 'date' column of evenly spaced, ascending timestamps, one per row.
    sample_df['date'] = pd.date_range(
        start='2020-01-01',
        end='2024-05-05',
        periods=len(sample_df),
    )
    rows = 100
    df = generate_synthetic_data(llm, sample_df, topic, rows=rows)
    # Drop the time-of-day part, then order chronologically and renumber.
    df['date'] = df['date'].dt.date
    df = df.sort_values(by='date')
    df = df.reset_index(drop=True)
    df.to_csv(data_save_path, index=False)
else:
    df = pd.read_csv(data_save_path)

df.head()