In [15]:
import pandas as pd
import numpy as np
import datetime
from faker import Faker
import uuid

# Row counts for each generated table
NUM_USERS = 1_000
NUM_QUESTIONS = 1_000
NUM_ANSWERS = 1_000
NUM_SESSIONS = 1_000

# Pin both random sources (NumPy's global generator and Faker's shared one)
# before the Faker instance is created, so reruns draw the same values.
Faker.seed(42)
np.random.seed(42)
fake = Faker()

In [6]:
def name_to_email(name):
    """Build an ``example.com`` address from a person's display name.

    The name is lowercased, apostrophes and periods are removed, and the
    remaining whitespace-separated words are joined with dots to form the
    local part of the address.

    Args:
        name: Display name, e.g. ``"Mary O'Neil"``.

    Returns:
        The address as a string, e.g. ``"mary.oneil@example.com"``.
    """
    cleaned = name.lower()
    for unwanted in ("'", "."):
        cleaned = cleaned.replace(unwanted, "")
    local_part = ".".join(cleaned.split())
    return local_part + "@example.com"

# Sequential primary keys 1..NUM_USERS
user_ids = np.arange(1, NUM_USERS + 1)

# Draw all names first, then derive one e-mail address from each name
user_names = [fake.name() for _ in range(NUM_USERS)]
user_emails = list(map(name_to_email, user_names))

# Birthdays for ages between 18 and 65.
# `today` is captured here but nothing in this cell reads it.
today = datetime.date.today()
birthdays = [
    fake.date_of_birth(minimum_age=18, maximum_age=65)
    for _ in range(NUM_USERS)
]

# Assemble the users table and write it out without the index column
users_df = pd.DataFrame(dict(
    id=user_ids,
    name=user_names,
    email=user_emails,
    birthday=birthdays,
))

users_df.to_csv('../Mock_Data/mock_users.csv', index=False)

In [9]:
# Define the time range for question creation as UTC epoch seconds.
# The bounds are timezone-aware on purpose: .timestamp() on a naive datetime is
# interpreted in the machine's local timezone, whereas pd.to_datetime(..., unit='s')
# below reads epoch seconds as UTC. Naive bounds would shift the whole window by
# the local UTC offset (spilling outside 2024) and make the seeded output differ
# from machine to machine.
q_start_ts = int(datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc).timestamp())
q_end_ts = int(datetime.datetime(2024, 12, 31, 23, 59, 59, tzinfo=datetime.timezone.utc).timestamp())

question_ids = np.arange(1, NUM_QUESTIONS + 1)
question_user_ids = np.random.randint(1, NUM_USERS + 1, size=NUM_QUESTIONS)

# Random timestamps (in seconds since epoch); randint's upper bound is exclusive,
# hence the +1 to make q_end_ts itself reachable
random_q_ts = np.random.randint(q_start_ts, q_end_ts + 1, size=NUM_QUESTIONS)
question_created_at = pd.to_datetime(random_q_ts, unit='s')

# Generate fake question content (short but realistic).
# fake.sentence() already terminates the sentence with a period, so strip it
# before appending the question mark (otherwise the text ends in ".?").
question_texts = [
    fake.sentence(nb_words=np.random.randint(5, 12)).rstrip(".") + "?"
    for _ in range(NUM_QUESTIONS)
]

questions_df = pd.DataFrame({
    'question_id': question_ids,
    'user_id': question_user_ids,
    'created_at': question_created_at,
    'content': question_texts
})

# Save to CSV
questions_df.to_csv('../Mock_Data/mock_questions.csv', index=False)

In [12]:
# Assign every answer to a question.
# The first answers map one-to-one onto the questions (answer_id i -> question_id i),
# so each question_id appears at least once whenever NUM_ANSWERS >= NUM_QUESTIONS.
# Any surplus answers are attached to randomly chosen questions; if there are
# fewer answers than questions, only the first NUM_ANSWERS questions get one.
# This keeps every column NUM_ANSWERS long even when the two counts differ.
answer_ids = np.arange(1, NUM_ANSWERS + 1)
answer_question_ids = question_ids[:min(NUM_ANSWERS, NUM_QUESTIONS)].copy()
if NUM_ANSWERS > NUM_QUESTIONS:
    extra_question_ids = np.random.randint(
        1, NUM_QUESTIONS + 1, size=NUM_ANSWERS - NUM_QUESTIONS
    )
    answer_question_ids = np.concatenate([answer_question_ids, extra_question_ids])
answer_user_ids = np.random.randint(1, NUM_USERS + 1, size=NUM_ANSWERS)

# Random timestamps for answers.
# Each answer is drawn between its own question's creation time and the end of
# the range, so an answer is never dated before the question it replies to.
# question_ids run 1..NUM_QUESTIONS, so id - 1 is the position in random_q_ts.
question_ts_for_answers = random_q_ts[answer_question_ids - 1]
random_a_ts = np.random.randint(question_ts_for_answers, q_end_ts + 1)
answer_created_at = pd.to_datetime(random_a_ts, unit='s')

# Generate answer content: 1-2 sentences each (randint's upper bound is exclusive)
answer_texts = [fake.paragraph(nb_sentences=np.random.randint(1, 3)) for _ in range(NUM_ANSWERS)]

answers_df = pd.DataFrame({
    'id': answer_ids,
    'question_id': answer_question_ids,
    'user_id': answer_user_ids,
    'created_at': answer_created_at,
    'content': answer_texts
})

# Save to CSV
answers_df.to_csv('../Mock_Data/mock_answers.csv', index=False)

In [17]:
# Calendar window (inclusive on both ends) in which sessions may occur
sess_start_date = datetime.date(2024, 1, 1)
sess_end_date = datetime.date(2024, 12, 31)
days_range = (sess_end_date - sess_start_date).days

# Use UUIDv4-formatted strings for session_id.
# uuid.uuid4() reads OS entropy and ignores the seeds set for this notebook, so
# the ids are instead built from 16 bytes of NumPy's seeded global stream;
# version=4 makes uuid.UUID stamp the version/variant bits of a random UUID.
session_ids = [
    str(uuid.UUID(bytes=np.random.bytes(16), version=4))
    for _ in range(NUM_SESSIONS)
]
session_user_ids = np.random.randint(1, NUM_USERS + 1, size=NUM_SESSIONS)

# Generate random session dates: a day offset in [0, days_range] from the start
random_days = np.random.randint(0, days_range + 1, size=NUM_SESSIONS)
session_dates = [sess_start_date + datetime.timedelta(days=int(day)) for day in random_days]

# Generate session durations, rounded to two decimal places; the
# duration_min column name indicates the unit is minutes
durations = np.round(np.random.uniform(1, 120, size=NUM_SESSIONS), 2)

user_sessions_df = pd.DataFrame({
    'session_id': session_ids,
    'user_id': session_user_ids,
    'duration_min': durations,
    'session_date': session_dates
})

user_sessions_df.to_csv('../Mock_Data/mock_user_sessions.csv', index=False)

In [1]:
from src.agents.data_analysis_agent.subagents.data_availability_checker_agent.agent import read_all_schema_texts

# Sanity check: take the first 500 characters of the combined schema text and
# print their repr, so newlines show up as escape sequences.
schema_preview = read_all_schema_texts()[:500]
print(repr(schema_preview))

/Users/raj.vasani/Developer/ADK-Hackathon/configs
<map object at 0x11b11f5b0>
/Users/raj.vasani/Developer/ADK-Hackathon
'Table: mock_user_sessions\nDescription: Tracks user session activity such as session duration and date. Useful for engagement analysis and retention metrics.\nColumns:\n- session_id: Unique identifier for a user session (STRING)\n- user_id: ID of the user who had the session (INT64)\n- duration_min: Length of the session in minutes (FLOAT64)\n- session_date: Date on which the session occurred (DATE)\n\nTable: mock_answers\nDescription: Contains user-submitted answers to questions on the QA platform. Each'
