# Data Science Exploration

## 1. Setup

In [None]:
from dotenv import load_dotenv
import os

# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# Databricks connection settings, read from the environment.
# Any variable that is not set comes back as None.
databricks_host, databricks_token, warehouse_id, catalog, schema = (
    os.environ.get(env_name)
    for env_name in (
        'DATABRICKS_HOST',
        'DATABRICKS_TOKEN',
        'WAREHOUSE_ID',
        'CATALOG',
        'SCHEMA',
    )
)

# Echo the configuration for a quick sanity check. The token value itself is
# never printed -- only whether one was found.
print(
    f"Databricks Host: {databricks_host}",
    f"Databricks Token is set: {'Yes' if databricks_token else 'No'}",
    f"Warehouse ID: {warehouse_id}",
    f"Catalog: {catalog}",
    f"Schema: {schema}",
    sep="\n",
)

In [None]:
from databricks.connect import DatabricksSession
import pandas as pd

# Databricks Connect session shared by every query in this notebook.
#
# `sdkConfig()` expects a single `databricks.sdk.core.Config` object, so the
# individual connection settings are passed through `remote()`, which accepts
# them as keyword arguments.
#
# NOTE(review): Databricks Connect attaches to a cluster (or serverless
# compute) rather than a SQL warehouse, so WAREHOUSE_ID presumably has to hold
# a cluster ID for this to connect -- confirm against the workspace setup.
spark = DatabricksSession.builder.remote(
    host=databricks_host,
    token=databricks_token,
    cluster_id=warehouse_id,
).getOrCreate()

# `remote()` has no catalog/schema options, so apply them as session defaults
# when they are configured; unqualified table names then resolve against them.
if catalog:
    spark.catalog.setCurrentCatalog(catalog)
if schema:
    spark.catalog.setCurrentDatabase(schema)

def execute_sql(query: str) -> pd.DataFrame:
    """
    Execute a SQL query through the notebook's Spark session.

    The query is run with ``spark.sql`` and the result is collected into
    pandas via ``toPandas()``. This helper is best-effort: any exception is
    caught and printed, and an empty DataFrame is returned instead, so
    callers that need to detect failure should check ``df.empty``.

    Args:
        query: The SQL query string to execute.

    Returns:
        A pandas DataFrame containing the query results, or an empty
        DataFrame if the query could not be executed.
    """
    preview_limit = 100
    try:
        # Only mark the preview as truncated when something was actually cut,
        # so short queries are logged verbatim.
        ellipsis = "..." if len(query) > preview_limit else ""
        print(f"Executing query: {query[:preview_limit]}{ellipsis}")
        df = spark.sql(query).toPandas()
    except Exception as e:
        # Deliberate catch-all: in an exploratory notebook a failed query
        # should report the problem without aborting the cell.
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error
    else:
        print("Query successful, returning DataFrame.")
        return df

# Example usage:
# my_data_df = execute_sql('SELECT * FROM my_table LIMIT 10')
# display(my_data_df)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'darkgrid' style to plots created after this point.
sns.set_style('darkgrid')

# Confirmation that the cell ran to completion.
print("Libraries imported.")

## 2. Load Data

Load data from the `metadata` or `sql` directory.

In [None]:
# Example: df = pd.read_csv('path/to/your/data.csv')

## 3. Exploratory Data Analysis (EDA)