## Import libraries

In [1]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
import pandas as pd
from typing import List, Dict
import json
import os
import clickhouse_connect

import mlflow

import requests

from dotenv import load_dotenv
load_dotenv()

# ClickHouse credentials and host come from the environment; load_dotenv()
# was called just above, so values from a local .env file are visible here.
# os.getenv yields None for any variable that is not set.
CH_USER, CH_PASS, CH_IP = (
    os.getenv(name) for name in ('CH_USER', 'CH_PASS', 'CH_IP')
)

# Base directory used to build the MLflow tracking location below.
root_path = "."

# ClickHouse client on port 8123, shared by the later cells.
client = clickhouse_connect.get_client(
    host=CH_IP,
    port=8123,
    username=CH_USER,
    password=CH_PASS,
)

# MLflow tracking location. Alternatives that were used before:
#   your_mlflow_tracking_uri = "http://127.0.0.1:5000"  # for local mlflow server
#   your_mlflow_tracking_uri = MLFLOW_TRACKING_URI      # for remote mlflow server
your_mlflow_tracking_uri = f'{root_path}/mlruns'  # for docker mlflow server
mlflow.set_tracking_uri(your_mlflow_tracking_uri)

## Spark Initialize

In [2]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf, SQLContext

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import Window



# ml
from pyspark.ml import Pipeline as spk_pipeline
from pyspark.ml.feature import OneHotEncoder as spk_OneHotEncoder, StandardScaler as spk_StandardScaler, VectorAssembler as spk_VectorAssembler
from pyspark.ml.feature import MinMaxScaler as spk_MinMaxScaler, StringIndexer as spk_StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator as spk_RegressionEvaluator

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param






import os
#https://repo1.maven.org/maven2/com/github/housepower/clickhouse-native-jdbc/2.7.1/clickhouse-native-jdbc-2.7.1.jar
# spark connector https://github.com/ClickHouse/spark-clickhouse-connector
# https://mvnrepository.com/artifact/com.clickhouse
# https://github.com/housepower/ClickHouse-Native-JDBC, For Spark 3.2 and upper, Spark ClickHouse Connector (see upper) is recommended.
# https://clickhouse.com/docs/en/integrations/apache-spark/spark-native-connector
# Maven coordinates handed to `spark.jars.packages` when the session is built.
# Alternatives that were tried earlier and are currently disabled:
#   com.github.housepower:clickhouse-spark-runtime-3.4_2.12:0.7.3
#   com.clickhouse:clickhouse-jdbc:0.6.0-patch5
#   com.clickhouse:clickhouse-http-client:0.6.0-patch5
#   com.github.housepower:clickhouse-native-jdbc:2.7.1  (for jdbc 2.7.1 required java 8/11)
packages = [
    "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0",
    "com.clickhouse:clickhouse-jdbc:0.7.1-patch1",
    "com.clickhouse:clickhouse-http-client:0.7.1-patch1",
    "org.apache.httpcomponents.client5:httpclient5:5.3.1",
    "ai.catboost:catboost-spark_3.5_2.12:1.2.7",
    "com.microsoft.azure:synapseml_2.12:1.0.8",
]

# Artifacts intended for `spark.jars.excludes`; the only statement that uses
# this list is a commented-out `spark.conf.set` further down in this cell.
exclude_packages = [
    "org.scala-lang:scala-reflect",
    "org.apache.spark:spark-tags_2.12",
    "org.scalactic:scalactic_2.12",
    "org.scalatest:scalatest_2.12",
    "com.fasterxml.jackson.core:jackson-databind",
]



# Memory budget in GB; the same value is applied to executor memory,
# executor memory overhead, driver memory and driver max result size.
ram = 60
cpu = 22*3  # not referenced by the session settings below

# Define the application name and setup session
appName = "Connect To ClickHouse via PySpark"

_memory = f"{ram}g"

# Settings are applied to the builder in this order.
_session_settings = {
    # jars resolved at session start
    "spark.jars.packages": ",".join(packages),
    # `clickhouse` catalog served by the Spark ClickHouse connector over HTTP
    "spark.sql.catalog.clickhouse": "com.clickhouse.spark.ClickHouseCatalog",
    "spark.sql.catalog.clickhouse.host": CH_IP,
    "spark.sql.catalog.clickhouse.protocol": "http",
    "spark.sql.catalog.clickhouse.http_port": "8123",
    "spark.sql.catalog.clickhouse.user": CH_USER,
    "spark.sql.catalog.clickhouse.password": CH_PASS,
    "spark.sql.catalog.clickhouse.database": "default",
    # memory sizing
    "spark.executor.memory": _memory,
    "spark.driver.maxResultSize": _memory,
    "spark.driver.memory": _memory,
    "spark.executor.memoryOverhead": _memory,
}

# Options that were tried and are currently disabled (kept for reference):
#   spark.spark.clickhouse.write.compression.codec = lz4
#   spark.clickhouse.read.compression.codec = lz4
#   spark.clickhouse.write.format = arrow
#   spark.clickhouse.write.distributed.convertLocal = true
#   spark.clickhouse.write.repartitionNum = 1
#   spark.clickhouse.write.maxRetry = 1000
#   spark.clickhouse.write.repartitionStrictly = true
#   spark.clickhouse.write.distributed.useClusterNodes = false
#   spark.clickhouse.write.batchSize = 1000000
#   spark.sql.catalog.clickhouse.socket_timeout = 600000000
#   spark.sql.catalog.clickhouse.connection_timeout = 600000000
#   spark.sql.catalog.clickhouse.query_timeout = 600000000
#   spark.clickhouse.options.socket_timeout = 600000000
#   spark.clickhouse.options.connection_timeout = 600000000
#   spark.clickhouse.options.query_timeout = 600000000
#   spark.executor.cores = 5
#   spark.sql.debug.maxToStringFields = 100000

_builder = SparkSession.builder.appName(appName)
for _key, _value in _session_settings.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()

# LightGBM set config https://microsoft.github.io/SynapseML/docs/Get%20Started/Install%20SynapseML/
# spark.conf.set("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
# spark.conf.set("spark.jars.excludes", ",".join(exclude_packages))
# spark.conf.set("spark.yarn.user.classpath.first", "true")
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

#SedonaRegistrator.registerAll(spark)
# spark.conf.set("spark.sql.catalog.clickhouse", "xenon.clickhouse.ClickHouseCatalog")
# spark.conf.set("spark.sql.catalog.clickhouse.host", "127.0.0.1")
# spark.conf.set("spark.sql.catalog.clickhouse.protocol", "http")
# spark.conf.set("spark.sql.catalog.clickhouse.http_port", "8123")
# spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
# spark.conf.set("spark.sql.catalog.clickhouse.password", "")
# spark.conf.set("spark.sql.catalog.clickhouse.database", "default")



# These estimator wrappers are imported only after the Spark session exists.
# NOTE(review): presumably their Python modules are provided by the
# catboost-spark / synapseml artifacts listed in `packages`, which is why the
# imports are not in the import block at the top of the cell — confirm.
from catboost_spark import CatBoostRegressor as CatBoostRegressor_spark
from synapse.ml.lightgbm import LightGBMRegressor as LightGBMRegressor_spark


# Switch the session to the `clickhouse` catalog configured on the builder.
# Later cells query `ycup` tables without a catalog prefix, so they presumably
# rely on this statement having run.
spark.sql("use clickhouse")

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.clickhouse.spark#clickhouse-spark-runtime-3.5_2.12 added as a dependency
com.clickhouse#clickhouse-jdbc added as a dependency
com.clickhouse#clickhouse-http-client added as a dependency
org.apache.httpcomponents.client5#httpclient5 added as a dependency
ai.catboost#catboost-spark_3.5_2.12 added as a dependency
com.microsoft.azure#synapseml_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7f9b1db0-fbc5-4f15-9a11-60c62ac96008;1.0
	confs: [default]
	found com.clickhouse.spark#clickhouse-spark-runtime-3.5_2.12;0.8.0 in central
	found com.clickhouse#clickhouse-jdbc;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-client;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-data;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-http-client;0.7.1-patch1 in central
	found org.apache.httpcomponents.core5#httpcore5-h2;5.2 in central
	

DataFrame[]

## Server-Side: FastAPI Application

In [None]:
# Application object; the route decorators below register their handlers on it.
app = FastAPI()

class DataFrameInput(BaseModel):
    """Request body for the ``/process-dataframe`` endpoint."""
    data: List[Dict]  # Expecting a list of dictionaries as input

@app.post("/process-dataframe")
async def process_dataframe(input_data: DataFrameInput):
    """Double ``column1`` of the posted records and return the result.

    Args:
        input_data: Request body whose ``data`` field is a list of row
            dictionaries. At least one row must carry a ``column1`` key.

    Returns:
        ``{"data": [...]}`` holding the input records plus a ``new_column``
        field equal to ``column1 * 2``. Missing values are returned as
        ``null``.

    Raises:
        HTTPException: 422 when no record has a ``column1`` key (this also
            covers an empty ``data`` list).
    """
    # Local import: the notebook's import cell only pulls in FastAPI and Request.
    from fastapi import HTTPException

    # Convert the JSON data into a Pandas DataFrame
    df = pd.DataFrame(input_data.data)

    # Without this check a missing column surfaces as a KeyError and the
    # client only sees an opaque HTTP 500.
    if "column1" not in df.columns:
        raise HTTPException(
            status_code=422,
            detail="Each record must contain a 'column1' field",
        )

    # Modify the DataFrame (example: add a new column)
    df["new_column"] = df["column1"] * 2

    # Records with differing keys produce NaN cells, and NaN cannot be encoded
    # in a JSON response; swap them for None (serialized as null).
    json_safe = df.astype(object).where(df.notna(), None)

    # Convert the modified DataFrame back to JSON
    response_data = json_safe.to_dict(orient="records")
    return {"data": response_data}


@app.get("/control")
async def get_control():
    """Return the first 10 rows of ``ycup.control``.

    Returns:
        ``{"data": [...]}`` where each element maps column name to value for
        one row (the same envelope ``/process-dataframe`` responds with).
    """
    control = client.query('''
        SELECT * 
        FROM ycup.control
        LIMIT 10'''
    )

    # The query result used to be discarded, so the endpoint always answered
    # with null. Pair every row with the column names and return it.
    columns = control.column_names
    return {"data": [dict(zip(columns, row)) for row in control.result_rows]}


## Client-Side: Sending and Receiving Data

### clickhouse connect

In [27]:
# # Execute the query to retrieve the list of tables
# tables_query = client.query('SHOW TABLES IN ycup')

# # Fetch the data from the result
# tables = tables_query.result_rows

# tables[0][0]

# # Execute the query
# query = f'SELECT * FROM ycup.{tables[0][0]} LIMIT 10'
# result = client.query(query)

# # Fetch the data and column names
# data = result.result_rows  # Retrieves the rows of the result
# columns = result.column_names  # Retrieves the column names

# pd.DataFrame(data, columns=columns)


### spark connect

In [4]:
db = 'ycup'
# list of tables in db
tables = spark.sql(f'SHOW TABLES in {db}').collect()

# Maps table name -> pandas DataFrame with a 10-row sample of that table
# (despite the name, this is a dict, not a list).
df_list = {}
for table in tables:
    table_name = table.tableName
    # Only 10 rows per table are converted to pandas.
    df = spark.sql(f'SELECT * FROM {db}.{table_name} limit 10').toPandas()
    df_list[table_name] = df
# NOTE: `df` stays bound to the sample of the last table after the loop, so
# any later cell that refers to a bare `df` sees that frame.

In [5]:
# Display the 10-row sample of the `control` table collected in the previous cell.
df_list['control']

Unnamed: 0,id,stamp_ns,acceleration_level,steering
0,0,36479492,-929,5.739836
1,0,76459951,-926,5.280618
2,0,116678417,-918,5.039505
3,0,156788958,-908,4.734873
4,0,196857808,-897,4.387096
5,0,236974997,-892,4.014573
6,0,276890721,-892,3.627408
7,0,316915752,-901,3.191926
8,0,356884436,-911,2.748362
9,0,396895329,-918,2.286391


In [15]:
# get data from clickhouse
# Note: in the recorded output, `client.command` hands the result back as one
# flat list of strings with newlines embedded between rows, not as a table.
control_preview = client.command('select * from ycup.control limit 10')
print(control_preview)


# control = client.query('''
#         SELECT * 
#         FROM ycup.control
#         LIMIT 10                        
# ''')


# pd.DataFrame(control, columns=['id', 'stamp_ns', 'acceleration_level', 'steering'])

['0', '36479492', '-929', '5.7398357\n0', '76459951', '-926', '5.2806177\n0', '116678417', '-918', '5.039505\n0', '156788958', '-908', '4.7348733\n0', '196857808', '-897', '4.387096\n0', '236974997', '-892', '4.014573\n0', '276890721', '-892', '3.6274078\n0', '316915752', '-901', '3.191926\n0', '356884436', '-911', '2.7483618\n0', '396895329', '-918', '2.2863915']


In [None]:
# Example DataFrame to send
# `client.command` returns the result as a flat list of strings (see the
# recorded output of the preview cell), so wrapping it in pd.DataFrame loses
# the column structure. `query_df` returns a DataFrame with named columns.
control = client.query_df('''
    SELECT * 
    FROM ycup.control                   
''')

localization = client.query_df('''
    SELECT * 
    FROM ycup.localization                   
''')

metadata = client.query_df('''
    SELECT * 
    FROM ycup.metadata                   
''')

# Convert DataFrame to JSON format
# The payload is built from `control`; it used to read `df`, a loop variable
# left over from the Spark sampling cell (whichever table came last).
# NOTE(review): the /process-dataframe handler reads a `column1` field, while
# the `control` sample shown earlier has the columns id, stamp_ns,
# acceleration_level and steering -- confirm which column should be sent as
# `column1`.
data_to_send = {"data": control.to_dict(orient="records")}

# Send the POST request
response = requests.post("http://127.0.0.1:8000/process-dataframe", json=data_to_send)

# Convert the response JSON back into a DataFrame
if response.status_code == 200:
    returned_data = response.json()["data"]
    result_df = pd.DataFrame(returned_data)
    print(result_df)
else:
    print("Error:", response.status_code, response.text)


## Running the Server


```bash
uvicorn your_script_name:app --reload
```
