## Import libraries

In [22]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
import pandas as pd
from typing import List, Dict
import json
import os
import clickhouse_connect

import sklearn

sklearn.set_config(transform_output='pandas')

import mlflow

import requests

from dotenv import load_dotenv
load_dotenv()

CH_USER = os.getenv('CH_USER')
CH_PASS = os.getenv('CH_PASS')
CH_IP = os.getenv('CH_IP')

from api.union_dfs import union_dfs
from api.df_preprocessor import df_preprocessor

root_path = "./api"
preprocessor_path = f"{root_path}/preprocessor"

client = clickhouse_connect.get_client(host=CH_IP, port=8123, username=CH_USER, password=CH_PASS)

your_mlflow_tracking_uri = f'{root_path}/mlruns' # for docker mlflow server
# your_mlflow_tracking_uri = "http://127.0.0.1:5000" # for local mlflow server
# your_mlflow_tracking_uri = MLFLOW_TRACKING_URI # for remote mlflow server
mlflow.set_tracking_uri(your_mlflow_tracking_uri)

## Spark Initialize

In [2]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf, SQLContext

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import Window



# ml
from pyspark.ml import Pipeline as spk_pipeline
from pyspark.ml.feature import OneHotEncoder as spk_OneHotEncoder, StandardScaler as spk_StandardScaler, VectorAssembler as spk_VectorAssembler
from pyspark.ml.feature import MinMaxScaler as spk_MinMaxScaler, StringIndexer as spk_StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator as spk_RegressionEvaluator

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param






import os
#https://repo1.maven.org/maven2/com/github/housepower/clickhouse-native-jdbc/2.7.1/clickhouse-native-jdbc-2.7.1.jar
# spark connector https://github.com/ClickHouse/spark-clickhouse-connector
# https://mvnrepository.com/artifact/com.clickhouse
# https://github.com/housepower/ClickHouse-Native-JDBC, For Spark 3.2 and upper, Spark ClickHouse Connector (see upper) is recommended.
# https://clickhouse.com/docs/en/integrations/apache-spark/spark-native-connector
packages = [
    "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0"
    # "com.github.housepower:clickhouse-spark-runtime-3.4_2.12:0.7.3"
    ,"com.clickhouse:clickhouse-jdbc:0.7.1-patch1"
    # ,"com.clickhouse:clickhouse-jdbc:0.6.0-patch5"
    ,"com.clickhouse:clickhouse-http-client:0.7.1-patch1"
    # ,"com.clickhouse:clickhouse-http-client:0.6.0-patch5"
    ,"org.apache.httpcomponents.client5:httpclient5:5.3.1"
    # for jdbc 2.7.1 required java 8/11
    # ,"com.github.housepower:clickhouse-native-jdbc:2.7.1"
    ,"ai.catboost:catboost-spark_3.5_2.12:1.2.7"
    ,"com.microsoft.azure:synapseml_2.12:1.0.8"

]

exclude_packages = [
    "org.scala-lang:scala-reflect"
    ,"org.apache.spark:spark-tags_2.12"
    ,"org.scalactic:scalactic_2.12"
    ,"org.scalatest:scalatest_2.12"
    ,"com.fasterxml.jackson.core:jackson-databind"
]



ram = 60
cpu = 22*3
# Define the application name and setup session
appName = "Connect To ClickHouse via PySpark"
spark = (SparkSession.builder
         .appName(appName)
         .config("spark.jars.packages", ",".join(packages))
         .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
         .config("spark.sql.catalog.clickhouse.host", CH_IP)
         .config("spark.sql.catalog.clickhouse.protocol", "http")
         .config("spark.sql.catalog.clickhouse.http_port", "8123")
         .config("spark.sql.catalog.clickhouse.user", CH_USER)
         .config("spark.sql.catalog.clickhouse.password", CH_PASS)
         .config("spark.sql.catalog.clickhouse.database", "default")
        #  .config("spark.spark.clickhouse.write.compression.codec", "lz4")
        #  .config("spark.clickhouse.read.compression.codec", "lz4")
        #  .config("spark.clickhouse.write.format", "arrow")
         #    .config("spark.clickhouse.write.distributed.convertLocal", "true") l
         #    .config("spark.clickhouse.write.repartitionNum", "1") 
         #.config("spark.clickhouse.write.maxRetry", "1000")
         #    .config("spark.clickhouse.write.repartitionStrictly", "true") 
         #    .config("spark.clickhouse.write.distributed.useClusterNodes", "false") 
        #  .config("spark.clickhouse.write.batchSize", "1000000")
         #.config("spark.sql.catalog.clickhouse.socket_timeout", "600000000")
        #  .config("spark.sql.catalog.clickhouse.connection_timeout", "600000000")
        #  .config("spark.sql.catalog.clickhouse.query_timeout", "600000000")
        #  .config("spark.clickhouse.options.socket_timeout", "600000000")
        #  .config("spark.clickhouse.options.connection_timeout", "600000000")
        #  .config("spark.clickhouse.options.query_timeout", "600000000")         
         .config("spark.executor.memory", f"{ram}g")
        #  .config("spark.executor.cores", "5")
         .config("spark.driver.maxResultSize", f"{ram}g")
         .config("spark.driver.memory", f"{ram}g")
         .config("spark.executor.memoryOverhead", f"{ram}g")
        #  .config("spark.sql.debug.maxToStringFields", "100000")
         .getOrCreate()
         )

# LightGBM set config https://microsoft.github.io/SynapseML/docs/Get%20Started/Install%20SynapseML/
# spark.conf.set("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
# spark.conf.set("spark.jars.excludes", ",".join(exclude_packages))
# spark.conf.set("spark.yarn.user.classpath.first", "true")
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

#SedonaRegistrator.registerAll(spark)
# spark.conf.set("spark.sql.catalog.clickhouse", "xenon.clickhouse.ClickHouseCatalog")
# spark.conf.set("spark.sql.catalog.clickhouse.host", "127.0.0.1")
# spark.conf.set("spark.sql.catalog.clickhouse.protocol", "http")
# spark.conf.set("spark.sql.catalog.clickhouse.http_port", "8123")
# spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
# spark.conf.set("spark.sql.catalog.clickhouse.password", "")
# spark.conf.set("spark.sql.catalog.clickhouse.database", "default")



from catboost_spark import CatBoostRegressor as CatBoostRegressor_spark
from synapse.ml.lightgbm import LightGBMRegressor as LightGBMRegressor_spark


spark.sql("use clickhouse")

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.clickhouse.spark#clickhouse-spark-runtime-3.5_2.12 added as a dependency
com.clickhouse#clickhouse-jdbc added as a dependency
com.clickhouse#clickhouse-http-client added as a dependency
org.apache.httpcomponents.client5#httpclient5 added as a dependency
ai.catboost#catboost-spark_3.5_2.12 added as a dependency
com.microsoft.azure#synapseml_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-fc57db16-8e73-4932-a58a-c86243948a28;1.0
	confs: [default]
	found com.clickhouse.spark#clickhouse-spark-runtime-3.5_2.12;0.8.0 in central
	found com.clickhouse#clickhouse-jdbc;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-client;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-data;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-http-client;0.7.1-patch1 in central
	found org.apache.httpcomponents.core5#httpcore5-h2;5.2 in central
	

DataFrame[]

## Server-Side: FastAPI Application

In [None]:
app = FastAPI()

class DataFrameInput(BaseModel):
    data: List[Dict]  # Expecting a list of dictionaries as input

@app.post("/process-dataframe")
async def process_dataframe(input_data: DataFrameInput):
    # Convert the JSON data into a Pandas DataFrame
    df = pd.DataFrame(input_data.data)
    
    # Modify the DataFrame (example: add a new column)
    df["new_column"] = df["column1"] * 2  # Assuming 'column1' exists in the input
    
    # Convert the modified DataFrame back to JSON
    response_data = df.to_dict(orient="records")
    return {"data": response_data}






@app.get("/control")
async def get_control(id: int):
    control = client.query_df(f'''
        select * 
        from ycup.control yc
        where yc.id = {id}
        limit 10'''
    )


### clickhouse connect

In [11]:
client.query_df('SHOW TABLES IN ycup')

Unnamed: 0,name
0,control
1,localization
2,metadata


In [26]:

def set_cols_for_model(test: pd.DataFrame, target: str=None, targets: list=None) -> list:
    '''Set num and cat columns for model'''
    
    cols_checked = test.columns

    # target in [ ] because yaw hase more then one letter
    not_target = list(set(targets) - set([target]))


    # Set num columns
    control_cols = ['ctrl_stamp_ns', 'acceleration_level', 'steering']
    shift_cols = [col for col in cols_checked if '_shift' in col]
    tmp = [col for col in shift_cols for nt in not_target if f'{nt}_' in col]
    shift_cols = list(set(shift_cols) - set(tmp))

    last_10_cols = [col for col in cols_checked if 'last_10' in col]
    tmp = [col for col in last_10_cols for nt in not_target if f'{nt}_' in col]
    last_10_cols = list(set(last_10_cols) - set(tmp))

    num_cols = control_cols + shift_cols + last_10_cols

    # Set categorical columns
    cols_temp = [col for col in cols_checked if col in control_cols or 'last' in col or 'shift' in col or 'diff' in col]
    cat_cols = list(set(cols_checked) - set(cols_temp) - set(targets))


    return cat_cols, num_cols



# catboost encoder
def ctb_encoder(test: pd.DataFrame, target: str, id: int, targets: list) -> pd.DataFrame:
    '''Encode with CatBoostEncoder categorical columns of each target test data    '''     
    # drop unnecessary columns
    test_target = test[test['id'] == id].drop(columns=['z', 'roll', 'pitch'])

    # set columns for one target
    cat_cols, num_cols = set_cols_for_model(test_target, target, targets)

    # obsereved columns
    obs_cols = [col for col in test_target.columns if 'obs' in col] 

    
    # use only columns for one target and 'shift_1_obs' doesn't need to encode
    test_target = test_target.loc[:, cat_cols + list(set(num_cols) - set(obs_cols))]

    # fill null in target_shift column for correct work CatBoostEncoder
    test_target.fillna(value={f'{target}_shift_1': -1}, inplace=True)

    # encode categorical columns
    test_target = ce.CatBoostEncoder(cols=cat_cols).fit_transform(test_target, test_target[f'{target}_shift_1'])

    # del num and reminder from col names
    # train_target.columns = [col.split("__")[1] if "__" in col else col for col in train_target.columns]

    # replace -1 to nan
    test_target[f'{target}_shift_1'].replace(-1, np.nan, inplace=True)


    # add target column to the end
    test_target[target] = test[target]
 
    test_target['id_obs'] = id

    
    return test_target


def df_preprocessor(test: pd.DataFrame, target: str, id: int, preprocessor_path: str, targets: list) -> pd.DataFrame:
    '''Preprocess test one id_obs data for model
    cat_cols encoded with CatBoostEncoder
    num_cols transformed with PowerTransformer
   
    test - copy of test data
    '''
    
    test_id = test[test['id'] == id]

    # add target columns with shifts
    for i in range(1, 4):
        test_id[f'{target}_shift_{i}'] = test_id[target].shift(i)

    # add mean last 10 values for target columns
    test_id[f'{target}_last_10_mean'] = test_id[target].rolling(window=10).mean()

    # # add obs shift_1 column to preprocessed data
    # test_id[f'{target}_shift_1_obs'] = test_id[f'{target}_shift_1']

    # load preprocessor
    # cat_encoder = pd.read_pickle(f'{tmp_data_path}/cat_encoder_{target}.pkl')
    preprocessor = pd.read_pickle(f'{preprocessor_path}/preprocessor_{target}.pkl')
    
    # transform the encoded test data with preprocessor
    test_prepr = preprocessor.transform(ctb_encoder(test_id, target, id, targets))

    # del num and reminder from col names
    test_prepr.columns = [col.split("__")[1] if "__" in col else col for col in test_prepr.columns]
    
    # add diff target columns to preprocessed data
    test_id[f'{target}_diff'] = test_id[target] - test_id[f'{target}_shift_1']
    test_id.fillna(value={f'{target}_diff':0}, inplace=True)
    test_prepr[f'{target}_diff'] = test_id[f'{target}_diff']

    # add 'shift_1_obs' column to preprocessed data
    test_prepr[f'{target}_shift_1_obs'] = test_id[f'{target}_shift_1']

    # replace null values with -100 for correct work of VectorAssembler. -100 is out of range after preprocessing by PowerTransformer
    test_prepr = test_prepr.fillna(-100)

    # add target columns to preprocessed data
    test_prepr[f'{target}'] = test_id[target]

    # add row_number_by_id column
    test_prepr['row_number_by_id'] = test_prepr.sort_values(['id_obs', 'ctrl_stamp_ns']).groupby('id_obs').cumcount()


    return test_prepr

In [16]:
[table for table in client.query_df('show tables from ycup')['name'].values if 'test' in table]

['test_control', 'test_localization', 'test_metadata']

In [18]:
def get_df(client, table_name: str, id: int) -> pd.DataFrame:
    '''Get df from clickhouse by table name and id'''
    df = client.query_df(f'''select * from ycup.{table_name} where id = {id}''')
    return df

In [24]:
ids = [0]
targets = ['x']


df_list = []
for id in ids:
    test_control = get_df(client, 'test_control', id)
    test_localizations = get_df(client, 'test_localization', id)
    test_metadata = get_df(client, 'test_metadata', id)   
    df_list.append(union_dfs(test_control, test_localizations, test_metadata))

df = pd.concat(df_list)
df

Unnamed: 0,ctrl_stamp_ns,acceleration_level,steering,x,y,z,roll,pitch,yaw,id,...,y_shift_2,y_shift_3,yaw_shift_1,yaw_shift_2,yaw_shift_3,acceleration_level_last_10_mean,steering_last_10_mean,x_last_10_mean,y_last_10_mean,yaw_last_10_mean
0,36479492,-929,5.739836,-1482.652694,-1321.883413,-16.014849,0.027502,-0.001965,2.240336,0,...,,,,,,,,,,
1,76459951,-926,5.280618,-1482.766079,-1321.739951,-16.013857,0.027058,-0.001575,2.241208,0,...,,,2.240336,,,,,,,
2,116678417,-918,5.039505,-1482.878241,-1321.598810,-16.010625,0.027793,-0.001459,2.242182,0,...,-1321.883413,,2.241208,2.240336,,,,,,
3,156788958,-908,4.734873,-1482.990065,-1321.458237,-16.010063,0.027618,-0.001372,2.243131,0,...,-1321.739951,-1321.883413,2.242182,2.241208,2.240336,,,,,
4,196857808,-897,4.387096,-1483.100891,-1321.319065,-16.009805,0.028147,-0.001524,2.244002,0,...,-1321.598810,-1321.739951,2.243131,2.242182,2.241208,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,19837060299,5889,-3.057310,,,,,,,0,...,,,,,,6142.8,-2.844868,,,
496,19877059284,5087,-3.112980,,,,,,,0,...,,,,,,6042.6,-2.886396,,,
497,19917145118,4193,-3.112980,,,,,,,0,...,,,,,,5847.2,-2.927923,,,
498,19957108547,3414,-3.183301,,,,,,,0,...,,,,,,5571.4,-2.976483,,,


In [None]:
targets = ['x', 'y', 'yaw', 'z', 'roll', 'pitch']
target = 'x'

cols_checked = df.columns

# target in [ ] because yaw hase more then one letter
not_target = list(set(targets) - set([target]))


# Set num columns
control_cols = ['ctrl_stamp_ns', 'acceleration_level', 'steering']
shift_cols = [col for col in cols_checked if '_shift' in col]
tmp = [col for col in shift_cols for nt in not_target if f'{nt}_' in col]
shift_cols = list(set(shift_cols) - set(tmp))

last_10_cols = [col for col in cols_checked if 'last_10' in col]
tmp = [col for col in last_10_cols for nt in not_target if f'{nt}_' in col]
last_10_cols = list(set(last_10_cols) - set(tmp))

num_cols = control_cols + shift_cols + last_10_cols

# Set categorical columns
cols_temp = [col for col in cols_checked if col in control_cols or 'last' in col or 'shift' in col or 'diff' in col]
cat_cols = list(set(cols_checked) - set(cols_temp) - set(targets))

In [29]:
targets = ['x', 'y', 'yaw', 'z', 'roll', 'pitch']
target = 'x'
set_cols_for_model(df, target, targets)

(['vehicle_model_modification',
  'rear_tire',
  'location_reference_point_id',
  'vehicle_model',
  'id',
  'front_tire',
  'ride_month',
  'ride_year',
  'ride_day',
  'vehicle_id'],
 ['ctrl_stamp_ns',
  'acceleration_level',
  'steering',
  'acceleration_level_shift_3',
  'steering_shift_2',
  'x_shift_2',
  'steering_shift_3',
  'x_shift_3',
  'acceleration_level_shift_2',
  'x_shift_1',
  'acceleration_level_shift_1',
  'steering_shift_1',
  'steering_last_10_mean',
  'x_last_10_mean',
  'acceleration_level_last_10_mean'])

In [30]:
targets = ['x', 'y', 'yaw', 'z', 'roll', 'pitch']

df_prepr = {}
for target in targets:
    df_prepr[target] = df_preprocessor(df, target, id, preprocessor_path, targets)

NameError: name 'ce' is not defined

### spark connect

In [4]:
db = 'ycup'
# list of tables in db
tables = spark.sql(f'SHOW TABLES in {db}').collect()

df_list = {}
for table in tables:
    table_name = table.tableName
    df = spark.sql(f'SELECT * FROM {db}.{table_name} limit 10').toPandas()
    df_list[table_name] = df

In [None]:
# second variant


In [5]:
df_list['control']

Unnamed: 0,id,stamp_ns,acceleration_level,steering
0,0,36479492,-929,5.739836
1,0,76459951,-926,5.280618
2,0,116678417,-918,5.039505
3,0,156788958,-908,4.734873
4,0,196857808,-897,4.387096
5,0,236974997,-892,4.014573
6,0,276890721,-892,3.627408
7,0,316915752,-901,3.191926
8,0,356884436,-911,2.748362
9,0,396895329,-918,2.286391


## Client-Side: Sending and Receiving Data

## Running the Server


```
uvicorn your_script_name:app --reload
'''
