## Import libraries

In [3]:
import json
import pandas as pd
import os
import sklearn
import datetime
import numpy as np
import pytz

from pprint import pprint

from copy import deepcopy

import clickhouse_connect

import mlflow

# turn off warnings
import warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

# set all columns to be displayed
pd.set_option('display.max_columns', None)

# import tools

from tools import pd_tools, spark_tools, db_tools


root_path = "."
tmp_path = f'{root_path}/tmp'
data_path = f'{root_path}/data/self-drive'
train_data_path = f'{data_path}/train_data'
test_data_path = f'{data_path}/test_data'
tmp_data_path=f'{data_path}/tmp_data'


your_mlflow_tracking_uri = f'{root_path}/mlruns' # for docker mlflow server
# your_mlflow_tracking_uri = "http://127.0.0.1:5000" # for local mlflow server
# your_mlflow_tracking_uri = MLFLOW_TRACKING_URI # for remote mlflow server
mlflow.set_tracking_uri(your_mlflow_tracking_uri)

# constants
CH_USER = os.getenv("CH_USER")
CH_PASS = os.getenv("CH_PASS")
CH_IP = os.getenv('CH_IP')
RAND_ST = 354
# Define the timezone
EXP_TIMEZONE = pytz.timezone('Etc/GMT-3')
# MLFLOW_TRACKING_URI = os.getenv('MLFLOW_TRACKING_URI')

ch_client = clickhouse_connect.get_client(host=CH_IP, port=8123, username=CH_USER, password=CH_PASS)


## Spark Initialize

In [4]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf, SQLContext

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import Window



# ml
from pyspark.ml import Pipeline as spk_pipeline
from pyspark.ml.feature import OneHotEncoder as spk_OneHotEncoder, StandardScaler as spk_StandardScaler, VectorAssembler as spk_VectorAssembler
from pyspark.ml.feature import MinMaxScaler as spk_MinMaxScaler, StringIndexer as spk_StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator as spk_RegressionEvaluator

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param






import os
#https://repo1.maven.org/maven2/com/github/housepower/clickhouse-native-jdbc/2.7.1/clickhouse-native-jdbc-2.7.1.jar
# spark connector https://github.com/ClickHouse/spark-clickhouse-connector
# https://mvnrepository.com/artifact/com.clickhouse
# https://github.com/housepower/ClickHouse-Native-JDBC, For Spark 3.2 and upper, Spark ClickHouse Connector (see upper) is recommended.
packages = [
    "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0"
    # "com.github.housepower:clickhouse-spark-runtime-3.4_2.12:0.7.3"
    ,"com.clickhouse:clickhouse-jdbc:0.7.1-patch1"
    # ,"com.clickhouse:clickhouse-jdbc:0.6.0-patch5"
    ,"com.clickhouse:clickhouse-http-client:0.7.1-patch1"
    # ,"com.clickhouse:clickhouse-http-client:0.6.0-patch5"
    ,"org.apache.httpcomponents.client5:httpclient5:5.3.1"
    # for jdbc 2.7.1 required java 8/11
    # ,"com.github.housepower:clickhouse-native-jdbc:2.7.1"
    # ,"ai.catboost:catboost-spark_3.5_2.12:1.2.7"
    # ,"com.microsoft.azure:synapseml_2.12:1.0.8"

]

# exclude_packages = [
#     "org.scala-lang:scala-reflect"
#     ,"org.apache.spark:spark-tags_2.12"
#     ,"org.scalactic:scalactic_2.12"
#     ,"org.scalatest:scalatest_2.12"
#     ,"com.fasterxml.jackson.core:jackson-databind"
# ]



ram = 10
cpu = 22*3
# Define the application name and setup session
appName = "Connect To ClickHouse via PySpark"
spark = (SparkSession.builder
         .appName(appName)
         .config("spark.jars.packages", ",".join(packages))
        #  .config("spark.sql.catalog.clickhouse", "xenon.clickhouse.ClickHouseCatalog")
         .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
         .config("spark.sql.catalog.clickhouse.host", CH_IP)
         .config("spark.sql.catalog.clickhouse.protocol", "http")
         .config("spark.sql.catalog.clickhouse.http_port", "8123")
         .config("spark.sql.catalog.clickhouse.user", CH_USER)
         .config("spark.sql.catalog.clickhouse.password", CH_PASS)
         .config("spark.sql.catalog.clickhouse.database", "default")
        #  .config("spark.spark.clickhouse.write.compression.codec", "lz4")
        #  .config("spark.clickhouse.read.compression.codec", "lz4")
        #  .config("spark.clickhouse.write.format", "arrow")
         #    .config("spark.clickhouse.write.distributed.convertLocal", "true") l
         #    .config("spark.clickhouse.write.repartitionNum", "1") 
         #.config("spark.clickhouse.write.maxRetry", "1000")
         #    .config("spark.clickhouse.write.repartitionStrictly", "true") 
         #    .config("spark.clickhouse.write.distributed.useClusterNodes", "false") 
        #  .config("spark.clickhouse.write.batchSize", "1000000")
         #.config("spark.sql.catalog.clickhouse.socket_timeout", "600000000")
        #  .config("spark.sql.catalog.clickhouse.connection_timeout", "600000000")
        #  .config("spark.sql.catalog.clickhouse.query_timeout", "600000000")
        #  .config("spark.clickhouse.options.socket_timeout", "600000000")
        #  .config("spark.clickhouse.options.connection_timeout", "600000000")
        #  .config("spark.clickhouse.options.query_timeout", "600000000")         
         .config("spark.executor.memory", f"{ram}g")
        #  .config("spark.executor.cores", "5")
         .config("spark.driver.maxResultSize", f"{ram}g")
         .config("spark.driver.memory", f"{ram}g")
         .config("spark.executor.memoryOverhead", f"{ram}g")
        #  .config("spark.sql.debug.maxToStringFields", "100000")
         .getOrCreate()
         )

# LightGBM set config https://microsoft.github.io/SynapseML/docs/Get%20Started/Install%20SynapseML/
# spark.conf.set("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
# spark.conf.set("spark.jars.excludes", ",".join(exclude_packages))
# spark.conf.set("spark.yarn.user.classpath.first", "true")
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

#SedonaRegistrator.registerAll(spark)
# spark.conf.set("spark.sql.catalog.clickhouse", "xenon.clickhouse.ClickHouseCatalog")
# spark.conf.set("spark.sql.catalog.clickhouse.host", "127.0.0.1")
# spark.conf.set("spark.sql.catalog.clickhouse.protocol", "http")
# spark.conf.set("spark.sql.catalog.clickhouse.http_port", "8123")
# spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
# spark.conf.set("spark.sql.catalog.clickhouse.password", "")
# spark.conf.set("spark.sql.catalog.clickhouse.database", "default")



# from catboost_spark import CatBoostRegressor as CatBoostRegressor_spark
# from synapse.ml.lightgbm import LightGBMRegressor as LightGBMRegressor_spark


spark.sql("use clickhouse")

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.clickhouse.spark#clickhouse-spark-runtime-3.5_2.12 added as a dependency
com.clickhouse#clickhouse-jdbc added as a dependency
com.clickhouse#clickhouse-http-client added as a dependency
org.apache.httpcomponents.client5#httpclient5 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a04aa209-5337-4fb5-bec1-b89bbbccaf43;1.0
	confs: [default]
	found com.clickhouse.spark#clickhouse-spark-runtime-3.5_2.12;0.8.0 in central
	found com.clickhouse#clickhouse-jdbc;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-client;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-data;0.7.1-patch1 in central
	found com.clickhouse#clickhouse-http-client;0.7.1-patch1 in central
	found org.apache.httpcomponents.core5#httpcore5-h2;5.2 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.httpcomponents.core5#httpcore5;5.2.1 in central
	found

DataFrame[]

In [14]:
# read folder names in path
def read_names(path: str):
    '''Read folder names or file names in the path'''
    return os.listdir(path)

train_ids = pd.Series(read_names(train_data_path)).apply(int).sort_values().reset_index(drop=True)
test_ids = pd.Series(read_names(test_data_path)).apply(int).sort_values().reset_index(drop=True)

In [16]:
def tires_to_columns_date(metadata:pd.DataFrame):
    '''Change tires column to front and rear columns and 
    convert ride_date to datetime and add year, month, day columns'''
    metadata['front_tire'] = metadata['tires'][0]
    metadata['rear_tire'] = metadata['tires'][1]
    metadata = metadata.drop(columns=['tires']).reset_index(drop=True).loc[:0]
    # convert ride_date to datetime and add year, month, day columns
    metadata['ride_date'] = pd.to_datetime(metadata['ride_date'])
    metadata['ride_year'] = metadata['ride_date'].dt.year
    metadata['ride_month'] = metadata['ride_date'].dt.month
    metadata['ride_day'] = metadata['ride_date'].dt.day
    metadata = metadata.drop(columns=['ride_date'])
    
    return metadata

In [12]:
ctl = []
lcz = []
mtd = []
for id in test_ids[:]:
    control = pd.read_csv(f'{test_data_path}/{id}/control.csv')
    localization = pd.read_csv(f'{test_data_path}/{id}/localization.csv')
    metadata = tires_to_columns_date(pd.read_json(f'{test_data_path}/{id}/metadata.json'))
    control['id'] = id
    localization['id'] = id
    metadata['id'] = id
    ctl.append(control)
    lcz.append(localization)
    mtd.append(metadata)
    if id % 1000 == 0:
        print(id)

pd.concat(ctl).to_parquet(f'{tmp_data_path}/test_control.parquet', index=False)
pd.concat(lcz).to_parquet(f'{tmp_data_path}/test_localization.parquet', index=False)
pd.concat(mtd).to_parquet(f'{tmp_data_path}/test_metadata.parquet', index=False)
# ch_client.insert_df(f'{db_name}.{table_name}', pd.concat(ctl))  

0
1000
2000
3000
4000
5000
6000
7000


In [18]:
ctl = []
lcz = []
mtd = []
for id in train_ids[:]:
    control = pd.read_csv(f'{train_data_path}/{id}/control.csv')
    localization = pd.read_csv(f'{train_data_path}/{id}/localization.csv')
    metadata = tires_to_columns_date(pd.read_json(f'{train_data_path}/{id}/metadata.json'))
    control['id'] = id
    localization['id'] = id
    metadata['id'] = id
    ctl.append(control)
    lcz.append(localization)
    mtd.append(metadata)
    if id % 1000 == 0:
        print(id)

pd.concat(ctl).to_parquet(f'{tmp_data_path}/train_control.parquet', index=False)
pd.concat(lcz).to_parquet(f'{tmp_data_path}/train_localization.parquet', index=False)
pd.concat(mtd).to_parquet(f'{tmp_data_path}/train_metadata.parquet', index=False)
# ch_client.insert_df(f'{db_name}.{table_name}', pd.concat(ctl))  

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000


In [12]:
db_name = 'ycup'
tables = ['test_control', 'test_localization', 'test_metadata']

# upload data with spark
for table in tables[:1]:
    try:
    # Read the Parquet file into a DataFrame
        dfs = spark.read.parquet(f'{test_data_path}/{table}.parquet')
        (
            dfs.write.format("jdbc")
            .option("url", f"jdbc:clickhouse://{CH_IP}:9000")
            .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
            .option("dbtable", f"{db_name}.{table}") # table name
            .option("user", CH_USER)
            .option("password", CH_PASS)
            .option("isolationLevel", "NONE")
            .mode("append")
            .save()
        )

        
        print(f"Data successfully written to {db_name}.{table} in ClickHouse.")
    except Exception as e:
        print(f"Error: {e}")

Py4JJavaError: An error occurred while calling o113.save.
: java.sql.SQLException: Port 9000 is for clickhouse-client program
You must use port 8123 for HTTP.

	at com.clickhouse.jdbc.SqlExceptionUtils.handle(SqlExceptionUtils.java:85)
	at com.clickhouse.jdbc.SqlExceptionUtils.create(SqlExceptionUtils.java:31)
	at com.clickhouse.jdbc.SqlExceptionUtils.handle(SqlExceptionUtils.java:90)
	at com.clickhouse.jdbc.internal.ClickHouseConnectionImpl.getServerInfo(ClickHouseConnectionImpl.java:131)
	at com.clickhouse.jdbc.internal.ClickHouseConnectionImpl.<init>(ClickHouseConnectionImpl.java:339)
	at com.clickhouse.jdbc.internal.ClickHouseConnectionImpl.<init>(ClickHouseConnectionImpl.java:288)
	at com.clickhouse.jdbc.ClickHouseDriver.connect(ClickHouseDriver.java:175)
	at com.clickhouse.jdbc.ClickHouseDriver.connect(ClickHouseDriver.java:34)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.BasicConnectionProvider.getConnection(BasicConnectionProvider.scala:49)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProviderBase.create(ConnectionProvider.scala:102)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1(JdbcDialects.scala:161)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1$adapted(JdbcDialects.scala:157)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:50)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:251)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.io.IOException: Port 9000 is for clickhouse-client program
You must use port 8123 for HTTP.

	at com.clickhouse.client.http.ApacheHttpConnectionImpl.checkResponse(ApacheHttpConnectionImpl.java:241)
	at com.clickhouse.client.http.ApacheHttpConnectionImpl.post(ApacheHttpConnectionImpl.java:304)
	at com.clickhouse.client.http.ClickHouseHttpClient.send(ClickHouseHttpClient.java:195)
	at com.clickhouse.client.AbstractClient.execute(AbstractClient.java:280)
	at com.clickhouse.client.ClickHouseClientBuilder$Agent.sendOnce(ClickHouseClientBuilder.java:282)
	at com.clickhouse.client.ClickHouseClientBuilder$Agent.send(ClickHouseClientBuilder.java:294)
	at com.clickhouse.client.ClickHouseClientBuilder$Agent.execute(ClickHouseClientBuilder.java:349)
	at com.clickhouse.client.ClickHouseClient.executeAndWait(ClickHouseClient.java:881)
	at com.clickhouse.client.ClickHouseRequest.executeAndWait(ClickHouseRequest.java:2154)
	at com.clickhouse.jdbc.internal.ClickHouseConnectionImpl.getServerInfo(ClickHouseConnectionImpl.java:128)
	... 50 more


In [None]:
spark.sql(f"select * from {db_name}.{tables[0]}").show(5)