## Import libraries

In [5]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
import pandas as pd
import numpy as np
from typing import List, Dict
import json
import os
import clickhouse_connect

import category_encoders as ce

import sklearn

sklearn.set_config(transform_output='pandas')

import mlflow

import requests

from dotenv import load_dotenv
load_dotenv()

CH_USER = os.getenv('CH_USER')
CH_PASS = os.getenv('CH_PASS')
CH_IP = os.getenv('CH_IP')

from tools.spark_tools import SparkTools

from api.union_dfs import union_dfs
from api.df_preprocessor import df_preprocessor

root_path = "."
tmp_path = f'{root_path}/tmp'
data_path = f'{root_path}/data/self-drive'
train_data_path = f'{data_path}/train_data'
test_data_path = f'{data_path}/test_data'
tmp_data_path=f'{data_path}/tmp_data'

preprocessor_path = f"{root_path}/api/preprocessor"


client = clickhouse_connect.get_client(host=CH_IP, port=8123, username=CH_USER, password=CH_PASS)

your_mlflow_tracking_uri = f'{root_path}/mlruns' # for docker mlflow server
# your_mlflow_tracking_uri = "http://127.0.0.1:5000" # for local mlflow server
# your_mlflow_tracking_uri = MLFLOW_TRACKING_URI # for remote mlflow server
mlflow.set_tracking_uri(your_mlflow_tracking_uri)

## Spark Initialize

In [6]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf, SQLContext

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql import Window



# ml
from pyspark.ml import Pipeline as spk_pipeline
from pyspark.ml.feature import OneHotEncoder as spk_OneHotEncoder, StandardScaler as spk_StandardScaler, VectorAssembler as spk_VectorAssembler
from pyspark.ml.feature import MinMaxScaler as spk_MinMaxScaler, StringIndexer as spk_StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator as spk_RegressionEvaluator

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param






import os
#https://repo1.maven.org/maven2/com/github/housepower/clickhouse-native-jdbc/2.7.1/clickhouse-native-jdbc-2.7.1.jar
# spark connector https://github.com/ClickHouse/spark-clickhouse-connector
# https://mvnrepository.com/artifact/com.clickhouse
# https://github.com/housepower/ClickHouse-Native-JDBC, For Spark 3.2 and upper, Spark ClickHouse Connector (see upper) is recommended.
# https://clickhouse.com/docs/en/integrations/apache-spark/spark-native-connector
packages = [
    "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0"
    # "com.github.housepower:clickhouse-spark-runtime-3.4_2.12:0.7.3"
    ,"com.clickhouse:clickhouse-jdbc:0.7.1-patch1"
    # ,"com.clickhouse:clickhouse-jdbc:0.6.0-patch5"
    ,"com.clickhouse:clickhouse-http-client:0.7.1-patch1"
    # ,"com.clickhouse:clickhouse-http-client:0.6.0-patch5"
    ,"org.apache.httpcomponents.client5:httpclient5:5.3.1"
    # for jdbc 2.7.1 required java 8/11
    # ,"com.github.housepower:clickhouse-native-jdbc:2.7.1"
    ,"ai.catboost:catboost-spark_3.5_2.12:1.2.7"
    ,"com.microsoft.azure:synapseml_2.12:1.0.8"

]

exclude_packages = [
    "org.scala-lang:scala-reflect"
    ,"org.apache.spark:spark-tags_2.12"
    ,"org.scalactic:scalactic_2.12"
    ,"org.scalatest:scalatest_2.12"
    ,"com.fasterxml.jackson.core:jackson-databind"
]



ram = 60
cpu = 22*3
# Define the application name and setup session
appName = "Connect To ClickHouse via PySpark"
spark = (SparkSession.builder
         .appName(appName)
         .config("spark.jars.packages", ",".join(packages))
         .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
         .config("spark.sql.catalog.clickhouse.host", CH_IP)
         .config("spark.sql.catalog.clickhouse.protocol", "http")
         .config("spark.sql.catalog.clickhouse.http_port", "8123")
         .config("spark.sql.catalog.clickhouse.user", CH_USER)
         .config("spark.sql.catalog.clickhouse.password", CH_PASS)
         .config("spark.sql.catalog.clickhouse.database", "default")
        #  .config("spark.spark.clickhouse.write.compression.codec", "lz4")
        #  .config("spark.clickhouse.read.compression.codec", "lz4")
        #  .config("spark.clickhouse.write.format", "arrow")
         #    .config("spark.clickhouse.write.distributed.convertLocal", "true") l
         #    .config("spark.clickhouse.write.repartitionNum", "1") 
         #.config("spark.clickhouse.write.maxRetry", "1000")
         #    .config("spark.clickhouse.write.repartitionStrictly", "true") 
         #    .config("spark.clickhouse.write.distributed.useClusterNodes", "false") 
        #  .config("spark.clickhouse.write.batchSize", "1000000")
         #.config("spark.sql.catalog.clickhouse.socket_timeout", "600000000")
        #  .config("spark.sql.catalog.clickhouse.connection_timeout", "600000000")
        #  .config("spark.sql.catalog.clickhouse.query_timeout", "600000000")
        #  .config("spark.clickhouse.options.socket_timeout", "600000000")
        #  .config("spark.clickhouse.options.connection_timeout", "600000000")
        #  .config("spark.clickhouse.options.query_timeout", "600000000")         
         .config("spark.executor.memory", f"{ram}g")
        #  .config("spark.executor.cores", "5")
         .config("spark.driver.maxResultSize", f"{ram}g")
         .config("spark.driver.memory", f"{ram}g")
         .config("spark.executor.memoryOverhead", f"{ram}g")
        #  .config("spark.sql.debug.maxToStringFields", "100000")
         .getOrCreate()
         )

# LightGBM set config https://microsoft.github.io/SynapseML/docs/Get%20Started/Install%20SynapseML/
# spark.conf.set("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
# spark.conf.set("spark.jars.excludes", ",".join(exclude_packages))
# spark.conf.set("spark.yarn.user.classpath.first", "true")
# spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

#SedonaRegistrator.registerAll(spark)
# spark.conf.set("spark.sql.catalog.clickhouse", "xenon.clickhouse.ClickHouseCatalog")
# spark.conf.set("spark.sql.catalog.clickhouse.host", "127.0.0.1")
# spark.conf.set("spark.sql.catalog.clickhouse.protocol", "http")
# spark.conf.set("spark.sql.catalog.clickhouse.http_port", "8123")
# spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
# spark.conf.set("spark.sql.catalog.clickhouse.password", "")
# spark.conf.set("spark.sql.catalog.clickhouse.database", "default")



from catboost_spark import CatBoostRegressor as CatBoostRegressor_spark
from synapse.ml.lightgbm import LightGBMRegressor as LightGBMRegressor_spark


spark.sql("use clickhouse")

DataFrame[]

## Server-Side: FastAPI Application

In [8]:
sptools = SparkTools(spark, data_path, CH_IP, CH_USER, CH_PASS)

### Files from local path

In [None]:
file_names = ['test_control', 'test_localization', 'test_metadata']

df_test = {}
for name in file_names:
    df_test[name] = pd.read_parquet(f'{tmp_data_path}/{name}.parquet')

df_test['test_metadata'].head()

Unnamed: 0,vehicle_id,vehicle_model,vehicle_model_modification,location_reference_point_id,front_tire,rear_tire,ride_year,ride_month,ride_day,id
0,83,1,1,0,0,0,2021,9,6,0
1,54,1,1,0,0,0,2021,9,6,1
2,25,1,1,0,0,0,2021,9,7,2
3,86,1,2,0,2,2,2022,6,4,3
4,81,0,0,0,1,1,2022,6,5,4


In [13]:
df_test['test_localization'].shape

(999655, 8)

In [10]:
test = union_dfs(df_test['test_control'], df_test['test_localization'], df_test['test_metadata'])
test.head()

KeyboardInterrupt: 

In [None]:
SparkTools.check_nn_spark()

sptools.dict_from_spark_dfs()

In [None]:
app = FastAPI()

class DataFrameInput(BaseModel):
    data: List[Dict]  # Expecting a list of dictionaries as input

@app.post("/process-dataframe")
async def process_dataframe(input_data: DataFrameInput):
    # Convert the JSON data into a Pandas DataFrame
    df = pd.DataFrame(input_data.data)
    
    # Modify the DataFrame (example: add a new column)
    df["new_column"] = df["column1"] * 2  # Assuming 'column1' exists in the input
    
    # Convert the modified DataFrame back to JSON
    response_data = df.to_dict(orient="records")
    return {"data": response_data}






@app.get("/control")
async def get_control(id: int):
    control = client.query_df(f'''
        select * 
        from ycup.control yc
        where yc.id = {id}
        limit 10'''
    )


### clickhouse connect

In [3]:
client.query_df('SHOW TABLES IN ycup')

Unnamed: 0,name
0,test_control
1,test_localization
2,test_metadata
3,train_control
4,train_localization
5,train_metadata


In [16]:
[table for table in client.query_df('show tables from ycup')['name'].values if 'test' in table]

['test_control', 'test_localization', 'test_metadata']

In [7]:
def get_df(client, table_name: str, id: int) -> pd.DataFrame:
    '''Get df from clickhouse by table name and id'''
    df = client.query_df(f'''select * from ycup.{table_name} where id = {id}''')
    return df

In [9]:
targets_all = ['x', 'y', 'z', 'yaw', 'pitch', 'roll']
targets_pred = ['x', 'y', 'yaw']
ids = [0, 1, 2]

df_prepr = {}
for target in targets_pred[:1]:
    df_list = []
    for id in ids:
        test_control = get_df(client, 'test_control', id)
        test_localizations = get_df(client, 'test_localization', id)
        test_metadata = get_df(client, 'test_metadata', id)   
        df_list.append(union_dfs(test_control, test_localizations, test_metadata))

    df = pd.concat(df_list)
    # df_prepr[target] = df_preprocessor(df, target, id, preprocessor_path, targets_all)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 0 to 499
Data columns (total 39 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ctrl_stamp_ns                    1500 non-null   uint64 
 1   acceleration_level               1500 non-null   int16  
 2   steering                         1500 non-null   float32
 3   x                                373 non-null    float64
 4   y                                373 non-null    float64
 5   z                                373 non-null    float64
 6   roll                             373 non-null    float64
 7   pitch                            373 non-null    float64
 8   yaw                              373 non-null    float64
 9   id                               1500 non-null   uint32 
 10  vehicle_id                       1500 non-null   uint8  
 11  vehicle_model                    1500 non-null   uint8  
 12  vehicle_model_modification

In [86]:
def set_cols_for_model(test: pd.DataFrame, target: str=None, targets_all: list=None) -> list:
    '''Set num and cat columns for model'''
    
    cols_checked = test.columns

    # target in [ ] because yaw hase more then one letter
    not_target = list(set(targets_all) - set([target]))


    # Set num columns
    control_cols = ['ctrl_stamp_ns', 'acceleration_level', 'steering']
    shift_cols = [col for col in cols_checked if '_shift' in col]
    tmp = [col for col in shift_cols for nt in not_target if f'{nt}_' in col]
    shift_cols = list(set(shift_cols) - set(tmp))

    last_10_cols = [col for col in cols_checked if 'last_10' in col]
    tmp = [col for col in last_10_cols for nt in not_target if f'{nt}_' in col]
    last_10_cols = list(set(last_10_cols) - set(tmp))

    num_cols = control_cols + shift_cols + last_10_cols

    # Set categorical columns
    cols_temp = [col for col in cols_checked if col in control_cols or 'last' in col or 'shift' in col or 'diff' in col]
    cat_cols = list(set(cols_checked) - set(cols_temp) - set(targets_all))


    return cat_cols, num_cols



# catboost encoder
def ctb_encoder(test: pd.DataFrame, target: str, id: int) -> pd.DataFrame:
    '''Encode with CatBoostEncoder categorical columns of each target test data    '''     

    
    test_id = test[test['id'] == id]

    # obsereved columns
    obs_cols = [col for col in test_id.columns if 'obs' in col] 

    cat_cols, num_cols = set_cols_for_model(test, target, targets_all)

    
    # use only columns for one target and 'shift_1_obs' doesn't need to encode
    test_enc = test_id.loc[:, cat_cols + list(set(num_cols) - set(obs_cols)-set(target))]

    # fill null in target_shift column for correct work CatBoostEncoder
    test_enc.fillna(value={f'{target}_shift_1': -1}, inplace=True)

    # encode categorical columns
    test_enc = ce.CatBoostEncoder(cols=cat_cols).fit_transform(test_enc, test_enc[f'{target}_shift_1'])

    # del num and reminder from col names
    # train_target.columns = [col.split("__")[1] if "__" in col else col for col in train_target.columns]

    # replace -1 to nan
    test_enc[f'{target}_shift_1'] = test_enc[f'{target}_shift_1'].replace(-1, np.nan)


    # add target column to the end
    test_enc[target] = test_id[target]
 
    test_enc['id_obs'] = id

    
    return test_enc


def df_preprocessor(test: pd.DataFrame, target: str, id: int, preprocessor_path: str, targets_all: list) -> pd.DataFrame:
    '''Preprocess test one id_obs data for model
    cat_cols encoded with CatBoostEncoder
    num_cols transformed with PowerTransformer
   
    test - copy of test data
    '''
    # use test data by id
    test_target = test[test['id'] == id]

    # set columns for one target
    cat_cols, num_cols = set_cols_for_model(test_target, target, targets_all)


    test_id = test[test['id'] == id][cat_cols + num_cols + list(target)].copy()

    # add target columns with shifts
    for i in range(1, 4):
        test_id[f'{target}_shift_{i}'] = test_id[target].shift(i)

    # add mean last 10 values for target columns
    test_id[f'{target}_last_10_mean'] = test_id[target].rolling(window=10).mean()

    # # add obs shift_1 column to preprocessed data
    # test_id[f'{target}_shift_1_obs'] = test_id[f'{target}_shift_1']

    # load preprocessor
    # cat_encoder = pd.read_pickle(f'{tmp_data_path}/cat_encoder_{target}.pkl')
    preprocessor = pd.read_pickle(f'{preprocessor_path}/preprocessor_{target}.pkl')
    
    # transform the encoded test data with preprocessor
    test_prepr = preprocessor.transform(ctb_encoder(test_id, target, id, cat_cols, num_cols))

    # del num and reminder from col names
    test_prepr.columns = [col.split("__")[1] if "__" in col else col for col in test_prepr.columns]
    
    # add diff target columns to preprocessed data
    test_id[f'{target}_diff'] = test_id[target] - test_id[f'{target}_shift_1']
    test_id.fillna(value={f'{target}_diff':0}, inplace=True)
    test_prepr[f'{target}_diff'] = test_id[f'{target}_diff']

    # add 'shift_1_obs' column to preprocessed data
    test_prepr[f'{target}_shift_1_obs'] = test_id[f'{target}_shift_1']

    # replace null values with -100 for correct work of VectorAssembler. -100 is out of range after preprocessing by PowerTransformer
    test_prepr = test_prepr.fillna(-100)

    # add target columns to preprocessed data
    test_prepr[f'{target}'] = test_id[target]

    # add row_number_by_id column
    test_prepr['row_number_by_id'] = test_prepr.sort_values(['id_obs', 'ctrl_stamp_ns']).groupby('id_obs').cumcount()


    return test_prepr

In [67]:
test = df.copy()

In [68]:
test.columns

Index(['ctrl_stamp_ns', 'acceleration_level', 'steering', 'x', 'y', 'z',
       'roll', 'pitch', 'yaw', 'id', 'vehicle_id', 'vehicle_model',
       'vehicle_model_modification', 'location_reference_point_id',
       'front_tire', 'rear_tire', 'ride_year', 'ride_month', 'ride_day',
       'acceleration_level_shift_1', 'acceleration_level_shift_2',
       'acceleration_level_shift_3', 'steering_shift_1', 'steering_shift_2',
       'steering_shift_3', 'x_shift_1', 'x_shift_2', 'x_shift_3', 'y_shift_1',
       'y_shift_2', 'y_shift_3', 'yaw_shift_1', 'yaw_shift_2', 'yaw_shift_3',
       'acceleration_level_last_10_mean', 'steering_last_10_mean',
       'x_last_10_mean', 'y_last_10_mean', 'yaw_last_10_mean'],
      dtype='object')

In [87]:
ctb_encoder(test, target, id)

Unnamed: 0,vehicle_id,vehicle_model_modification,location_reference_point_id,vehicle_model,id,ride_month,rear_tire,front_tire,ride_day,ride_year,...,acceleration_level_last_10_mean,steering,acceleration_level,x_last_10_mean,steering_last_10_mean,steering_shift_3,x_shift_2,steering_shift_1,x,id_obs
0,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,...,,5.739836,-929,,,,,,-1482.652694,0
1,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,...,,5.280618,-926,,,,,5.739836,-1482.766079,0
2,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,...,,5.039505,-918,,,,-1482.652694,5.280618,-1482.878241,0
3,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,...,,4.734873,-908,,,5.739836,-1482.766079,5.039505,-1482.990065,0
4,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,...,,4.387096,-897,,,5.280618,-1482.878241,4.734873,-1483.100891,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-373.361527,-373.361527,-373.361527,-373.361527,-373.361527,-373.361527,-373.361527,-373.361527,-373.361527,-373.361527,...,6142.8,-3.057310,5889,,-2.844868,-2.931257,,-3.004209,,0
496,-372.612309,-372.612309,-372.612309,-372.612309,-372.612309,-372.612309,-372.612309,-372.612309,-372.612309,-372.612309,...,6042.6,-3.112980,5087,,-2.886396,-2.931257,,-3.057310,,0
497,-371.866099,-371.866099,-371.866099,-371.866099,-371.866099,-371.866099,-371.866099,-371.866099,-371.866099,-371.866099,...,5847.2,-3.112980,4193,,-2.927923,-3.004209,,-3.112980,,0
498,-371.122880,-371.122880,-371.122880,-371.122880,-371.122880,-371.122880,-371.122880,-371.122880,-371.122880,-371.122880,...,5571.4,-3.183301,3414,,-2.976483,-3.057310,,-3.112980,,0


In [62]:
train = pd.read_parquet(f'{tmp_data_path}/train_ctb_{target}.parquet')

In [64]:
train.columns

Index(['front_tire', 'ride_day', 'id', 'vehicle_id', 'ride_year',
       'location_reference_point_id', 'ride_month',
       'vehicle_model_modification', 'rear_tire', 'vehicle_model',
       'ctrl_stamp_ns', 'acceleration_level', 'steering',
       'acceleration_level_shift_3', 'x_shift_2', 'steering_shift_3',
       'acceleration_level_shift_1', 'x_shift_1', 'acceleration_level_shift_2',
       'x_shift_3', 'steering_shift_1', 'steering_shift_2',
       'steering_last_10_mean', 'x_last_10_mean',
       'acceleration_level_last_10_mean', 'id_obs', 'x'],
      dtype='object')

In [66]:
# Ensure the columns are consistent
cat_cols, num_cols = set_cols_for_model(train, target, targets_all)
set(cat_cols + num_cols) - set(train.columns)


set()

In [72]:
train[cat_cols + num_cols].head()

Unnamed: 0,vehicle_id,vehicle_model_modification,location_reference_point_id,vehicle_model,id,ride_month,rear_tire,front_tire,ride_day,ride_year,...,acceleration_level_shift_2,x_shift_3,x_shift_1,acceleration_level_shift_1,steering_shift_3,x_shift_2,steering_shift_1,acceleration_level_last_10_mean,x_last_10_mean,steering_last_10_mean
0,-2695.734397,-2695.734397,-2695.734397,-2695.734397,-2695.734397,-2695.734397,-2695.734397,-2695.734397,-2695.734397,-2695.734397,...,,,,,,,,,,
1,-1348.367199,-1348.367199,-1348.367199,-1348.367199,-1348.367199,-1348.367199,-1348.367199,-1348.367199,-1348.367199,-1348.367199,...,,,-4305.325027,-114.0,,,-2.65514,,,
2,-2334.019808,-2334.019808,-2334.019808,-2334.019808,-2334.019808,-2334.019808,-2334.019808,-2334.019808,-2334.019808,-2334.019808,...,-114.0,,-4305.489155,-123.0,,-4305.325027,-2.598169,,,
3,-2826.887145,-2826.887145,-2826.887145,-2826.887145,-2826.887145,-2826.887145,-2826.887145,-2826.887145,-2826.887145,-2826.887145,...,-123.0,-4305.325027,-4305.652097,-132.0,-2.65514,-4305.489155,-2.544422,,,
4,-3122.640135,-3122.640135,-3122.640135,-3122.640135,-3122.640135,-3122.640135,-3122.640135,-3122.640135,-3122.640135,-3122.640135,...,-132.0,-4305.489155,-4305.815555,-141.0,-2.598169,-4305.652097,-2.544422,,,


In [70]:
cat_cols, num_cols = set_cols_for_model(test, target, targets_all)
set(cat_cols + num_cols) - set(train.columns)

set()

In [None]:
id = 0
# use test data by id
test_target = test[test['id'] == id]

test_target

# set columns for one target
cat_cols, num_cols = set_cols_for_model(test_target, target, targets_all)


test_id = test[test['id'] == id][cat_cols + num_cols].copy()

# add target columns with shifts
for i in range(1, 4):
    test_id[f'{target}_shift_{i}'] = test_target[target].shift(i)

# add mean last 10 values for target columns
test_id[f'{target}_last_10_mean'] = test_target[target].rolling(window=10).mean()

preprocessor = pd.read_pickle(f'{preprocessor_path}/preprocessor_{target}.pkl')

# ctb encoder

# # obsereved columns
# obs_cols = [col for col in test_id.columns if 'obs' in col] 


# # use only columns for one target and 'shift_1_obs' doesn't need to encode
# test_enc = test_id.loc[:, cat_cols + list(set(num_cols) - set(obs_cols))]

# # fill null in target_shift column for correct work CatBoostEncoder
# test_enc.fillna(value={f'{target}_shift_1': -1}, inplace=True)

# # encode categorical columns
# test_enc = ce.CatBoostEncoder(cols=cat_cols).fit_transform(test_enc, test_enc[f'{target}_shift_1'])

# # del num and reminder from col names
# # train_target.columns = [col.split("__")[1] if "__" in col else col for col in train_target.columns]

# # replace -1 to nan
# test_enc[f'{target}_shift_1'] = test_enc[f'{target}_shift_1'].replace(-1, np.nan)


# # add target column to the end
# test_enc[target] = test_target[target]

# test_enc['id_obs'] = id

# test_enc.head()


(ctb_encoder(test_id, target, id, cat_cols, num_cols))
    
# # transform the encoded test data with preprocessor
# test_prepr = preprocessor.transform(ctb_encoder(test_id, target, id, cat_cols, num_cols))

# test_prepr.head()

Unnamed: 0,vehicle_id,vehicle_model_modification,location_reference_point_id,vehicle_model,id,ride_month,rear_tire,front_tire,ride_day,ride_year,...,acceleration_level_last_10_mean,steering,acceleration_level,x_last_10_mean,steering_last_10_mean,steering_shift_3,x_shift_2,steering_shift_1,x,id_obs
0,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,-369.645344,...,,5.739836,-929,,,,,,-1482.652694,0
1,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,-185.322672,...,,5.280618,-926,,,,,5.739836,-1482.766079,0
2,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,-617.766013,...,,5.039505,-918,,,,-1482.652694,5.280618,-1482.878241,0
3,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,-834.016029,...,,4.734873,-908,,,5.739836,-1482.766079,5.039505,-1482.990065,0
4,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,-963.788472,...,,4.387096,-897,,,5.280618,-1482.878241,4.734873,-1483.100891,0


In [38]:
df_preprocessor(df.copy(), target, 1, preprocessor_path, targets_all)

UnboundLocalError: cannot access local variable 'test_target' where it is not associated with a value

In [34]:
df_prepr[target] = df_preprocessor(df, target, id, preprocessor_path, targets_all)
df_prepr[target].head()

KeyError: "['z', 'roll', 'pitch'] not found in axis"

### spark connect

In [4]:
db = 'ycup'
# list of tables in db
tables = spark.sql(f'SHOW TABLES in {db}').collect()

df_list = {}
for table in tables:
    table_name = table.tableName
    df = spark.sql(f'SELECT * FROM {db}.{table_name} limit 10').toPandas()
    df_list[table_name] = df

In [None]:
# second variant


In [5]:
df_list['control']

Unnamed: 0,id,stamp_ns,acceleration_level,steering
0,0,36479492,-929,5.739836
1,0,76459951,-926,5.280618
2,0,116678417,-918,5.039505
3,0,156788958,-908,4.734873
4,0,196857808,-897,4.387096
5,0,236974997,-892,4.014573
6,0,276890721,-892,3.627408
7,0,316915752,-901,3.191926
8,0,356884436,-911,2.748362
9,0,396895329,-918,2.286391


## Client-Side: Sending and Receiving Data

## Running the Server


```
uvicorn your_script_name:app --reload
'''
