# Install Arcion, YCSB and SQL Server

In [1]:
# Install ipywidgets with the %pip magic; pip's output below notes that the
# kernel may need a restart before the updated package is usable.
%pip install ipywidgets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import subprocess
# Run each setup script in turn and echo whatever it wrote to stdout.
for setup_script in (
    "bin/download-jars.sh",
    "bin/install-arcion.sh",
    "bin/install-ycsb.sh",
    "bin/install-sqlserver.sh",
):
    completed = subprocess.run(setup_script, stdout=subprocess.PIPE)
    print(completed.stdout.decode('utf-8'))

deltalake /opt/stage/libs/SparkJDBC42.jar found
lakehouse  /opt/stage/libs/DatabricksJDBC42.jar found
postgres  /opt/stage/libs/postgresql-42.7.1.jar found
mariadb  /opt/stage/libs/mariadb-java-client-3.3.2.jar found
oracle /opt/stage/libs/ojdbc8.jar found
log4j /opt/stage/libs/log4j-1.2.17.jar found

arcion  /opt/stage/arcion/replicant-cli/bin/replicant found
checking jar(s) in /opt/stage/arcion/24.01.25.1/replicant-cli/lib for updates
checking jar(s) in /opt/stage/arcion/24.01.25.1/lib for updates
checking jar(s) in /opt/stage/arcion/replicant-cli/lib for updates
checking jar(s) in /opt/stage/arcion/replicate-cli-23.05.31.29/lib for updates
checking jar(s) in /opt/stage/arcion/23.05.31.31/lib for updates
checking jar(s) in /opt/stage/arcion/24.01.25.7/lib for updates
checking jar(s) in /opt/stage/arcion/23.09.29.11/lib for updates

YCSB  /opt/stage/ycsb/ycsb-jdbc-binding-0.18.0-SNAPSHOT  found
numfmt found
checking jar(s) in /opt/stage/ycsb/ycsb-jdbc-binding-0.18.0-SNAPSHOT/lib for u

In [4]:
import subprocess
# NOTE(review): the two star imports below presumably supply the widget objects
# (sparse_cnt, repl_mode, ...) and the VBox/HBox/Label names that later cells use
# without importing them — confirm against the libpython modules.
from libpython.arcion_control import *    
from libpython.ycsb_control import *    
# Presumably displays the current Arcion and YCSB settings — the helpers are
# defined outside this notebook, so verify there.
show_arcion_config()
show_ycsb_config()

# Customize YCSB workload characteristics

In [5]:
# One row of controls per table flavour; the trailing VBox expression is what the cell displays.
sparse_shape_row = HBox([Label('Sparse'), sparse_cnt, sparse_fieldcount, sparse_fieldlength, sparse_recordcount])
dense_shape_row = HBox([Label('Dense'), dense_cnt, dense_fieldcount, dense_fieldlength, dense_recordcount])
VBox([sparse_shape_row, dense_shape_row])

VBox(children=(HBox(children=(Label(value='Sparse'), BoundedIntText(value=1, description='Table Cnt:', min=1),…

## Create SQL Server user, create and load YCSB data sets

In [7]:

# Workload-shape settings read from the widgets. They are emitted as a line of
# plain shell assignments ahead of each load_* call.
ycsb_shape = {
    "y_fieldcount_sparse": sparse_fieldcount.value,
    "y_fieldcount_dense": dense_fieldcount.value,
    "y_fieldlength_sparse": sparse_fieldlength.value,
    "y_fieldlength_dense": dense_fieldlength.value,
    "y_recordcount_sparse": sparse_recordcount.value,
    "y_recordcount_dense": dense_recordcount.value,
}
shape_assignments = " ".join(f"{name}={value}" for name, value in ycsb_shape.items())

# Source the helper script, create the user, then load the sparse and dense tables.
load_script = "\n".join([
    ". ./demo/sqlserver/run-ycsb-sqlserver-source.sh; ",
    "    create_user;",
    f"    {shape_assignments} ",
    f"    load_sparse_data_cnt {sparse_cnt.value};",
    f"    {shape_assignments} ",
    f"    load_dense_data_cnt {dense_cnt.value}",
])

load_run = subprocess.run(load_script, shell=True, executable="/usr/bin/bash", stdout=subprocess.PIPE)
print(load_run.stdout.decode('utf-8'))



real	0m1.810s
user	0m0.140s
sys	0m0.004s

real	0m0.253s
user	0m0.148s
sys	0m0.004s

real	0m1.797s
user	0m0.135s
sys	0m0.008s

real	0m1.769s
user	0m0.135s
sys	0m0.012s

real	0m26.641s
user	0m0.947s
sys	0m0.056s


replicant
24.01.25.1 24.01
PATH=/opt/stage/bin/jsqsh-dist-3.0-SNAPSHOT/bin added
Msg 15025, Level 16, State 1, Server ron, Line 2
The server principal 'arcsrc' already exists.
Msg 1801, Level 16, State 3, Server ron, Line 1
Database 'arcsrc' already exists. Choose a different database name.
Changed database context to 'arcsrc'.
Msg 15023, Level 16, State 5, Server ron, Line 1
User, group, or role 'arcsrc' already exists in the current database.
Starting sparse table 1
/home/rslee/github/dbx/ingestion/demo/sqlserver/config/03_sparsetable.sql
/home/rslee/github/dbx/ingestion/demo/sqlserver/config/03_sparsetable.fmt

Starting copy...
1000000 rows sent to SQL Server. Total sent: 1000000

1000000 rows copied.
Network packet size (bytes): 4096
Clock Time (ms.) Total     : 1788   Average : (559284.1 rows per sec.)
Finished sparse table 1
Starting sparse table 2
Msg 2714, Level 16, State 6, Server ron, Line 2
There is already an object named 'YCSBSPARSE2' in the database.
/home/rslee/github/db


real	0m5.940s
user	0m0.115s
sys	0m0.000s


# Run YCSB and Arcion in the background

## Start/Restart YCSB workload at 1 TPS
1. Adjust the target TPS (transactions per second) via the UI
   1. 0 = as fast as possible
   2. 1 = 1 TPS
   3. 10 = 10 TPS
2. Run YCSB for sparse and dense tables

In [8]:
# TPS and thread controls, one row per table flavour; the VBox is the cell's displayed value.
sparse_rate_row = HBox([Label('Sparse'), sparse_tps, sparse_threads])
dense_rate_row = HBox([Label('Dense'), dense_tps, dense_threads])
VBox([sparse_rate_row, dense_rate_row])

VBox(children=(HBox(children=(Label(value='Sparse'), BoundedIntText(value=1, description='TPS:', max=1000), Bo…

In [9]:
# Rate, thread and field settings read from the widgets, emitted as one line of
# plain shell assignments before start_ycsb.
ycsb_run_settings = {
    "y_target_sparse": sparse_tps.value,
    "y_target_dense": dense_tps.value,
    "y_threads_sparse": sparse_threads.value,
    "y_threads_dense": dense_threads.value,
    "y_fieldcount_sparse": sparse_fieldcount.value,
    "y_fieldcount_dense": dense_fieldcount.value,
    "y_fieldlength_sparse": sparse_fieldlength.value,
    "y_fieldlength_dense": dense_fieldlength.value,
}
run_assignments = " ".join(f"{name}={value}" for name, value in ycsb_run_settings.items())

# Source the helper script, stop any running YCSB, then start it with the new settings.
ycsb_script = "\n".join([
    ". ./demo/sqlserver/run-ycsb-sqlserver-source.sh; ",
    "    kill_ycsb;",
    f"    {run_assignments} ",
    "    start_ycsb;",
])

ycsb_run = subprocess.run(ycsb_script, shell=True, executable="/usr/bin/bash", stdout=subprocess.PIPE)
print(ycsb_run.stdout.decode('utf-8'))

replicant
24.01.25.1 24.01
PATH=/opt/stage/bin/jsqsh-dist-3.0-SNAPSHOT/bin added
YCSBDENSE
dense
ycsb YCSBDENSE pid 447835
ycsb YCSBDENSE log is at /home/rslee/github/dbx/ingestion/demo/sqlserver/logs/ycsb.YCSBDENSE.log
ycsb YCSBDENSE can be killed with . ./demo/sqlserver/run-ycsb-sqlserver-source.sh; kill_ycsb)
YCSBDENSE2
dense
ycsb YCSBDENSE2 pid 447843
ycsb YCSBDENSE2 log is at /home/rslee/github/dbx/ingestion/demo/sqlserver/logs/ycsb.YCSBDENSE2.log
ycsb YCSBDENSE2 can be killed with . ./demo/sqlserver/run-ycsb-sqlserver-source.sh; kill_ycsb)
YCSBSPARSE
sparse
ycsb YCSBSPARSE pid 447854
ycsb YCSBSPARSE log is at /home/rslee/github/dbx/ingestion/demo/sqlserver/logs/ycsb.YCSBSPARSE.log
ycsb YCSBSPARSE can be killed with . ./demo/sqlserver/run-ycsb-sqlserver-source.sh; kill_ycsb)
YCSBSPARSE2
sparse
ycsb YCSBSPARSE2 pid 447876
ycsb YCSBSPARSE2 log is at /home/rslee/github/dbx/ingestion/demo/sqlserver/logs/ycsb.YCSBSPARSE2.log
ycsb YCSBSPARSE2 can be killed with . ./demo/sqlserver/run-yc

## Start Arcion

In [8]:
# Replication/CDC mode selectors on the first row, thread counts on the second;
# the VBox is the cell's displayed value.
arcion_mode_row = HBox([Label('Arcion'), repl_mode, cdc_mode])
arcion_thread_row = HBox([Label('Threads'), snapshot_threads, realtime_threads, delta_threads])
VBox([arcion_mode_row, arcion_thread_row])

VBox(children=(HBox(children=(Label(value='Arcion'), Dropdown(description='Replication:', options=('snapshot',…

In [11]:
import subprocess

# Echo the selected CDC and replication modes so the choice is recorded with the output.
print (f"""{cdc_mode.value} {repl_mode.value}""")

# Source the helper script, change into $PROG_DIR, stop any running Arcion, set the
# replication type and thread counts from the widgets, then call start_<cdc_mode>_arcion.
#
# Fix: the delta-thread line used to read `SRCDB_DELTA={delta_threads.value}_THREADS`,
# which assigned "<n>_THREADS" to SRCDB_DELTA. It now follows the same
# SRCDB_*_THREADS=<n> form as the snapshot and realtime lines.
# NOTE(review): confirm SRCDB_DELTA_THREADS is the name the sourced script reads.
print (subprocess.run(f""". ./demo/sqlserver/run-ycsb-sqlserver-source.sh; 
    echo $PROG_DIR;
    cd $PROG_DIR;
    kill_arcion;
    a_repltype={repl_mode.value} 
    SRCDB_SNAPSHOT_THREADS={snapshot_threads.value} 
    SRCDB_REALTIME_THREADS={realtime_threads.value} 
    SRCDB_DELTA_THREADS={delta_threads.value}
    start_{cdc_mode.value}_arcion;""",
    shell=True,executable="/usr/bin/bash",stdout=subprocess.PIPE).stdout.decode('utf-8'))


change snapshot
replicant
24.01.25.1 24.01
PATH=/opt/stage/bin/jsqsh-dist-3.0-SNAPSHOT/bin added
/home/rslee/github/dbx/ingestion/demo/sqlserver
enable change tracking on database arcsrc
skip ALTER DATABASE arcsrc SET CHANGE_TRACKING = ON  (CHANGE_RETENTION = 2 DAYS, AUTO_CLEANUP = ON);
skip ALTER TABLE replicate_io_audit_ddl ENABLE CHANGE_TRACKING;
skip ALTER TABLE replicate_io_audit_tbl_cons ENABLE CHANGE_TRACKING;
skip ALTER TABLE replicate_io_audit_tbl_schema ENABLE CHANGE_TRACKING;
ALTER TABLE YCSBDENSE ENABLE CHANGE_TRACKING;
ALTER TABLE YCSBDENSE2 ENABLE CHANGE_TRACKING;
ALTER TABLE YCSBSPARSE ENABLE CHANGE_TRACKING;
ALTER TABLE YCSBSPARSE2 ENABLE CHANGE_TRACKING;
ALTER TABLE YCSBSPARSE3 ENABLE CHANGE_TRACKING;
ALTER TABLE YCSBSPARSE4 ENABLE CHANGE_TRACKING;
replicant
arcion pid 448383
arcion log is at /home/rslee/github/dbx/ingestion/demo/sqlserver/logs/arcion.log
arcion can be killed with . ./demo/sqlserver/run-ycsb-sqlserver-source.sh; kill_arcion)



In [26]:

# These lookups go through the `spark` session object. The recorded run of this
# cell failed with NameError because `spark` was not defined in that kernel.
cluster_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")
workspace_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterOwnerOrgId")
# clusterName = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")
workspaceUrl = spark.conf.get("spark.databricks.workspaceUrl")  # host name

http_path = "sql/protocolv1/o/{}/{}".format(workspace_id, cluster_id)

# Both JDBC URLs share everything after the scheme prefix.
jdbc_tail = "{}:443/default;transportMode=http;ssl=1;httpPath={};AuthMech=3".format(workspaceUrl, http_path)
spark_url = "jdbc:spark://" + jdbc_tail
databricks_url = "jdbc:databricks://" + jdbc_tail

NameError: name 'spark' is not defined