In [6]:
%pip install --quiet databricks-sdk~=0.28.0 pymssql sqlalchemy jinja2 pandas ipywidgets
try:
    dbutils.library.restartPython()
except:
    pass

Note: you may need to restart the kernel to use updated packages.


In [None]:
from typing import Dict
from collections import defaultdict
import sqlalchemy as sa, pandas as pd, getpass, jinja2
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog, jobs, pipelines
# Remove pandas' display truncation: show every column and the full width of each cell.
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [92]:
# Connection settings: each value is prompted for only when it is not already
# defined (or is empty) in the notebook namespace, so the cell can be pre-seeded
# by a caller and safely re-run.
if "source_host_name" not in vars() or not source_host_name: source_host_name=input("source_host_name")
if "source_port" not in vars() or not source_port: source_port=input("source_port")
if "source_catalog_name" not in vars() or not source_catalog_name: source_catalog_name=input("source_catalog_name")
if "source_schema_name" not in vars() or not source_schema_name: source_schema_name=input("source_schema_name")
# This is the non-DBA replication user, so the prompt is labelled "user" (it used to say "dba").
if "source_user_name" not in vars() or not source_user_name: source_user_name=input("user source_user_name")
if "source_dba_name" not in vars() or not source_dba_name: source_dba_name=input("dba source_dba_name")
if "source_user_password" not in vars() or not source_user_password: source_user_password=getpass.getpass("user source_password")
if "source_dba_password" not in vars() or not source_dba_password: source_dba_password=getpass.getpass("dba source_password")


def _mssql_url(user_name, password, database):
    """Return a `mssql+pymssql` connection URL string for `database` on the source host.

    The URL is assembled with SQLAlchemy's URL builder rather than an f-string so
    that special characters in the user name or password (`@`, `/`, `:` ...) are
    percent-encoded instead of corrupting the URL.
    """
    return sa.engine.URL.create(
        "mssql+pymssql",
        username=user_name,
        password=password,
        host=source_host_name,
        # input() yields a string; an empty answer falls back to the driver default port
        port=int(source_port) if source_port else None,
        database=database,
    ).render_as_string(hide_password=False)


def _make_engine(url):
    """Create an AUTOCOMMIT engine with the pool/timeout settings shared by all three connections."""
    return sa.create_engine(url, pool_size=20, max_overflow=0, pool_pre_ping=True, isolation_level="AUTOCOMMIT", connect_args={"timeout": 60})


# DBA on master, DBA on the source catalog, and the regular user on the source catalog.
master_dba_url=_mssql_url(source_dba_name, source_dba_password, "master")
user_dba_url=_mssql_url(source_dba_name, source_dba_password, source_catalog_name)
user_user_url=_mssql_url(source_user_name, source_user_password, source_catalog_name)

master_dba_engine = _make_engine(master_dba_url)
user_dba_engine = _make_engine(user_dba_url)
user_user_engine = _make_engine(user_user_url)

# Long-lived connections reused by the cells below.
master_dba_conn = master_dba_engine.connect()
user_dba_conn = user_dba_engine.connect()
user_user_conn = user_user_engine.connect()

In [109]:
# tabulate tables per schema
# s_t_filter maps schema name -> set of table names; an empty set means
# "no table restriction for this schema". The default source schema is always present.
s_t_filter={source_schema_name: set()}
for s_or_t in tables_to_replicate if "tables_to_replicate" in vars() else []:
    if s_or_t.schema:
        s_t_filter.setdefault(s_or_t.schema.source_schema, set())
    elif s_or_t.table:
        s_t_filter.setdefault(s_or_t.table.source_schema, set()).add(s_or_t.table.source_table)
    else:
        raise Exception(f"expecting either schema or table. found {s_or_t}")


def _tsql_literal(value):
    """Render `value` as a single-quoted T-SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


# build where clause
# Schemas and tables are emitted in sorted order so the generated SQL is identical across runs.
s_t_filter_sql_list=[]
for s_name in sorted(s_t_filter):
    schema_filter = f"table_schema={_tsql_literal(s_name)}"
    table_filter = ",".join(_tsql_literal(table_name) for table_name in sorted(s_t_filter[s_name]))
    if table_filter: s_t_filter_sql_list.append(f"({schema_filter} and table_name in ({table_filter}))")
    else: s_t_filter_sql_list.append(f"({schema_filter})")
# Rendered as an extra "and (...)" predicate; empty string when there is nothing to filter on.
s_t_filter_sql = ("and (" + " or ".join(s_t_filter_sql_list) + ")") if s_t_filter_sql_list else ""

In [110]:
# tsql scripts

# Database name prepared for use inside a single-quoted T-SQL literal (embedded quotes doubled).
_cdc_enable_db_name = str(source_catalog_name).replace("'", "''")

# Enables CDC on the current database. Reports 'CDC already enabled' when
# sys.databases already flags it; otherwise calls the first enabling procedure
# found: sys.sp_cdc_enable_db, the Cloud SQL wrapper, or the RDS wrapper.
# The wrapper procedures receive the source catalog name as a quoted argument
# (the RDS branch previously passed the literal placeholder '<database-name>').
# NOTE(review): the first probe looks in sys.objects — confirm that view lists
# sp_cdc_enable_db on self-managed SQL Server, otherwise that branch is never taken.
cdc_enable_on_db_tsql=f"""
-- enable CDC
if exists (select name, is_cdc_enabled from sys.databases where name=db_name() and is_cdc_enabled=1)
    BEGIN
        select 'CDC already enabled'
    END
ELSE
    BEGIN
        if exists(SELECT name, schema_name(schema_id) FROM sys.objects WHERE type = 'P' and name='sp_cdc_enable_db')
            BEGIN
                select 'enable cdc on database sys.sp_cdc_enable_db';
                EXEC sys.sp_cdc_enable_db
            END
        else if exists(SELECT name, schema_name(schema_id) FROM msdb.sys.objects WHERE type = 'P' and name='gcloudsql_cdc_enable_db')
            BEGIN
                select 'enable cdc on database msdb.dbo.gcloudsql_cdc_enable_db';
                EXEC msdb.dbo.gcloudsql_cdc_enable_db '{_cdc_enable_db_name}';
            END
        else if exists(SELECT name, schema_name(schema_id) FROM msdb.sys.objects WHERE type = 'P' and name='rds_cdc_enable_db')
            BEGIN
                select 'enable cdc on database msdb.dbo.rds_cdc_enable_db';
                EXEC msdb.dbo.rds_cdc_enable_db '{_cdc_enable_db_name}'
            END
        else
            BEGIN
                select 'cdc enable stored proc does not exist'
            END
    END
"""

# Database name prepared for use inside a single-quoted T-SQL literal (embedded quotes doubled).
_cdc_disable_db_name = str(source_catalog_name).replace("'", "''")

# Disables CDC on the current database. Reports 'CDC already disabled' when
# sys.databases does not flag it; otherwise calls the first disabling procedure
# found: sys.sp_cdc_disable_db, the Cloud SQL wrapper, or the RDS wrapper.
# The first branch now probes for (and reports) sp_cdc_disable_db, the procedure
# it actually executes, and the RDS branch receives the real catalog name
# instead of the literal placeholder '<database-name>'.
# NOTE(review): the first probe looks in sys.objects — confirm that view lists
# sp_cdc_disable_db on self-managed SQL Server, otherwise that branch is never taken.
cdc_disable_on_db_tsql=f"""
-- disable CDC
if not exists (select name, is_cdc_enabled from sys.databases where name=db_name() and is_cdc_enabled=1)
    BEGIN
        select 'CDC already disabled'
    END
ELSE
    BEGIN
        if exists(SELECT name, schema_name(schema_id) FROM sys.objects WHERE type = 'P' and name='sp_cdc_disable_db')
            BEGIN
                select 'disable cdc on database sys.sp_cdc_disable_db';
                EXEC sys.sp_cdc_disable_db
            END
        else if exists(SELECT name, schema_name(schema_id) FROM msdb.sys.objects WHERE type = 'P' and name='gcloudsql_cdc_disable_db')
            BEGIN
                select 'disable cdc on database msdb.dbo.gcloudsql_cdc_disable_db';
                EXEC msdb.dbo.gcloudsql_cdc_disable_db '{_cdc_disable_db_name}';
            END
        else if exists(SELECT name, schema_name(schema_id) FROM msdb.sys.objects WHERE type = 'P' and name='rds_cdc_disable_db')
            BEGIN
                select 'disable cdc on database msdb.dbo.rds_cdc_disable_db';
                EXEC msdb.dbo.rds_cdc_disable_db '{_cdc_disable_db_name}'
            END
        else
            BEGIN
                select 'cdc disable stored proc does not exist'
            END
    END
"""

# Bracket-quote the database identifier so names containing spaces, hyphens, etc.
# are accepted by ALTER DATABASE. `]` is doubled for the bracket quoting and `'`
# is doubled because the statement is itself embedded in a quoted exec('...') string.
_ct_db_ident = ("[" + str(source_catalog_name).replace("]", "]]") + "]").replace("'", "''")

# Enables change tracking (3-day retention, auto cleanup) on the source catalog
# unless sys.change_tracking_databases already lists the current database.
ct_enable_on_db_tsql=f"""
-- enable CT
if exists (select * from sys.change_tracking_databases where database_id=db_id())
    BEGIN
        select 'CT already enabled'
    END
else
    BEGIN
        select 'enable ct on database';
        exec ('ALTER DATABASE {_ct_db_ident} SET CHANGE_TRACKING = ON (CHANGE_RETENTION = 3 DAYS, AUTO_CLEANUP = ON)');
    END
"""

# tsql
# Cursor-loop fragment that the cdc_cd_tsql template appends after its SELECT when
# rendered with run_cdc_ct=True. It does not stand alone: MyCursor and the
# @TABLE_CAT/@TABLE_SCHEM/@TABLE_NAME/@PK/@CDC/@CT variables are declared by that template.
# For every fetched table row:
#   - has PK, CT not enabled   -> enable change tracking on the table
#   - no PK,  CDC not enabled  -> enable CDC on the table (sys.sp_cdc_enable_table)
#   - has PK, CDC enabled      -> disable CDC on the table (all capture instances)
#   - no PK,  CT enabled       -> disable change tracking on the table
# The cursor is closed and deallocated at the end and a one-row status message is returned.
cdc_ct_enable_tsql="""
OPEN MyCursor
FETCH NEXT FROM MyCursor INTO @TABLE_CAT, @TABLE_SCHEM, @TABLE_NAME, @PK, @CDC, @CT
WHILE @@FETCH_STATUS = 0
BEGIN
	if @PK is not NULL and @CT is NULL
		exec('ALTER TABLE ['+@TABLE_SCHEM+'].['+@TABLE_NAME+'] ENABLE CHANGE_TRACKING WITH (TRACK_COLUMNS_UPDATED = ON)')
	if @PK is NULL and @CDC is NULL
		exec sys.sp_cdc_enable_table @source_schema = @TABLE_SCHEM, @source_name = @TABLE_NAME,  @role_name = NULL, @supports_net_changes = 0
	if @PK is NOT NULL and @CDC is NOT NULL
		exec sys.sp_cdc_disable_table @source_schema = @TABLE_SCHEM, @source_name = @TABLE_NAME,  @capture_instance = 'all'
	if @PK is NULL and @CT is NOT NULL -- should never happen but just in case
		exec('ALTER TABLE ['+@TABLE_SCHEM+'].['+@TABLE_NAME+'] DISABLE CHANGE_TRACKING')
	FETCH NEXT FROM MyCursor INTO @TABLE_CAT, @TABLE_SCHEM, @TABLE_NAME, @PK, @CDC, @CT
END
CLOSE MyCursor;
DEALLOCATE MyCursor;
SELECT 'CDC / CT enabled for tables'
"""

# Jinja2 template producing the per-table status query (one row per base table
# with PK / CDC / CT columns that are NULL when the feature is absent).
# Render arguments:
#   s_t_filter_sql     - extra "and (...)" predicate restricting schemas/tables
#   run_cdc_ct         - when true, the query feeds a cursor and
#                        cdc_ct_enable_tsql is appended to act on each row
#   cdc_ct_enable_tsql - cursor loop body inserted when run_cdc_ct is true
#   source_schema_name - still accepted for backward compatibility but unused;
#                        filtering is driven entirely by s_t_filter_sql
# Fixes relative to the previous version:
#   * pk/ct/cdc are no longer restricted to a single schema, so tables from every
#     schema selected by s_t_filter_sql get correct PK/CDC/CT flags (the join
#     against `tab` already limits the result to the requested tables).
#   * pk uses DISTINCT, so a composite primary key no longer yields duplicate rows
#     (which made the cursor run the same ALTER twice).
#   * the cursor is LOCAL STATIC: it is released with the batch even if a
#     statement fails, and iterates over a snapshot of the pre-change state.
cdc_cd_tsql=jinja2.Template("""
BEGIN
DECLARE @TABLE_CAT nvarchar(128), @TABLE_SCHEM nvarchar(128), @TABLE_NAME nvarchar(128), @PK nvarchar(128), @CT nvarchar(128), @CDC nvarchar(128);
{% if run_cdc_ct %}
DECLARE MyCursor CURSOR LOCAL STATIC FOR
{% endif %}
with 
tab as (
	select table_catalog TABLE_CAT, table_schema TABLE_SCHEM, table_name TABLE_NAME 
	from information_schema.tables 
	where table_type='BASE TABLE'
	and table_name not in ('MSchange_tracking_history', 'systranschemas')
	{{s_t_filter_sql}}
	)
, pk as (
	-- PRIMARY KEY TABLES (one row per table, even for composite keys)
    SELECT DISTINCT
        tc.constraint_catalog as TABLE_CAT, tc.constraint_schema as TABLE_SCHEM, tc.table_name as TABLE_NAME 
    FROM information_schema.table_constraints tc 
    JOIN information_schema.constraint_column_usage AS ccu 
        on tc.constraint_schema = ccu.constraint_schema and tc.constraint_name = ccu.constraint_name 
    JOIN information_schema.columns AS c 
        ON c.table_schema = tc.constraint_schema AND tc.table_name = c.table_name AND ccu.column_name = c.column_name
    where tc.constraint_type='PRIMARY KEY'
    )
, ct as (    
    -- CT enabled tables
    select db_name() TABLE_CAT, schema_name(t.schema_id) TABLE_SCHEM, t.name TABLE_NAME  
    from sys.change_tracking_tables ctt 
    join sys.tables t on ctt.object_id = t.object_id
)
, cdc as (
    -- CDC enabled table
    select db_name() TABLE_CAT, s.name TABLE_SCHEM, t.name as TABLE_NAME 
    from sys.tables t
    join sys.schemas s on t.schema_id = s.schema_id
    where t.is_tracked_by_cdc=1
)
select tab.TABLE_CAT, tab.TABLE_SCHEM, tab.TABLE_NAME, pk.TABLE_NAME PK, cdc.TABLE_NAME CDC, ct.TABLE_NAME CT 
from tab
left join pk  on pk.TABLE_CAT=tab.TABLE_CAT  and pk.TABLE_SCHEM=tab.TABLE_SCHEM  and pk.TABLE_NAME=tab.TABLE_NAME
left join ct  on ct.TABLE_CAT=tab.TABLE_CAT  and ct.TABLE_SCHEM=tab.TABLE_SCHEM  and ct.TABLE_NAME=tab.TABLE_NAME
left join cdc on cdc.TABLE_CAT=tab.TABLE_CAT and cdc.TABLE_SCHEM=tab.TABLE_SCHEM and cdc.TABLE_NAME=tab.TABLE_NAME
{% if run_cdc_ct %}
{{cdc_ct_enable_tsql}}
{% endif %}
END
""")

# Render the template twice from the same inputs: once as a read-only status
# query (run_cdc_ct=False) and once with the cursor loop that alters tables.
_render_args = dict(
    source_schema_name=source_schema_name,
    cdc_ct_enable_tsql=cdc_ct_enable_tsql,
    s_t_filter_sql=s_t_filter_sql,
)
sql_cmd_status, sql_cmd_alter = (
    sa.text(cdc_cd_tsql.render(run_cdc_ct=_with_alter, **_render_args))
    for _with_alter in (False, True)
)


# enable cdc/ct

In [113]:
# Run, in order: enable CDC on the database, enable CT on the database, show the
# per-table status, apply the per-table CDC/CT changes, then show the status again.
for _statement in (
    cdc_enable_on_db_tsql,
    ct_enable_on_db_tsql,
    sql_cmd_status,
    sql_cmd_alter,
    sql_cmd_status,
):
    display(pd.read_sql(_statement, user_dba_conn))

Unnamed: 0,Unnamed: 1
0,CDC already enabled


Unnamed: 0,Unnamed: 1
0,CT already enabled


Unnamed: 0,TABLE_CAT,TABLE_SCHEM,TABLE_NAME,PK,CDC,CT
0,arcsrc,robert_lee,pk,pk,,pk


Unnamed: 0,Unnamed: 1
0,CDC / CT enabled for tables


Unnamed: 0,TABLE_CAT,TABLE_SCHEM,TABLE_NAME,PK,CDC,CT
0,arcsrc,robert_lee,pk,pk,,pk


# disable cdc/ct

In [None]:
# Teardown calls, left commented out; uncomment to run cdc_disable_on_db_tsql against the source catalog.
# pd.read_sql(cdc_disable_on_db_tsql, user_dba_conn)
# NOTE(review): `ctc_disable_on_db_tsql` is not defined anywhere in the visible notebook —
# a CT-disable script must be defined before the next line can be uncommented.
# pd.read_sql(ctc_disable_on_db_tsql, user_dba_conn)

Unnamed: 0,Unnamed: 1
0,disable cdc on database msdb.dbo.gcloudsql_cdc_disable_db
