# Cost Estimation for using MySQL and MSSQL Fivetran Connectors

## How fivetran pricing is structured
Fivetran structures their pricing based on the number of monthly active rows ingested (MAR). According to their documentation, MAR is defined as "the number of distinct primary keys synced from the source system to your destination in a given calendar month". If a primary key is not available, they create a hashed primary key to use instead.

If a row is synced more than once in a month, it is still only counted as one row. You don't pay multiple times for updates on the same row in the same month.

Initial syncs do not count towards monthly active rows. I.e. the initial full load which syncs historical data will not incur any costs. Most resyncs do not count towards monthly active cost, with the exception of automatic re-syncs of a table that was previously excluded, and for oracle connectors when a resync is triggered due to a table's SCN is no longer being available.

## How this notebook estimates cost
There are three Qlik tasks that we plan to move over to Fivetran:
- MYSQL_ORACLE_SNOWFLAKE
- MYSQL_RISK_SNOWFLAKE
- MYSQL_AIRFLOW_SNOWFLAKE

This script counts the number of rows in all the tables and then computes cost estimates based on the percentage of rows that could be active in any given month. This way, we can see the upper and lower bounds on the additional cost of adding these tables to fivetran.


In [1]:
from api_utils.snowflake_api import Snowflake
import os
import pandas as pd
import matplotlib.pyplot as plt


def get_initial_sync_date(sf, conn, schema, table, date_column):


    initial_sync_query = f"select min({date_column}) as initial_sync_date from landing.{schema}.{table};"
    
    initial_sync_date = sf.execute_query(query=initial_sync_query, conn=conn).fetchone()[0]

    return initial_sync_date

def get_MAR(conn, initial_sync_date, schema, table, date_column):

    MAR_query = f"""
        select sum(rows_replicated), month, year from (
        select 
        case
            when date_trunc('SECOND', {date_column}) = 
            date_trunc('SECOND', to_timestamp('{initial_sync_date}')) then 0 else 1 
        end as rows_replicated,
        month({date_column}) as month,
        year({date_column}) as year
        from landing.{schema}.{table}
        )
        group by month, year;
    """

    cursor = conn.cursor()
    cursor.execute(MAR_query)
    df = cursor.fetch_pandas_all()

    return df

In [2]:
sf = Snowflake(
    user=os.getenv("SNOWFLAKE_USER"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE")
)

conn = sf.get_connection()

schemas_tables_date_columns = {
    "risk": {
        "date_column": "replication_date",
        "tables": [
            "sam_list_excludedaddress",
            "sam_list_excludedcardholder",
            "sam_list_excludedcardnumber",
            "sam_list_merchantexclusion",
            "sam_ref_birulesmapping",
            "sam_ref_customerfields",
            "sam_ref_mismatchrule",
            "sam_ref_pickupfraudrule",
            "sam_ref_postfilterrule",
            "sam_ref_prefilterrule",
            "sam_ref_ruleprofile",
            "sam_ref_ruletype",
            "sam_ref_trxloadschedule",
            "sam_ref_velocityfields",
            "sam_ref_velocityrule"
        ]
    },
    "oracle": {
        "date_column": "replication_date",
        "tables": [
            "oracle_prod_acquirer",
            "oracle_prod_acquirer_bin",
            "oracle_prod_acquirer_bin_mapping",
            "oracle_prod_audit",
            "oracle_prod_country",
            "oracle_prod_country_override",
            "oracle_prod_criteria",
            "oracle_prod_currency",
            "oracle_prod_excluded_fee",
            "oracle_prod_fee",
            "oracle_prod_fee_acquirer_mapping",
            "oracle_prod_fee_action_mapping",
            "oracle_prod_fee_country_mapping",
            "oracle_prod_fee_criteria_mapping",
            "oracle_prod_fee_external_descriptor",
            "oracle_prod_fee_rate",
            "oracle_prod_fee_region_mapping",
            "oracle_prod_fee_scheme_mapping",
            "oracle_prod_global_acquirer_mapping",
            "oracle_prod_industry",
            "oracle_prod_lookup_list",
            "oracle_prod_lookup_list_value",
            "oracle_prod_mastercard_country",
            "oracle_prod_mastercard_public_holiday",
            "oracle_prod_merchant_category_code",
            "oracle_prod_schemaversions",
            "oracle_prod_scheme",
            "oracle_prod_settlement_currencies",
            "oracle_prod_user_account",
            "oracle_prod_visa_country",
            "oracle_prod_visa_public_holiday"

        ]
    },
    "airflow_db": {
        "date_column": "event_timestamp",
        "tables": [
            "ab_permission",
            "ab_permission_view",
            "ab_permission_view_role",
            "ab_register_user",
            "ab_role",
            "ab_user",
            "ab_user_role",
            "ab_view_menu",
            "alembic_version",
            "callback_request",
            "celery_tasksetmeta",
            "celery_taskmeta",
            "connection",
            "dag",
            "dag_code",
            "dag_pickle",
            "dag_run", 
            "dag_tag",
            "enabled_dags",
            "import_error",
            "job",
            "known_event",
            "known_event_type",
            "log",
            "log_template",
            "rendered_task_instance_fields",
            "sensor_instance",
            "serialized_dag",
            "session",
            "sla_miss",
            "slot_pool",
            "task_fail",
            "task_instance",
            "task_map",
            "task_reschedule",
            "variable",
            "xcom"
        ]
    },
}

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [3]:
for schema, info in schemas_tables_date_columns.items():
    date_column = info["date_column"]
    tables = info["tables"]
    df = None
    n_rows = 0
    for table in tables:
        query = f"select count(*) from landing.{schema}.{table}"
        n_rows += sf.execute_query(query=query, conn=conn).fetchone()[0]

conn.close()

print("total number of rows in tables: ", n_rows)

total number of rows in tables:  25788361


### create df with percentages of this number, the new MAR, the new cost, and the cost difference

In [4]:
percents = [n*5 for n in range(0, 21)]
MARs = [n*5*0.01*n_rows for n in range(0,21)]
# current MAR on average is approx 30,000,000
currentMAR = [30000000 for n in range(0,21)]
df = pd.DataFrame(
    {
        "Percent": percents,
        "MAR": MARs,
        "currentMAR": currentMAR
    }
)

df["projectedMAR"] = df.apply(lambda row: (row.MAR + row.currentMAR), axis=1)
df

Unnamed: 0,Percent,MAR,currentMAR,projectedMAR
0,0,0.0,30000000,30000000.0
1,5,1289418.05,30000000,31289418.05
2,10,2578836.1,30000000,32578836.1
3,15,3868254.15,30000000,33868254.15
4,20,5157672.2,30000000,35157672.2
5,25,6447090.25,30000000,36447090.25
6,30,7736508.3,30000000,37736508.3
7,35,9025926.35,30000000,39025926.35
8,40,10315344.4,30000000,40315344.4
9,45,11604762.45,30000000,41604762.45


In [5]:
# add new column which estimates $ per million MARs
dollarsPerMillionMARs = [130-n for n in range(0,21)]
df["dollarsPerMillionMARs"] = dollarsPerMillionMARs
df["currentCost"] = df.apply(lambda row: (row.currentMAR/10**6)*row.dollarsPerMillionMARs, axis=1)
df["projectedCost"] = df.apply(lambda row: (row.projectedMAR/10**6)*row.dollarsPerMillionMARs, axis=1)
df["projectedAdditionalSpend"] = df.apply(lambda row: abs(row.projectedCost-row.currentCost), axis=1)

df

Unnamed: 0,Percent,MAR,currentMAR,projectedMAR,dollarsPerMillionMARs,currentCost,projectedCost,projectedAdditionalSpend
0,0,0.0,30000000,30000000.0,130,3900.0,3900.0,0.0
1,5,1289418.05,30000000,31289418.05,129,3870.0,4036.334928,166.334928
2,10,2578836.1,30000000,32578836.1,128,3840.0,4170.091021,330.091021
3,15,3868254.15,30000000,33868254.15,127,3810.0,4301.268277,491.268277
4,20,5157672.2,30000000,35157672.2,126,3780.0,4429.866697,649.866697
5,25,6447090.25,30000000,36447090.25,125,3750.0,4555.886281,805.886281
6,30,7736508.3,30000000,37736508.3,124,3720.0,4679.327029,959.327029
7,35,9025926.35,30000000,39025926.35,123,3690.0,4800.188941,1110.188941
8,40,10315344.4,30000000,40315344.4,122,3660.0,4918.472017,1258.472017
9,45,11604762.45,30000000,41604762.45,121,3630.0,5034.176256,1404.176256
