## Create unified reference data. 
Last updated by Developer 2025-01-09

In [None]:
import os
import datetime
from google.cloud import bigquery

# Project that owns the datasets and views created in this notebook.
project = "eng-reactor-287421"

# Point the Google client libraries at the service-account key; this is set
# before the BigQuery client is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/user/base/ficc/creds.json"
# Alternative credentials path (disabled):
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/jupyter/creds.json"

# Shared client used by mkview() and sqltodf().
bq_client = bigquery.Client()

In [None]:
from datetime import date
def mkview(dataset, name, sql):
    """Create (or re-create) a BigQuery view and return its qualified name.

    Args:
        dataset: Dataset, inside the module-level ``project``, that holds the view.
        name: Unqualified name of the view.
        sql: Query text that defines the view.

    Returns:
        The ``project.dataset.name`` identifier of the view that was created.
    """
    qualified_name = f"{project}.{dataset}." + name

    # Drop any previous version so the view is always rebuilt from `sql`.
    bq_client.delete_table(qualified_name, not_found_ok=True)

    view_definition = bigquery.Table(qualified_name)
    view_definition.view_query = sql
    bq_client.create_table(view_definition)
    return qualified_name

In [None]:
def sqltodf(sql, limit=""):
    """Run a query with the shared BigQuery client and return a DataFrame.

    Args:
        sql: Query text to execute.
        limit: When not the empty string, `` ORDER BY RAND() LIMIT <limit>`` is
            appended to ``sql`` so the result is a random sample of that size.

    Returns:
        The query result converted with ``to_dataframe()``.
    """
    sampling_clause = f" ORDER BY RAND() LIMIT {limit}" if limit != "" else ""
    query_job = bq_client.query(sql + sampling_clause)
    return query_job.result().to_dataframe()

In [None]:
unified = mkview("reference_data_v2", "unified", f'''
SELECT
  file_received_from_provider_timestamp,
  upload_datetime,
  TIMESTAMP(ref_valid_from_date) old_valid_from_date,
  TIMESTAMP(ref_valid_to_date) old_valid_to_date,
  CAST(coupon AS NUMERIC) AS coupon,
  security_description,
  cusip,
  TIMESTAMP(file_received_from_sp_date) AS file_date,
  incorporated_state_code,
  organization_primary_name,
  instrument_primary_name,
  CAST(issue_key AS STRING) AS issue_key,
  -- Ensure issue_key is consistently a STRING
  issue_text,
  conduit_obligor_name,
  is_called,
  is_callable,
  is_escrowed_or_pre_refunded,
  first_call_date,
  call_date_notice,
  callable_at_cav,
  CAST(par_price AS NUMERIC) AS par_price,
  -- Ensure numeric consistency
  CAST(call_defeased AS STRING) AS call_defeased,
  -- Ensure consistent type
  call_timing,
  call_timing_in_part,
  extraordinary_make_whole_call,
  extraordinary_redemption,
  make_whole_call,
  next_call_date,
  CAST(next_call_price AS NUMERIC) AS next_call_price,
  -- Ensure numeric consistency
  call_redemption_id,
  par_call_date,
  CAST(par_call_price AS NUMERIC) AS par_call_price,
  -- Ensure numeric consistency
  called_redemption_type,
  muni_issue_type,
  refund_date,
  CAST(refund_price AS NUMERIC) AS refund_price,
  -- Ensure numeric consistency
  redemption_cav_flag,
  max_notification_days,
  min_notification_days,
  next_put_date,
  put_end_date,
  CAST(put_feature_price AS NUMERIC) AS put_feature_price,
  -- Ensure numeric consistency
  put_frequency,
  put_start_date,
  put_type,
  maturity_date,
  has_sink_schedule,
  next_sink_date,
  sink_indicator,
  sink_frequency,
  CAST(sink_defeased AS STRING) AS sink_defeased,
  -- Ensure consistent type
  sink_amount_type,
  additional_sink_frequency,
  min_amount_outstanding,
  max_amount_outstanding,
  default_exists,
  has_unexpired_lines_of_credit,
  years_to_loc_expiration,
  escrow_exists,
  CAST(escrow_obligation_percent AS NUMERIC) AS escrow_obligation_percent,
  -- Ensure numeric consistency
  CAST(escrow_obligation_agent AS STRING) AS escrow_obligation_agent,
  -- Ensure consistent type
  escrow_obligation_type,
  --child_linkage_exists,
  put_exists,
  floating_rate_exists,
  bond_insurance_exists,
  is_general_obligation,
  has_zero_coupons,
  delivery_date,
  CAST(issue_price AS NUMERIC) AS issue_price,
  -- Ensure numeric consistency
  primary_market_settlement_date,
  issue_date,
  outstanding_indicator,
  federal_tax_status,
  CAST(maturity_amount AS NUMERIC) AS maturity_amount,
  -- Ensure numeric consistency
  CAST(available_denom AS NUMERIC) AS available_denom,
  -- Ensure numeric consistency
  CAST(denom_increment_amount AS NUMERIC) AS denom_increment_amount,
  -- Ensure numeric consistency
  CAST(min_denom_amount AS NUMERIC) AS min_denom_amount,
  -- Ensure numeric consistency
  accrual_date,
  CAST(bond_insurance AS STRING) AS bond_insurance,
  -- Ensure consistent type
  coupon_type,
  CAST(current_coupon_rate AS NUMERIC) AS current_coupon_rate,
  -- Ensure numeric consistency
  daycount_basis_type,
  debt_type,
  default_indicator,
  first_coupon_date,
  interest_payment_frequency,
  CAST(issue_amount AS NUMERIC) AS issue_amount,
  -- Ensure numeric consistency
  last_period_accrues_from_date,
  next_coupon_payment_date,
  odd_first_coupon_date,
  CAST(orig_principal_amount AS NUMERIC) AS orig_principal_amount,
  -- Ensure numeric consistency
  CAST(original_yield AS NUMERIC) AS original_yield,
  -- Ensure numeric consistency
  CAST(outstanding_amount AS NUMERIC) AS outstanding_amount,
  -- Ensure numeric consistency
  previous_coupon_payment_date,
  sale_type,
  settlement_type,
  CAST(additional_project_txt AS STRING) AS additional_project_txt,
  -- Ensure consistent type
  asset_claim_code,
  cast(additional_state_code as string) additional_state_code,
  backed_underlying_security_id,
  cast(bank_qualified as boolean) bank_qualified,
  capital_type,
  -- conditional_call_date,
  -- CAST(conditional_call_price AS NUMERIC) AS conditional_call_price,
  -- Ensure numeric consistency
-- designated_termination_date,
  DTCC_status,
  -- first_execution_date,
  -- formal_award_date,
  muni_security_type,
  maturity_description_code,
  cast(mtg_insurance as string) mtg_insurance,
  orig_cusip_status,
  orig_instrument_enhancement_type,
  other_enhancement_type,
  cast(other_enhancement_company as string) other_enhancement_company,
  --pac_bond_indicator,
  cast(project_name as string) project_name,
  purpose_class,
  purpose_sub_class,
  refunding_issue_key,
 -- refunding_dated_date,
  --sale_date,
  sec_regulation,
  secured,
  cast(series_name as string) series_name,
  sink_fund_redemption_method,
  state_tax_status,
  tax_credit_frequency,
  CAST(tax_credit_percent AS NUMERIC) AS tax_credit_percent,
  -- Ensure numeric consistency
  use_of_proceeds,
  sp_long
FROM
  `reference_data_v2.reference_data_flat`''') 
print(unified)
%time df = sqltodf(f"SELECT * FROM {unified}", 3)
df

In [None]:
unified_valid_time = mkview('reference_data_v2', 'unified_valid_time', f'''SELECT
CASE
    WHEN ROW_NUMBER() OVER (PARTITION BY cusip ORDER BY file_date ASC) = 1 THEN TIMESTAMP("2010-01-01 00:00:00")
    ELSE file_date
END
  AS valid_from_date,
  CASE
    WHEN ROW_NUMBER() OVER (PARTITION BY cusip ORDER BY file_date DESC) = 1 THEN TIMESTAMP("2100-01-01 00:00:00")
    ELSE TIMESTAMP_SUB(LEAD(file_date) OVER (PARTITION BY cusip ORDER BY file_date ASC), INTERVAL 1 SECOND )
END
  AS valid_to_date,
  CONCAT(cusip, CAST(file_date AS STRING)) AS KEY,
  *
FROM 
    eng-reactor-287421.reference_data_v2.unified''')
print(unified_valid_time)
%time df = sqltodf(f"SELECT * FROM {unified_valid_time}", 3)
df