## Initial Setup Before Monitoring
---

## We will run this notebook twice.

* [ ] Update config
    * "static_sample_dates", "date_sample_start", "date_sample_end", "number_periods_in_sample"
* [ ] Query data
    * [ ] Production
    * [ ] Development
    * [ ] Previous refitting/validation data

In [1]:
# Helper 1 - get sample_dates
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

from smart_open import open

In [2]:
# --- Monitoring run parameters: review every value before each run ---

# Monitoring month (YYYY-MM); one of 03, 06, 09, 12.
mmonth = "2021-12"

# Test set from the previous refit/validation; only change after a model refit.
prev_test_df_path = "s3://sofi-data-science/jxu/monitoring/customer-risk-v1/2021-06/test_df_2020-01-01.parquet"

# Sampling window of the refit.
date_sample_start = "2019-01-16"
date_sample_end = "2020-12-31"

# Monitoring window; one monitoring date is generated per month inside it.
monitor_start_date = "2021-07-01"
monitor_end_date = "2021-12-31"

# Production data date(s); insert the current date here.
prod_data_dates = ["2021-12-31"]

# Remote output location, keyed by the monitoring month.
data_output_path = f"s3://sofi-data-science/jxu/monitoring/customer-risk-v1/{mmonth}/"

# Where the monitoring result is stored locally.
# The quarter in this path is hard-coded; keep it in sync with `mmonth`.
artifact_path = "../../artifacts/monitoring-2021Q4/"

# Models to compare, keyed by the label used for each of them.
model_paths = {
    "incumbent_model": "s3://sofi-data-science/jxu/money-risk-models/customer-risk-model/models/customer_risk_target_no_giact_time_since_last_link.pkl",
    "customer_refitted_2021Q1": "s3://sofi-data-science/jxu/money-risk-models/models/customer_risk_refit_20201231_wo_ind.pkl",
}

In [3]:
# Development dataset (feather file); produce it using the data pipeline.
dev_df_path = "../../data/combined_all_features/combined_1640842189.feather"

### config["static_sample_dates"]

In [4]:
# Read the shared config file into a dict.
# `open` here is the one imported from smart_open in the first cell.
with open("../../config.json", "r") as config_file:
    config = json.load(config_file)

# Show the sampling dates currently stored in the config.
config["static_sample_dates"]

['2019-03-15',
 '2019-05-27',
 '2019-08-08',
 '2019-10-20',
 '2020-01-01',
 '2020-03-14',
 '2020-05-26',
 '2020-08-07',
 '2020-10-19',
 '2020-12-31',
 '2021-07-01',
 '2021-08-01',
 '2021-09-01',
 '2021-10-01',
 '2021-11-01',
 '2021-12-01']

In [5]:
# Sampling dates as currently stored in the config (before this notebook updates them).
dates = config["static_sample_dates"]

In [6]:
def get_spacing(dates):
    """Count how often each gap between consecutive dates occurs.

    Use this to find the spacing used by previous sampling methods.

    Parameters
    ----------
    dates : list-like
        Dates in any form accepted by ``pd.to_datetime``, in the order
        in which the gaps should be measured.

    Returns
    -------
    pd.Series
        Indexed by the gap between neighbouring dates, with the number of
        times that gap occurs as values. The undefined gap of the first
        entry is not counted.
    """
    parsed = pd.Series(pd.to_datetime(dates))
    gaps = parsed.diff()
    return gaps.value_counts()

def get_sampling_dates(start, end, freq):
    """Build the static sampling dates from ``start`` to ``end``.

    Parameters
    ----------
    start, end : str or datetime-like
        Bounds of the range (both inclusive). Any time-of-day on ``end``
        is dropped before the range is built.
    freq : str
        Spacing between consecutive dates as a pandas frequency string,
        e.g. ``"73D"``.

    Returns
    -------
    list of str
        Dates formatted as ``YYYY-MM-DD``.
    """
    first = pd.to_datetime(start)
    last = pd.to_datetime(end).normalize()
    return [stamp.date().isoformat() for stamp in pd.date_range(first, last, freq=freq)]

def get_monitoring_dates(start, end="today"):
    """Return the first day of every month covered by ``start`` .. ``end``.

    We get monitoring dfs by looking at the first day of every month.

    The months are generated directly as month starts. Stepping through the
    window on a fixed 15-day grid anchored at ``start`` can overshoot ``end``
    and silently drop the last month (e.g. start 2021-07-20, end 2021-12-01
    would lose 2021-12-01).

    Parameters
    ----------
    start : str or datetime-like
        Beginning of the monitoring window. The month containing ``start``
        is always reported by its first day, even when ``start`` itself
        falls later in that month.
    end : str or datetime-like, default "today"
        End of the monitoring window (inclusive).

    Returns
    -------
    list of str
        Sorted month-start dates formatted as ``YYYY-MM-DD``; empty when
        ``start`` lies after ``end``.
    """
    # Only the calendar day matters, so drop any time-of-day on both bounds.
    start = pd.to_datetime(start).normalize()
    end = pd.to_datetime(end).normalize()

    # An inverted window yields nothing; without this guard the month of
    # `start` could still be emitted after snapping it back to day 1.
    if start > end:
        return []

    month_starts = pd.date_range(start.replace(day=1), end, freq="MS")
    return [stamp.date().isoformat() for stamp in month_starts]

In [7]:
# Inspect the gaps between the sampling dates stored in the config;
# the most frequent gap is the spacing used for the static sample dates.
dates = config["static_sample_dates"]
get_spacing(dates)

73 days     9
31 days     3
30 days     2
182 days    1
dtype: int64

In [8]:
# Static sampling dates for the refit period: 73-day steps from the first
# sample date up to the end of the refit window.
sample_dates = get_sampling_dates(start="2019-03-15", end=date_sample_end, freq="73D")

# One monitoring date per month of the monitoring window.
monitoring_dates = get_monitoring_dates(start=monitor_start_date, end=monitor_end_date)

# Union of both lists, de-duplicated and in chronological order.
static_sample_dates = sorted(set(sample_dates) | set(monitoring_dates))

In [9]:
# Sampling dates generated with the 73-day spacing.
sample_dates

['2019-03-15',
 '2019-05-27',
 '2019-08-08',
 '2019-10-20',
 '2020-01-01',
 '2020-03-14',
 '2020-05-26',
 '2020-08-07',
 '2020-10-19',
 '2020-12-31']

In [10]:
# Dates used for monitoring: double-check that every month of the
# monitoring window appears exactly once.
monitoring_dates

['2021-07-01',
 '2021-08-01',
 '2021-09-01',
 '2021-10-01',
 '2021-11-01',
 '2021-12-01']

In [11]:
# Combined list (sampling dates + monitoring dates), sorted and de-duplicated.
static_sample_dates

['2019-03-15',
 '2019-05-27',
 '2019-08-08',
 '2019-10-20',
 '2020-01-01',
 '2020-03-14',
 '2020-05-26',
 '2020-08-07',
 '2020-10-19',
 '2020-12-31',
 '2021-07-01',
 '2021-08-01',
 '2021-09-01',
 '2021-10-01',
 '2021-11-01',
 '2021-12-01']

### config["number_periods_in_sample"]
---
During development we sampled about 15 records for dynamic sampling over one year's worth of data.

If we'd like to maintain the same ratio, config["number_periods_in_sample"] needs to be scaled to the length of the sampling window.

In [12]:
def get_updated_number_periods_in_sample(start, end, n_start):
    """Scale a per-year number of sampling periods to the window ``start`` .. ``end``.

    Parameters
    ----------
    start, end : str or datetime-like
        Bounds of the sampling window.
    n_start : int or float
        Number of periods used for one year (365 days) of data.

    Returns
    -------
    int
        ``n_start`` multiplied by the window length in years, with the
        fractional part dropped by ``int``.
    """
    window_days = (pd.to_datetime(end) - pd.to_datetime(start)).days
    scaled_periods = window_days / 365 * n_start
    return int(scaled_periods)

In [13]:
# Periods per year of data used during development.
n_start = 15

# Despite its name, `ndays` holds the scaled number of sampling periods.
ndays = get_updated_number_periods_in_sample(
    start=date_sample_start,
    end=date_sample_end,
    n_start=n_start,
)

# Show the sampling window the scaling is based on.
display(date_sample_start, date_sample_end)

'2019-01-16'

'2020-12-31'

In [14]:
# Store the sampling window and the rescaled period count in the config.
# date_sample_start is written as well: the period count below is derived
# from it, so leaving the stored start untouched could make the config
# inconsistent with number_periods_in_sample.
config["date_sample_start"] = date_sample_start
config["date_sample_end"] = date_sample_end
config["number_periods_in_sample"] = ndays

In [15]:
# monitoring config file. Will be used in later nbs
# Monitoring config; it will be used in later notebooks.
mconfig = {
    "mmonth": mmonth,
    "dev_df_path": dev_df_path,
    "monitoring_dates": monitoring_dates,
    "prod_data_dates": prod_data_dates,
    "data_output_path": data_output_path,
    "artifact_path": artifact_path,
    "date_sample_start": date_sample_start,
    "date_sample_end": date_sample_end,
    "model_paths": model_paths,
    "prev_test_df_path": prev_test_df_path,
}


In [16]:
# Nest the monitoring settings inside the main config under "monitoring_config".
config["monitoring_config"] = mconfig

### Output to Config
---

Final check of the important values before they are written to the config.

In [17]:
from pprint import pprint

# Print each key together with its current value in `config`.
# NOTE: config["static_sample_dates"] is only overwritten with the newly
# computed list in the following cell, so the value printed here is the one
# loaded from config.json.
for config_key in (
    "static_sample_dates",
    "date_sample_start",
    "date_sample_end",
    "number_periods_in_sample",
):
    pprint((config_key, config[config_key]))

('static_sample_dates',
 ['2019-03-15',
  '2019-05-27',
  '2019-08-08',
  '2019-10-20',
  '2020-01-01',
  '2020-03-14',
  '2020-05-26',
  '2020-08-07',
  '2020-10-19',
  '2020-12-31',
  '2021-07-01',
  '2021-08-01',
  '2021-09-01',
  '2021-10-01',
  '2021-11-01',
  '2021-12-01'])
('date_sample_start', '2019-01-16')
('date_sample_end', '2020-12-31')
('number_periods_in_sample', 29)


In [18]:
# Run this cell only if the sampling dates make sense: it stores the new
# dates in the config and overwrites config.json.
config["static_sample_dates"] = static_sample_dates

with open("../../config.json", "w") as config_file:
    json.dump(config, config_file, indent=4)

### Install the following packages to get production data

In [19]:
!pip install --index-url https://build.sofi.com/artifactory/api/pypi/pypi/simple sofiproto==release-1703
!pip install read_protobuf

Looking in indexes: https://build.sofi.com/artifactory/api/pypi/pypi/simple
