In [224]:
import pandas as pd
import datetime

In [225]:
est_data = pd.read_csv("C:/model_data/estimate_raw.csv")

In [226]:
est_data.drop('Unnamed: 0', inplace = True, axis = 1)

In [227]:
# source_id and currency -> category for 
est_data['source_id'] = est_data['source_id'].astype('category')
est_data['currency'] = est_data['currency'].astype('category')

In [228]:
# now we need to convert estimate and period dates to dates
est_data['period_date'] = pd.to_datetime(est_data['period_date'])
est_data['estimate_date'] = pd.to_datetime(est_data['estimate_date'])

In [229]:
#managed to reduce the mem usage by a third

In [230]:
lag_tol = 90 # 90 days is the minimum lag tolerance

max_est = (est_data
               .groupby(['security_id', 'broker_id', 'period_date'])
               ['estimate_date']
               .max()
          )

min_est = (est_data
               .groupby(['security_id', 'broker_id', 'period_date'])
               ['estimate_date']
               .min()
          )

lag_bool = (max_est - min_est).dt.days > lag_tol

est_data = (
    est_data
    .join(lag_bool, 
          on = ['security_id', 'broker_id', 'period_date'],
          rsuffix = '_in')
)

In [231]:
est_data = est_data[est_data.estimate_date_in == True]
est_data.drop('estimate_date_in', axis = 1, inplace = True)

In [233]:
est_data.sort_values(['security_id', 'broker_id', 'period_date', 'estimate_date'], inplace = True)

In [267]:
# This just makes the time series daily between revision dates
est_data_daily = (
    est_data.head(3)
    .set_index(['security_id', 'broker_id', 'period_date'])
    .groupby(['security_id', 'broker_id', 'period_date'])
    .apply(
        lambda df: df.drop_duplicates('estimate_date')
                     .set_index('estimate_date')
                     .resample('D')
                     .ffill()
    )
)

In [274]:
est_data_daily.index

MultiIndex([(30064771087, -2084193872, '2020-08-31', '2019-08-20'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-21'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-22'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-23'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-24'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-25'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-26'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-27'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-28'),
            (30064771087, -2084193872, '2020-08-31', '2019-08-29'),
            ...
            (30064771087, -2084193872, '2020-08-31', '2020-03-10'),
            (30064771087, -2084193872, '2020-08-31', '2020-03-11'),
            (30064771087, -2084193872, '2020-08-31', '2020-03-12'),
            (30064771087, -2084193872, '2020-08-31', '2020-03-13'),
            (30064771087, -20841

In [286]:
# We still need a lookahead function that determines whether there is an upward or downward 
# revision in the next 90 days, and then to shift the estimate date 90 days forward
# on a copied data frame that can be outer joined back to the result
# for a shifted timeseries
fill_value = [est_data_daily.value[-1], est_data_daily.currency[-1], est_data_daily.source_id[-1]]
est_daily_shift = est_data_daily.shift(periods = -90)

# Now that we have shifted we will have NaN values that need to be assigned


est_daily_shift.value[est_daily_shift.value.isna()] = fill_value
est_daily_shift

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [282]:
est_daily_shift.value[est_daily_shift.value.isna()]

security_id  broker_id    period_date  estimate_date
30064771087  -2084193872  2020-08-31   2019-12-21      NaN
                                       2019-12-22      NaN
                                       2019-12-23      NaN
                                       2019-12-24      NaN
                                       2019-12-25      NaN
                                       2019-12-26      NaN
                                       2019-12-27      NaN
                                       2019-12-28      NaN
                                       2019-12-29      NaN
                                       2019-12-30      NaN
                                       2019-12-31      NaN
                                       2020-01-01      NaN
                                       2020-01-02      NaN
                                       2020-01-03      NaN
                                       2020-01-04      NaN
                                       2020-01-05      NaN
   

In [199]:
init = 1
for gp_name, gp_df in est_data_daily.groupby(['security_id', 'broker_id', 'period_date']):
    changes = (gp_df.value[lag_tol:]
               .subtract(gp_df.value[:(-lag_tol)])
               .rename('value_change')
              )
    if init == 1:
        change_data_daily_right = pd.DataFrame(changes)
        init = 0
    else:
        change_data_daily_right = pd.concat([change_data_daily_right, pd.DataFrame(changes)])


In [223]:
X = [1, 2, 3]

X[1:]
X[:(-1)]

# so this appears correct, but now I need it to persist for another 90 days... 
# not really sure what I am doing here...

[1, 2]