In [20]:
import pandas as pd
import datetime
from scipy import stats
from numpy import mean, dot

In [21]:
# CHANGE n_rows TO WORK WITH FULL DATASET

# variable inputs
lag_tol = 90 # 90 days is the minimum lag tolerance
n_rows = 1000 # want to perform large calcs on a subset of the data

In [22]:
est_data = pd.read_csv("C:/model_data/estimate_raw.csv")

In [23]:
# dropping the initial index column
est_data.drop('Unnamed: 0', inplace = True, axis = 1)

In [24]:
# source_id and currency -> category form
est_data['source_id'] = est_data['source_id'].astype('category')
est_data['currency'] = est_data['currency'].astype('category')

In [25]:
# now we need to convert estimate and period dates to dates
est_data['period_date'] = pd.to_datetime(est_data['period_date'])
est_data['estimate_date'] = pd.to_datetime(est_data['estimate_date'])

In [26]:
max_est = (est_data
               .groupby(['security_id', 'broker_id', 'period_date'])
               ['estimate_date']
               .max()
          )

min_est = (est_data
               .groupby(['security_id', 'broker_id', 'period_date'])
               ['estimate_date']
               .min()
          )

lag_bool = (max_est - min_est).dt.days > lag_tol

est_data = (
    est_data
    .join(lag_bool, 
          on = ['security_id', 'broker_id', 'period_date'],
          rsuffix = '_in')
)

In [27]:
est_data = est_data[est_data.estimate_date_in == True]
est_data.drop('estimate_date_in', axis = 1, inplace = True)

In [28]:
est_data.sort_values(['security_id', 'broker_id', 'period_date', 'estimate_date'], inplace = True)

In [29]:
# This just makes the time series daily between revision dates
est_data_daily = (
    est_data.head(n_rows)
    .set_index(['security_id', 'broker_id', 'period_date'])
    .groupby(['security_id', 'broker_id', 'period_date'])
    .apply(
        lambda df: df.drop_duplicates('estimate_date')
                     .set_index('estimate_date')
                     .resample('D')
                     .ffill()
    )
)

In [30]:
def shift_df(df, lag_tol):
    
    # shift data
    shifted_df = df.shift(periods = -lag_tol)
    
    # assign na
    shifted_df.value.fillna(est_data_daily.value[-1], inplace = True)
    shifted_df.currency.fillna(est_data_daily.currency[-1], inplace = True)
    shifted_df.source_id.fillna(est_data_daily.source_id[-1], inplace = True)
    
    # shift date
    shifted_df.reset_index(['security_id', 'broker_id', 'period_date', 'estimate_date'], 
                           inplace = True)
    shifted_df['estimate_date'] = shifted_df['estimate_date'] + datetime.timedelta(days = 90)
    
    # append out-shifted portion
    df.reset_index(['security_id', 'broker_id', 'period_date', 'estimate_date'], 
                   inplace = True)
    df_append = df[df.estimate_date < shifted_df.estimate_date.min()]
    shift_full = (pd.concat([df_append, shifted_df])
                      .set_index(['security_id', 'broker_id', 'period_date'])
                 )
    return shift_full

In [31]:
def add_changes(df_changes, lag_tol):
    
    # calculating changes
    changes = (df_changes['value'][lag_tol:]
                   .subtract(df_changes['value'][:(-lag_tol)])
              )
    changes.reset_index(drop = True, inplace = True)
    
    # adding back changes that were 0
    changes_full = (changes.shift(lag_tol, fill_value = 0)[:lag_tol]
                       .append(changes)
                       .rename('value_changes')
                   )
    changes_full.reset_index(drop = True, inplace = True)
    
    # adding the vector to the df
    df_changes.reset_index(['security_id', 'broker_id', 'period_date'], 
                         inplace = True)
    df_changes.insert(5, 
                    'changes', 
                    changes_full, 
                    allow_duplicates = True)
    
    # add true false columns to sum
    df_changes = df_changes.assign(up = (df_shift.changes > 0))
    df_changes = df_changes.assign(down = (df_shift.changes < 0))
    
    return df_changes
    

In [32]:
# Looping through and storing results
init = False
for name_gp, df_gp in est_data_daily.groupby(['security_id', 'broker_id', 'period_date']):
    df_shift = shift_df(df_gp, lag_tol)
    df_w_changes = add_changes(df_shift, lag_tol)
    if init == False:
        df_calc = df_w_changes
        init = True
    else:
        df_calc = pd.concat([df_calc, df_w_changes])

In [33]:
# applying the laplace factor calculation accross brokers
# need to collapse broker_id col here... then the rest will run smoothly
df_factor = ((df_calc
                 .groupby(['security_id', 'period_date', 'estimate_date', 'source_id'], as_index = False)
                 .apply(lambda df_ind: (sum(df_ind.up) - sum(df_ind.down))/(sum(df_ind.up) + sum(df_ind.down) + 2))
            )
            .rename('factor_value')
            .reset_index(['security_id', 'period_date', 'estimate_date', 'source_id'])
            )

In [34]:
# weighting the factor based on optimal forecast period
df_factor['scalar_weight'] = (stats.norm(550, 310)
                              .pdf((df_factor.period_date - df_factor.estimate_date).dt.days)
                          )

In [41]:
df_out = (df_factor.groupby(['security_id', 'estimate_date', 'source_id'])
                .apply(lambda df: dot(df.factor_value, df.scalar_weight)/sum(df.scalar_weight))
            ).rename('factor_value').reset_index(['security_id', 'estimate_date', 'source_id'])
df_out['factor_id'] = 'lap_breadth'

In [42]:
df_out = df_out.reindex(columns = ['factor_id', 'estimate_date', 'security_id', 'factor_value', 'source_id'])

In [43]:
df_out.rename(columns = {'estimate_date':'date', 'factor_value':'value'}, inplace = True)

In [44]:
df_out.value.describe() # a five number summary of the laplace breadth factor

count    6548.000000
mean       -0.002384
std         0.344256
min        -0.855590
25%        -0.198952
50%         0.000000
75%         0.300129
max         0.819532
Name: value, dtype: float64

In [46]:
df_out.to_csv('factor_out.csv')