In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# models for computing the base (not yet coherent) forecasts
from statsforecast.models import AutoARIMA, AutoETS, AutoCES, AutoTheta, HoltWinters, CrostonOptimized, SeasonalNaive, SeasonalExponentialSmoothingOptimized
from statsforecast.core import StatsForecast

#obtain hierarchical reconciliation methods and evaluation
from hierarchicalforecast.methods import BottomUp, MinTrace, TopDown, OptimalCombination, ERM
from hierarchicalforecast.utils import aggregate, HierarchicalPlot
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.evaluation import HierarchicalEvaluation

In [2]:
# Read the hackathon workbook, parsing Time_Period as datetimes,
# then preview the first rows.
df = pd.read_excel(
    io='../HCP_Data_KDAG_Hackathon.xlsx',
    parse_dates=['Time_Period'],
)
df.head()

Unnamed: 0,Physician_ID,Time_Period,Brand_Rx,Market_Rx,Sales_Rep_Calls,Samples_Dropped,Physician_Segment,Emails_Delivered,Speaker_Programs_Attended,Vouchers_Dropped,Specialty
0,axt00001,2019-01-04,0.0,2.80567,1,0,3-Low,0,0,0,Dermatologist
1,axt00001,2019-01-11,0.0,20.57312,1,0,3-Low,0,0,0,Dermatologist
2,axt00001,2019-01-18,0.0,6.1601,1,0,3-Low,0,0,0,Dermatologist
3,axt00001,2019-01-25,0.0,8.95501,1,5,3-Low,0,0,0,Dermatologist
4,axt00001,2019-02-01,0.0,9.13793,1,0,3-Low,0,0,0,Dermatologist


In [3]:
# Short codes keep the hierarchical series ids compact (e.g. 'HCP/M/NP/<id>').
# NOTE(review): Series.map turns any label missing from these dicts into NaN;
# confirm the data only contains the categories listed here.
specialty_codes = {'Dermatologist':'D', 'General Physician':'GP', 'Nurse Practitioner':'NP'}
segment_codes = {'3-Low':'L', '2-Medium':'M', '1-High':'H'}

# Keep only the hierarchy keys, the timestamp and the target. Selecting the
# wanted columns already discards the unused ones, so no separate drop is
# needed. Building the result with assign/rename produces a fresh frame instead
# of assigning columns onto a frame derived from a selection.
# The time/target columns are renamed to ds / y, and a constant 'Job' column is
# placed first to act as the root of the hierarchy.
# This cell rebinds `df` with renamed columns, so it must be run on the freshly
# loaded frame (re-running it on its own output raises a KeyError).
df = (
    df[['Physician_Segment', 'Specialty', 'Physician_ID', 'Time_Period', 'Brand_Rx']]
    .assign(
        Specialty=lambda d: d['Specialty'].map(specialty_codes),
        Physician_Segment=lambda d: d['Physician_Segment'].map(segment_codes),
        Job='HCP',
    )
    .rename(columns={'Time_Period': 'ds', 'Brand_Rx': 'y'})
    .loc[:, ['Job', 'Physician_Segment', 'Specialty', 'Physician_ID', 'ds', 'y']]
)
df.head()

Unnamed: 0,Job,Physician_Segment,Specialty,Physician_ID,ds,y
0,HCP,L,D,axt00001,2019-01-04,0.0
1,HCP,L,D,axt00001,2019-01-11,0.0
2,HCP,L,D,axt00001,2019-01-18,0.0
3,HCP,L,D,axt00001,2019-01-25,0.0
4,HCP,L,D,axt00001,2019-02-01,0.0


In [4]:
# Aggregation levels handed to `aggregate`: each entry lists the columns whose
# combination identifies one level of the hierarchy, from the grand total down
# to the individual physician.
total = ['Job']
by_segment = total + ['Physician_Segment']
by_specialty = total + ['Specialty']
by_segment_specialty = by_segment + ['Specialty']
by_physician = by_segment_specialty + ['Physician_ID']

spec = [total, by_segment, by_specialty, by_segment_specialty, by_physician]

In [5]:
# Aggregate the physician-level rows into every level listed in `spec`.
# `aggregate` returns three objects: the aggregated series, the object later
# passed as `S=` to the reconciler, and `tags`, which is indexed by level name.
Y_indexed, S_df, tags = aggregate(df=df, spec=spec)

# Move the index back into regular columns before handing the frame to StatsForecast.
Y_df = Y_indexed.reset_index()

In [6]:
# Series ids of the most granular (physician) level. The tag key is the
# '/'-joined column list of that level, so it is derived from the last entry of
# `spec` rather than repeated as a literal that could drift out of sync.
base_ts = tags['/'.join(spec[-1])]

In [72]:
# Base (unreconciled) forecasts for every series in the hierarchy.
# Alternative models left disabled in the original cell: AutoARIMA, AutoCES,
# SeasonalExponentialSmoothingOptimized, AutoTheta, HoltWinters (error_type='M'),
# CrostonOptimized.
#
# NOTE(review): the data frequency is weekly ('W-FRI') while season_length is 7,
# i.e. a 7-week cycle; confirm this is intentional.
# NOTE(review): model="MMM" is fully multiplicative; series containing zeros
# (y is 0.0 in the preview rows) presumably end up on the fallback model. Verify.
season_length = 7
ets_model = AutoETS(model="MMM", season_length=season_length, damped=True)
naive_fallback = SeasonalNaive(season_length=season_length)

fcst = StatsForecast(
    df=Y_df,
    models=[ets_model],
    freq='W-FRI',
    n_jobs=-1,
    fallback_model=naive_fallback,
)

# One step ahead with a 95% interval. fitted=True is requested so the in-sample
# values can be retrieved right after.
Y_hat_df = fcst.forecast(h=1, fitted=True, level=[95])
Y_fitted_df = fcst.forecast_fitted_values()

In [73]:
# Reconcile the base forecasts so that all hierarchy levels add up.
# Only TopDown with average historical proportions is active. Alternatives left
# disabled in the original cell: MinTrace ('mint_shrink' / 'ols', nonnegative),
# OptimalCombination ('ols', nonnegative), ERM ('closed').
top_down = TopDown(method='proportion_averages')
reconcilers = [top_down]

hrec = HierarchicalReconciliation(reconcilers=reconcilers)

# The fitted-values frame is passed as Y_df; S_df and tags come from `aggregate`.
Y_rec_df = hrec.reconcile(
    Y_hat_df=Y_hat_df,
    Y_df=Y_fitted_df,
    S=S_df,
    tags=tags,
    level=[95],
)

In [74]:
# Display the reconciliation result: the base AutoETS columns next to the
# TopDown-reconciled ones, indexed by unique_id.
Y_rec_df

Unnamed: 0_level_0,ds,AutoETS,AutoETS-lo-95,AutoETS-hi-95,AutoETS/TopDown_method-proportion_averages,AutoETS/TopDown_method-proportion_averages-lo-95,AutoETS/TopDown_method-proportion_averages-hi-95
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HCP,2020-02-07,7196.500000,6449.206543,7943.792969,7196.500418,6449.206915,7943.793920
HCP/D,2020-02-07,5772.672363,5162.792480,6382.552246,5887.559839,5276.188345,6498.931334
HCP/GP,2020-02-07,1114.120117,981.993347,1246.246826,1079.789099,967.662464,1191.915734
HCP/H,2020-02-07,4093.842773,3624.852539,4562.833008,4245.775914,3804.889292,4686.662535
HCP/H/D,2020-02-07,3601.208740,3182.650391,4019.767090,3768.820893,3377.461871,4160.179916
...,...,...,...,...,...,...,...
HCP/M/NP/axt09901,2020-02-07,0.000000,-0.486022,0.486022,0.055914,0.050107,0.061720
HCP/M/NP/axt09915,2020-02-07,0.000000,-1.172613,1.172613,0.074237,0.066528,0.081946
HCP/M/NP/axt09922,2020-02-07,0.000000,-1.085416,1.085416,0.149639,0.134100,0.165178
HCP/M/NP/axt09997,2020-02-07,0.000000,-0.407672,0.407672,0.057578,0.051599,0.063557


In [78]:
# Build the submission table from the reconciled physician-level forecasts.
reconciled_col = 'AutoETS/TopDown_method-proportion_averages'

# Single .loc lookup (row mask + column) instead of chained [][ ] indexing.
bottom_forecasts = Y_rec_df.loc[Y_rec_df.index.isin(base_ts), reconciled_col]

# unique_id looks like 'HCP/M/NP/axt09901'; the physician id is the last
# '/'-separated component. Splitting on the separator avoids assuming that
# every id is exactly 8 characters long.
physician_ids = bottom_forecasts.index.str.rsplit('/', n=1).str[-1]

out_df = (
    pd.DataFrame({
        'Physician_ID': physician_ids,
        'Expected_TRx': bottom_forecasts.to_numpy(),
    })
    .sort_values('Physician_ID', ignore_index=True)
)

# Sanity check: show any negative forecasts (an empty frame means there are none).
out_df[out_df['Expected_TRx'] < 0]

Unnamed: 0,Physician_ID,Expected_TRx


In [79]:
# Write the submission file without the positional index column.
out_df.to_csv(
    path_or_buf='submission_new.csv',
    index=False,
)

In [80]:
# Preview the final submission table (columns Physician_ID and Expected_TRx).
out_df

Unnamed: 0,Physician_ID,Expected_TRx
0,axt00001,0.663252
1,axt00002,0.000000
2,axt00003,0.543696
3,axt00004,0.037128
4,axt00005,0.685329
...,...,...
9995,axt09996,0.000000
9996,axt09997,0.057578
9997,axt09998,0.000000
9998,axt09999,0.935128
