In [None]:
import numpy as np
# Seed NumPy's global random generator with a fixed value so that any
# NumPy-based randomness in later cells is repeatable across runs.
np.random.seed(42)
import pandas as pd
import hts

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a global "ignore" filter: warnings raised by later cells
# (including deprecation warnings) will not be displayed.
warnings.simplefilter("ignore")

# settings
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old names, so `plt.style.use('seaborn')`
# fails there. Use the legacy name when it is still registered and fall back
# to the renamed style sheet otherwise.
_seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_seaborn_style)
# Default figure size (width, height) in inches for every plot in the notebook.
plt.rcParams["figure.figsize"] = (16, 8)

import pickle

In [None]:
# Load the raw workbook; Time_Period is parsed into datetimes at read time.
df = pd.read_excel(
    './HCP_Data_KDAG_Hackathon.xlsx',
    parse_dates=['Time_Period'],
)
df.head(5)

In [None]:
# Number of distinct (non-null) values in each column.
df.nunique(axis=0, dropna=True)

In [None]:
# Keep only the columns used downstream. Selecting them explicitly already
# excludes 'Speaker_Programs_Attended' and 'Vouchers_Dropped', so no separate
# drop is needed (the drop raised KeyError when the cell was run a second time).
# `.copy()` makes the result an independent frame so the assignments below
# do not write into a slice of the raw data.
df = df[['Physician_ID', 'Time_Period', 'Physician_Segment', 'Specialty',
         'Sales_Rep_Calls', 'Samples_Dropped', 'Emails_Delivered',
         'Brand_Rx', 'Market_Rx']].copy()

# Shorten the category labels to the codes used to build hierarchy node ids.
# `replace` (unlike `map`) leaves values that are not keys of the dict
# untouched, so re-running this cell keeps the already-shortened codes
# instead of turning them into NaN.
df['Specialty'] = df['Specialty'].replace(
    {'Dermatologist': 'D', 'General Physician': 'GP', 'Nurse Practitioner': 'NP'}
)
df['Physician_Segment'] = df['Physician_Segment'].replace(
    {'3-Low': 'L', '2-Medium': 'M', '1-High': 'H'}
)
df.head()

In [None]:
# Map each promotional-channel column name to its positional index (0, 1, 2).
out_dict = {
    channel: position
    for position, channel in enumerate(
        ('Sales_Rep_Calls', 'Samples_Dropped', 'Emails_Delivered')
    )
}

In [None]:
# For every row, count how many of the three channels (Sales_Rep_Calls,
# Samples_Dropped, Emails_Delivered) are non-zero, then tabulate those counts.
# The entry for 0 is the number of rows where all three channels are 0.
# `.ne(0).sum(axis=1)` is the vectorized equivalent of a row-wise apply.
df[['Sales_Rep_Calls', 'Samples_Dropped', 'Emails_Delivered']].ne(0).sum(axis=1).value_counts()

In [None]:
# Build the hierarchy node ids with vectorized string concatenation instead of
# a Python-level row-by-row apply. `astype(str)` renders each value the same
# way the f-string did.
_seg_specialty = df['Physician_Segment'].astype(str) + "_" + df['Specialty'].astype(str)
# create the bottom level id: "<segment>_<specialty>_<physician id>"
df["seg_specialty_id"] = _seg_specialty + "_" + df['Physician_ID'].astype(str)
# create the l1 level id: "<segment>_<specialty>"
df["seg_specialty"] = _seg_specialty

In [None]:
# Bottom level: one column per physician node id, one row per Time_Period,
# holding that physician's Brand_Rx.
df_bottom_level = pd.pivot(
    df,
    index="Time_Period",
    columns="seg_specialty_id",
    values="Brand_Rx",
)
df_bottom_level.head(5)

In [None]:
# create the l1 level df: Brand_Rx summed per Time_Period and
# segment/specialty pair, one column per "<segment>_<specialty>" node.
# Only Brand_Rx is pivoted, so it is selected before aggregating; summing the
# whole frame also aggregated the unrelated columns (including the string id
# columns) for no benefit. This matches how the total level is computed.
df_l1_level = (
    df.groupby(["Time_Period", "Physician_Segment", "Specialty", "seg_specialty"])["Brand_Rx"]
    .sum()
    .reset_index(drop=False)
    .pivot(index="Time_Period", columns="seg_specialty", values="Brand_Rx")
)
df_l1_level.head()

In [None]:
# create the l2 level df: Brand_Rx summed per Time_Period and physician
# segment, one column per segment code.
# Only Brand_Rx is pivoted, so it is selected before aggregating; summing the
# whole frame also aggregated the unrelated columns (including the string id
# columns) for no benefit. This matches how the total level is computed.
df_l2_level = (
    df.groupby(["Time_Period", "Physician_Segment"])["Brand_Rx"]
    .sum()
    .reset_index(drop=False)
    .pivot(index="Time_Period", columns="Physician_Segment", values="Brand_Rx")
)
df_l2_level.head()

In [None]:
# Top level: Brand_Rx summed over all physicians per Time_Period, exposed as
# a single-column frame named "total".
brand_rx_per_period = df.groupby(["Time_Period"])["Brand_Rx"].sum()
df_total = brand_rx_per_period.to_frame(name="total")
df_total.head(5)

In [None]:
# Assemble the full hierarchy frame: start from the bottom-level columns and
# left-join each aggregate level onto the shared Time_Period index.
hierarchy_df = df_bottom_level
for level_df in (df_l1_level, df_l2_level, df_total):
    hierarchy_df = hierarchy_df.join(level_df)

# Ensure a datetime index, then bin rows into weekly buckets anchored on
# Friday, summing the values that fall into each bucket.
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)
hierarchy_df = hierarchy_df.resample('W-FRI').sum()

hierarchy_df.head(5)

In [None]:
# Distinct node labels at each level of the hierarchy.
segments = df["Physician_Segment"].unique()
specialties = df["seg_specialty"].unique()
ids = df["seg_specialty_id"].unique()

# Build the parent -> children mapping passed to the model:
#   'total'               -> segment codes
#   segment code          -> "<segment>_<specialty>" nodes
#   "<segment>_<specialty>" -> "<segment>_<specialty>_<physician id>" nodes
# Children are matched on the parent label plus the "_" delimiter, so a label
# that merely starts with the same characters as another parent is not
# attached to the wrong node.
total = {'total': list(segments)}
segment = {k: [v for v in specialties if v.startswith(f"{k}_")] for k in segments}
# Named `specialty_members` rather than `id` to avoid shadowing the builtin.
specialty_members = {k: [v for v in ids if v.startswith(f"{k}_")] for k in specialties}
hierarchy = {**total, **segment, **specialty_members}

In [None]:
# Configure the hierarchical regressor. The extra keyword arguments
# (damped_trend, trend) are presumably forwarded to the underlying
# Holt-Winters model -- confirm against the hts documentation.
regressor_options = {
    'model': 'holt_winters',
    'revision_method': 'PHA',
    'n_jobs': 0,
    'damped_trend': True,
    'trend': "mul",
}
# Fit on the hierarchy frame and node mapping, then forecast one step ahead.
model = hts.HTSRegressor(**regressor_options).fit(hierarchy_df, hierarchy)
pred = model.predict(steps_ahead=1)

In [None]:
# Build the submission table from the last row of `pred`.
# Every key of `hierarchy` is an aggregate node ('total', segments,
# segment/specialty pairs), so dropping those columns leaves exactly the
# physician-level series without hard-coding the node names.
last_forecast = pred.drop(columns=list(hierarchy)).iloc[-1]

pred1 = last_forecast.reset_index()
pred1.columns = ['Physician_ID', 'Expected_TRx']
# Node ids look like "<segment>_<specialty>_<physician id>"; keep the part
# after the last underscore.
pred1['Physician_ID'] = pred1['Physician_ID'].str.rsplit('_', n=1).str[-1]
# Physician_ID is a string here, so this sort is lexicographic.
pred1 = pred1.sort_values(by=['Physician_ID']).reset_index(drop=True)
pred1

In [None]:
# Write the per-physician forecast to disk without the row index.
submission_path = 'submission.csv'
pred1.to_csv(submission_path, index=False)