In [15]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import make_pipeline
# from sklearn.impute import SimpleImputer

In [9]:
# Load the raw table used throughout the notebook.
AW_df = pd.read_csv('~/Documents/realdata/AW_df.csv')

# Columns carried into the analysis. Rows missing ANY of these are dropped below.
variables = [
    'ParticipantIdentifier',
    'Value_tran',
    'Notification_c',
    'TimeEnrolled_days',
    'Phases',
    'Value_30min_before',
    'NotificationType',
    'Baseline_steps',
    'IsWeekend',
    'IsIndoor',
    'IsLossFramed',
    'IsSnow',
    'IsActivity',
    'Value_30min_before_tran',
    'AgeEnrollment_years',
    'Gender',
    'Race',
    'ExerciseTimeAgg_min',
    'WalkDistanceAgg_m',
    'StepsAgg_priorweek',
    'Value_tran_sd_week',
    'Value_tran_sd_priorweek',
    'Distance_m_0'
]

# Build the modelling frame in a single chain. `.assign` returns a new frame,
# so the derived columns are never written onto a slice of `AW_df` (which is
# what triggers SettingWithCopyWarning when assigning column-by-column onto
# the result of `AW_df[variables].dropna()`).
data = (
    AW_df[variables]
    .dropna()
    .assign(
        # 0/1 indicators, one per NotificationType label.
        Naf=lambda frame: frame['NotificationType'].eq('afternoon').astype(int),
        Nev=lambda frame: frame['NotificationType'].eq('evening').astype(int),
        Nlu=lambda frame: frame['NotificationType'].eq('lunch').astype(int),
        Nmo=lambda frame: frame['NotificationType'].eq('morning').astype(int),
        # 1 for 'Male' / 'Caucasian', 0 for every other recorded value.
        GenderM=lambda frame: frame['Gender'].eq('Male').astype(int),
        RaceC=lambda frame: frame['Race'].eq('Caucasian').astype(int),
        # Days enrolled, kept only for rows in phase 2 (resp. phase 3); 0 elsewhere.
        P2=lambda frame: frame['TimeEnrolled_days'] * frame['Phases'].eq(2).astype(int),
        P3=lambda frame: frame['TimeEnrolled_days'] * frame['Phases'].eq(3).astype(int),
    )
    # Drop the raw columns that the derived columns above were built from:
    # NotificationType, Gender, Race and Phases.
    .drop(columns=['NotificationType', 'Gender', 'Race', 'Phases'])
)

In [10]:
# Fix the global NumPy seed so the participant draw below is repeatable.
np.random.seed(420)

# Split at the participant level: every row of a given participant ends up in
# exactly one of the two sets. 30% of the distinct ids (rounded down) form the
# estimation set; all remaining participants form the rest set.
unique_ids = data['ParticipantIdentifier'].unique()
n_est_ids = int(len(unique_ids) * 0.3)
est_ids = np.random.choice(unique_ids, size=n_est_ids, replace=False)

# Row-level membership mask, computed once and used for both halves.
in_est = data['ParticipantIdentifier'].isin(est_ids)
est_set = data[in_est]
rest_set = data[~in_est]

In [18]:
# Regressors for the model. The ORDER matters: `alphahat` below is a plain
# array whose entries line up positionally with this list.
covariates = [
    'TimeEnrolled_days', 'Value_30min_before', 'Baseline_steps',
    'IsWeekend', 'IsIndoor', 'IsLossFramed', 'IsSnow', 'IsActivity',
    'Value_30min_before_tran',
    'AgeEnrollment_years', 'GenderM', 'RaceC',
    'ExerciseTimeAgg_min', 'WalkDistanceAgg_m', 'StepsAgg_priorweek',
    'Value_tran_sd_week', 'Value_tran_sd_priorweek', 'Distance_m_0',
    'Naf', 'Nev', 'Nlu', 'Nmo',
    'P2', 'P3',
]

# Fit OLS of Value_tran on the covariates using the estimation set only.
# No constant column is added here; the design is exactly `covariates`.
y1 = est_set['Value_tran']
X1 = est_set[covariates]
est_fit = sm.OLS(y1, X1).fit()

# Keep the fitted coefficients as a bare NumPy array (labels discarded).
alphahat = np.array(est_fit.params)

In [27]:
# Outcome for the rest set: Value_tran minus the linear prediction obtained
# from the coefficients fitted on the estimation set.
y = rest_set['Value_tran'] - np.dot(rest_set[covariates], alphahat)

# Scale every covariate row-wise by Notification_c (referred to in this
# notebook as the centered treatment), then attach the outcome and the
# participant id as the two trailing columns.
centered_treatment = rest_set['Notification_c']
X = (
    rest_set[covariates]
    .multiply(centered_treatment, axis="index")
    .assign(Y=y, id=rest_set['ParticipantIdentifier'])
)

# Persist without the pandas index.
X.to_csv('~/Documents/realdata/stepsdata.csv', index=False)

In [26]:
# Show X without its last two columns.
X.drop(columns=X.columns[-2:])

Unnamed: 0,TimeEnrolled_days,Value_30min_before,Baseline_steps,IsWeekend,IsIndoor,IsLossFramed,IsSnow,IsActivity,Value_30min_before_tran,AgeEnrollment_years,...,StepsAgg_priorweek,Value_tran_sd_week,Value_tran_sd_priorweek,Distance_m_0,Naf,Nev,Nlu,Nmo,P2,P3
8,-11.253301,-50.189723,-908.720141,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-1.217475,-12.024085,...,-962.639530,-0.149006,-0.171263,-100.126135,-0.000000,-0.000000,-0.000000,-0.225066,-11.253301,-0.000000
9,-11.253301,-1.800528,-908.720141,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.481656,-12.024085,...,-962.639530,-0.149006,-0.171263,-100.126135,-0.000000,-0.000000,-0.225066,-0.000000,-11.253301,-0.000000
10,-11.253301,-98.578918,-908.720141,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-1.369158,-12.024085,...,-962.639530,-0.149006,-0.171263,-100.126135,-0.225066,-0.000000,-0.000000,-0.000000,-11.253301,-0.000000
11,-11.253301,-26.107659,-908.720141,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-1.070840,-12.024085,...,-962.639530,-0.149006,-0.171263,-100.126135,-0.000000,-0.225066,-0.000000,-0.000000,-11.253301,-0.000000
12,39.521633,130.963842,3128.851287,0.774934,0.774934,0.0,0.0,0.774934,3.977622,41.400615,...,3314.503327,0.513048,0.589683,344.748368,0.000000,0.000000,0.774934,0.000000,39.521633,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34066,-41.187082,-165.873658,-1807.440921,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-1.486171,-15.858692,...,-1679.185441,-0.396373,-0.316187,-109.013339,-0.000000,-0.000000,-0.225066,-0.000000,-0.000000,-41.187082
34068,-41.187082,-119.960190,-1807.440921,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-1.413293,-15.858692,...,-1679.185441,-0.396373,-0.316187,-109.013339,-0.000000,-0.225066,-0.000000,-0.000000,-0.000000,-41.187082
34069,-41.412148,-17.330084,-1807.440921,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-0.979100,-15.858692,...,-1679.185441,-0.396373,-0.316187,-109.013339,-0.000000,-0.000000,-0.000000,-0.225066,-0.000000,-41.412148
34070,-41.412148,-43.662808,-1807.440921,-0.000000,-0.000000,-0.0,-0.0,-0.000000,-1.186195,-15.858692,...,-1679.185441,-0.396373,-0.316187,-109.013339,-0.000000,-0.000000,-0.225066,-0.000000,-0.000000,-41.412148
