In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
import traceback
import re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas settings:
#   * never truncate rows or columns when a frame is displayed
#   * silence the chained-assignment (SettingWithCopy) warning
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [None]:
# Root folder that holds every input and output file used by this notebook.
# Set the BAR_DATASETS_DIR environment variable to run on another machine;
# when it is unset the original location is used, so existing runs are unchanged.
datasets_dir = os.environ.get('BAR_DATASETS_DIR', '/Users/sauravyadav/Documents/Repos/Datasets')

# Contains First Drink and FollowUp Surveys
drinking_surveys_fp = os.path.join(datasets_dir, 'Drinking_Surveys', 'output', 'data_draft_v3.csv')

# Contains Morning Reports and Daytime Surveys
morning_random_surveys_fp = os.path.join(datasets_dir, 'BAR_EMA', 'output_data', 'data_draft_v3.csv')

# Output File Paths
master_df_fp = os.path.join(datasets_dir, 'master_data_draft.csv')     # merged master table
master_vc_df_fp = os.path.join(datasets_dir, 'mapping.csv')            # MR_DrnkDur original -> cleaned value counts

### Adding SurvNamePlus Column to Morning Reports and Daytime Surveys
- dr_df contains Drinking Surveys: "After Your 1st Drink", "Long", "Short"
- mr_r_df contains Morning Reports and Daytime Surveys

In [None]:
# Load the two source extracts.
dr_df = pd.read_csv(drinking_surveys_fp)
mr_r_df = pd.read_csv(morning_random_surveys_fp)

# Row masks, listed in the same order as `choices` below.
conditions = [
    (mr_r_df['SurvName'] == 'Morning Reports'),
    (mr_r_df['SurvName'] == 'Daytime Surveys')
]

# Define the choices corresponding to each condition
choices = ['Morning', 'Day']

# Use numpy.select to apply the conditions and choices.
# The fallback for rows matching neither condition is given explicitly as the
# string '0': np.select's implicit default is the integer 0, which older NumPy
# silently converted to '0' but NumPy 2.x rejects with
# "Choicelist and default value do not have a common dtype".
mr_r_df['SurvNamePlus'] = np.select(conditions, choices, default='0')

In [None]:
# Spot-check Morning Reports that have no InitDate.
# The sample size is capped at the number of matching rows so the cell still
# runs (and shows an empty frame) when fewer than two such rows exist.
morning_missing_init = mr_r_df[(mr_r_df['SurvName'] == 'Morning Reports') & (mr_r_df['InitDate'].isna())]
morning_missing_init.sample(min(2, len(morning_missing_init)))

### Convert Decimals to int: Drinking Surveys.

In [None]:
print('Before:')
display(dr_df.head(2))
int_cols = ['Burst', 'Weekend', 'Drinking_bin', 'UI_DrnkFin',
       'UI_DrnkNum', 'DrnkNum', 'DrnkNum_combine', 'DrnkNum_cumulative',
       'Plsur', 'Rliev', 'IntxNow', 'MorAlc', 'enrgz', 'excit', 'sedat',
       'slotht', 'slug', 'up', 'crntloc', 'othrloc', 'wthothr', 'NumOth',
       'OthTyp', 'OthWho', 'EnjInt', 'ActCld', 'ActAcc', 'OthCld', 'OthAcc',
       'Vrtl', 'AlnLke', 'AlnGd', 'WshOth', 'lnly', 'rjct', 'incl', 'acpt',
       'vrtint', 'vrtoth', 'vrtwho', 'vrtenj', 'vrtcld', 'VrtAcc', 'VrtOthCld',
       'VrtOthAcc', 'NotInt', 'WshInt', 'UI_NtInt_lnly', 'UI_NtInt_rjct',
       'UI_NtInt_incl', 'UI_NtInt_acpt', 'said', 'spent', 'impt', 'spur']

# Cast column by column to the nullable integer dtype so missing values survive.
# errors='ignore' leaves a column untouched when it cannot be cast.
for col in int_cols:
    dr_df[col] = dr_df[col].astype('Int64', errors = 'ignore')

# Surface the columns that were skipped instead of letting them pass silently.
not_converted = [col for col in int_cols if not isinstance(dr_df[col].dtype, pd.Int64Dtype)]
if not_converted:
    print('Left unchanged (could not be cast to Int64):', not_converted)

print('After:')
display(dr_df.head(2))

In [None]:
# Show five randomly chosen rows of the morning/daytime frame (unseeded, so the rows differ between runs).
display(mr_r_df.sample(5))

### Convert Decimals to int: Morning Reports and Daytime Surveys.

In [None]:
print('Before:')
display(mr_r_df.head(2))
int_cols = ['MR_FlSlp', 'MR_SlpQlt', 'MR_DrnkNum', 'MR_DrnkDur_min', 'MR_rlx', 'MR_hngvr', 'MR_Embrssd', 
            'MR_agrsv', 'MR_vom', 'MR_hrt', 'MR_rmbr', 'MR_work', 'MR_rude', 'MR_soc', 'MR_exprs', 'MR_guilt',
            'MR_expctd', 'MR_mood', 'MR_mrng', 'MR_prmsd', 'MR_more', 'MR_fght', 'MR_enrgtc', 'MR_PsdOut',
            'MR_MsdClss', 'MR_Impls', 'MR_sleep', 'MR_weed', 'MR_plan', 'chr', 'anoy', 'upbt', 'sad', 'irit',
            'hap', 'bor', 'cnt', 'RP_enrgz', 'RP_excit', 'RP_sedat', 'RP_slotht', 'RP_slug', 'RP_up', 'RP_crntloc',
            'RP_othrloc', 'RP_wthothr', 'RP_NumOth', 'RP_OthTyp', 'RP_OthWho', 'RP_EnjInt', 'RP_ActCld', 'RP_ActAcc',
            'RP_OthCld', 'RP_OthAcc', 'RP_Vrtl', 'RP_AlnLke', 'RP_AlnGd', 'RP_WshOth', 'RP_lnly', 'RP_rjct', 'RP_incl',
            'RP_acpt', 'RP_vrtint', 'RP_vrtoth', 'RP_vrtwho', 'RP_vrtenj', 'RP_vrtcld', 'RP_VrtAcc', 'RP_VrtOthCld', 'RP_VrtOthAcc',
            'RP_NotInt', 'RP_WshInt', 'RP_lonely', 'RP_reject', 'RP_included', 'RP_accepted', 'RP_said', 'RP_spent', 'RP_impt', 'RP_spur',
            'RP_Drnk', 'MR_duration_bed_wake', 'MR_duration_bed_wake_minutes', 'MR_duration_bed_rise', 'MR_duration_bed_rise_minutes', 'MR_duration_wake_rise',
            'MR_duration_wake_rise_minutes', 'MR_wake+', 'MR_rise+', 'MR_bed+']

# Cast column by column to the nullable integer dtype so missing values survive.
# errors='ignore' leaves a column untouched when it cannot be cast.
# NOTE(review): 'MR_bed+' is parsed as an HH:MM:SS time later in this notebook,
# so it (and presumably the other '+' / duration columns) is expected to be skipped here - confirm.
for col in int_cols:
    mr_r_df[col] = mr_r_df[col].astype('Int64', errors = 'ignore')

# Surface the columns that were skipped instead of letting them pass silently.
not_converted = [col for col in int_cols if not isinstance(mr_r_df[col].dtype, pd.Int64Dtype)]
if not_converted:
    print('Left unchanged (could not be cast to Int64):', not_converted)

print('After:')
display(mr_r_df.head(2))

### Renaming columns in MR to match the requested data format

In [None]:
display(mr_r_df.head(2))

# Columns that keep their 'RP_' prefix in the requested output format.
no_RP_rename = ['RP_lonely', 'RP_reject', 'RP_included', 'RP_accepted', 'RP_Drnk']
rp_prefix = 'RP_'

# Drop the prefix only where the name actually starts with it. A plain
# str.replace would also remove 'RP_' from the middle of a column name.
renames = {
    col: col[len(rp_prefix):]
    for col in mr_r_df.columns
    if col.startswith(rp_prefix) and col not in no_RP_rename
}
mr_r_df = mr_r_df.rename(columns = renames)
display(mr_r_df.head(2))

### Uniform Date Format (Changing MR date format to match that of drinking surveys)

In [None]:
display(mr_r_df.head(2))
date_cols = ['InitDate','SubDate','TrigDate']
for date_col in date_cols:
    mr_r_df[date_col] = pd.to_datetime(mr_r_df[date_col])

    # Format as 'M/DD/YYYY': month without a leading zero, day zero-padded.
    # The zero is stripped from the formatted string because the '%-m'
    # directive is a platform extension that is not available everywhere
    # (it fails on Windows). The month is never '00', so lstrip only ever
    # removes the single padding zero. Missing dates stay missing.
    mr_r_df[date_col] = mr_r_df[date_col].dt.strftime('%m/%d/%Y').str.lstrip('0')
display(mr_r_df.head(2))

## Creating a MASTER DATAFRAME

### Concatenating

In [None]:
# Stack the morning/daytime rows on top of the drinking-survey rows and give
# the combined frame a fresh 0..n-1 index.
master_df = pd.concat([mr_r_df, dr_df], ignore_index=True)
master_df.head(2)

### Formatting Time Columns

In [None]:
# Left-pad every time string with zeros to a width of 8 characters
# (e.g. '9:05:00' becomes '09:05:00'); missing values are left as they are.
time_cols = ['InitTime', 'SubTime', 'TrigTime']
master_df[time_cols] = master_df[time_cols].apply(lambda times: times.str.zfill(8))

### Sorting the Dataframe appropriately

In [None]:
# Numeric weekday (Mon=1 ... Sun=7) derived from the 'Day' abbreviation.
weekday_encoding = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
master_df['day_encoding'] = master_df['Day'].map(weekday_encoding)


def _chronological_sort_key(column):
    """Return the values sort_values should compare for one sort column.

    The date columns hold 'M/DD/YYYY' strings, which do not sort
    chronologically as text ('10/01/2021' < '9/30/2021'), so they are parsed
    to datetimes for the comparison only. Every other column is compared as is.
    """
    if column.name in ('TrigDate', 'SubDate'):
        return pd.to_datetime(column)
    return column


# Order rows per subject / burst / weekend, then chronologically by trigger
# and submission time. The stored date strings themselves are not modified.
master_df = master_df.sort_values(
    by=['SubID', 'Burst', 'Weekend', 'TrigDate', 'TrigTime', 'SubDate', 'SubTime'],
    key=_chronological_sort_key,
).reset_index(drop=True)

In [None]:
# Preview the first two rows of the sorted master frame.
master_df.head(2)

### Adding a survey miss column to indicate missed surveys

In [None]:
# Flag missed drinking follow-ups relative to the bed time reported in a later
# Morning/Day row. Both helper columns start out as empty strings; only rows
# visited by the backward scan below receive a value.
master_df['SurveyMiss'] = ''
master_df['Comment'] = ''

# Anchor rows: Morning/Day surveys that carry a bed time in 'MR_bed+'.
mr_bed_df = master_df[(~master_df['MR_bed+'].isnull()) & (master_df['SurvNamePlus'].isin(['Morning', 'Day']))]

for index, row in tqdm(mr_bed_df.iterrows(), total=mr_bed_df.shape[0]):
    # Position of the anchor row inside master_df; the scan starts one row above it.
    current_index_position = master_df.index.get_loc(index)

    # Submission time and bed time as time-of-day offsets, plus the submission date.
    sub_time = pd.to_timedelta(row['SubTime'])
    bed_time = pd.to_timedelta(row['MR_bed+'])
    date = pd.to_datetime(row['SubDate']).date()
    
    # When the survey was submitted earlier in the day than the bed time,
    # the bed time is placed on the previous calendar day.
    if sub_time < bed_time:
        date = date - pd.Timedelta(days=1)
        
    # Bed time as a datetime.time; falls back to the 'MR_bed' column when
    # 'MR_bed+' does not parse as HH:MM:SS.
    try:
        bed_time_hms = pd.to_datetime(row['MR_bed+'], format='%H:%M:%S').time()
    except Exception as e:
        bed_time_hms = pd.to_datetime(row['MR_bed'], format='%H:%M:%S').time()
    
    # Combine the adjusted date with 'bed_time_hms'
    cutoff_datetime = pd.Timestamp.combine(pd.Timestamp(date), bed_time_hms)

    # Walk upwards from the row just above the anchor towards the top of the frame.
    # NOTE(review): back_index is a position but is used as a label in .at[...];
    # this assumes master_df has a default 0..n-1 index - confirm.
    for back_index in range(current_index_position - 1, -1, -1):
        try:            
            # Trigger timestamp of the scanned row (built from TrigDate/TrigTime, despite the name).
            sub_datetime = pd.to_datetime(master_df.iloc[back_index]['TrigDate'] + ' ' + master_df.iloc[back_index]['TrigTime'])

            # How long before the bed-time cutoff the scanned row was triggered.
            tdifference = cutoff_datetime - sub_datetime

            if (master_df.iloc[back_index]['SurvNamePlus'] == 'Start') or (master_df.iloc[back_index]['SurvNamePlus'] == 'Morning'):
                # A 'Start' or 'Morning' row ends the scan for this anchor.
                # NOTE(review): this test runs before the subject check below, so the
                # comment can be written onto a row that belongs to another subject.
                master_df.at[back_index, 'Comment'] = 'Start/Morning Reached. Breaking.'
                break
            elif (master_df.iloc[back_index]['SubID'] != row['SubID']):
                # Rows of another subject end the scan as well.
                master_df.at[back_index, 'Comment'] = 'Different Subject.'
                break
            elif 'Missed' in master_df.iloc[back_index]['RespType'] and 'Drinking Follow' in master_df.iloc[back_index]['SurvName']:
                # Missed drinking follow-up: candidate for a SurveyMiss flag.
                master_df.at[back_index, 'Comment'] = 'Potential'
                
                # Triggered more than a day before the cutoff: stop scanning.
                if tdifference > pd.Timedelta(days=1):
                    master_df.at[back_index, 'Comment'] = 'Too far back.'
                    break

                # Default to 0, then raise to 1 when the trigger precedes the cutoff.
                master_df.at[back_index, 'SurveyMiss'] = 0
                    
                try:
                    if sub_datetime < cutoff_datetime:
                        master_df.at[back_index, 'SurveyMiss'] = 1
                except Exception as e:
                    print(master_df.iloc[back_index])
            # Rows that match none of the branches are skipped and the scan continues.
        except Exception as e:
            # Any failure on a scanned row is printed with the row itself,
            # and the scan for this anchor is abandoned.
            print(e)
            traceback.print_exc()
            print(master_df.iloc[back_index])
            break


In [None]:
# (rows, columns) of the master frame after the SurveyMiss pass.
master_df.shape

In [None]:
# Narrow view of the master frame for eyeballing the SurveyMiss / Comment
# results: keep only the identifying and timing columns, then show
# `chunk_size` rows starting at position `x`.
temp = master_df[[
    'RespType', 'SurvName', 'SurvType', 'SubID', 'Weekend', 'Day',
    'InitDate', 'InitTime', 'SubDate', 'SubTime', 'TrigDate', 'TrigTime',
    'MR_bed', 'MR_bed+', 'SurveyMiss', 'Comment',
]]
x = 55000
chunk_size = 2
temp.iloc[x:].head(chunk_size)

### Adding SurveyCount Column

In [None]:
# Frequency of each response type; the SurveyCount rules in the next cell branch on these values.
master_df['RespType'].value_counts()

In [None]:
# SurveyCount starts empty and is filled by the rules below. The rules are
# applied in order, so a later rule overwrites an earlier one on the same row.

# Rows that were missed, whether recorded as such or synthesised.
is_missed = master_df['RespType'].isin(['Missed', 'Missed (Synthetic)'])
is_morning_or_daytime = master_df['SurvName'].isin(['Morning Reports', 'Daytime Surveys'])
is_first_drink = master_df['SurvName'].isin(['After Your 1st Drink'])

# Submitted surveys count as 1; everything else stays '' for now.
master_df['SurveyCount'] = ''
master_df['SurveyCount'] = np.where(master_df['RespType'].eq('Submission'), 1, master_df['SurveyCount'])

# Missed surveys flagged with SurveyMiss = 1 -> 0
condition2 = is_missed & (master_df['SurveyMiss'] == 1)
master_df.loc[condition2, 'SurveyCount'] = 0

# Missed Morning Reports / Daytime Surveys -> 0
master_df.loc[is_missed & is_morning_or_daytime, 'SurveyCount'] = 0

# Missed 'After Your 1st Drink' surveys -> 2
master_df.loc[is_missed & is_first_drink, 'SurveyCount'] = 2

# Missed surveys whose SurveyMiss is still '' are not given a value by a
# SurveyMiss rule (a -1 marker for that case was tried and left disabled).

# Condition 3: RespType = Missed or Missed (Synthetic) AND SurveyMiss = 0, SurveyCount should be 2
condition3 = is_missed & (master_df['SurveyMiss'] == 0)
master_df.loc[condition3, 'SurveyCount'] = 2

### Column Ordering and Renaming

In [None]:
# Final column layout of the exported table. Columns of master_df that are not
# listed here are dropped from the output.
column_order = ['SubID', 'Burst', 'Weekend', 'Day', 'RespID', 'RespType', 'SurveyMiss', 'SurveyCount', 'UserID', 'SurvName', 'SurvNamePlus', 'SurvType', 'InitDate', 'InitTime', 'SubDate',
                'SubTime', 'TrigDate', 'TrigTime',  'Drinking_bin', 'UI_DrnkFin', 'UI_DrnkNum', 'DrnkNum', 'DrnkNum_combine', 'DrnkNum_cumulative',
                'MR_wake', 'MR_wake+', 'MR_rise', 'MR_rise+', 'MR_bed', 'MR_bed+', 'MR_duration_bed_wake', 'MR_duration_bed_wake_minutes', 'MR_FlSlp',
                'MR_SlpQlt', 'MR_DrnkNum', 'MR_DrnkDur_original', 'MR_DrnkDur', 'MR_DrnkDur_min', 'MR_rlx', 'MR_hngvr', 'MR_Embrssd', 'MR_agrsv', 'MR_vom', 'MR_hrt', 'MR_rmbr', 'MR_work', 'MR_rude',
                'MR_soc', 'MR_exprs', 'MR_guilt', 'MR_expctd', 'MR_mood', 'MR_mrng', 'MR_prmsd', 'MR_more', 'MR_fght', 'MR_enrgtc', 'MR_PsdOut', 'MR_MsdClss',
                'MR_Impls', 'MR_sleep', 'MR_weed', 'MR_plan', 'chr', 'anoy', 'upbt', 'sad', 'irit', 'hap', 'bor', 'cnt', 'enrgz', 'excit', 'sedat', 'slotht', 'slug',
                'up', 'crntloc', 'othrloc', 'wthothr', 'NumOth', 'OthTyp', 'OthWho', 'EnjInt', 'ActCld', 'ActAcc', 'OthCld', 'OthAcc', 'Vrtl', 'AlnLke', 'AlnGd', 'WshOth',
                'lnly', 'rjct', 'incl', 'acpt', 'vrtint', 'vrtoth', 'vrtwho', 'vrtenj', 'vrtcld', 'VrtAcc', 'VrtOthCld', 'VrtOthAcc', 'NotInt', 'WshInt', 'UI_NtInt_lnly', 'UI_NtInt_rjct',
                'UI_NtInt_incl', 'UI_NtInt_acpt','said', 'spent', 'impt', 'spur','RP_lonely', 'RP_reject', 'RP_included', 'RP_accepted', 'RP_Drnk', 'Plsur', 'Rliev', 'IntxNow', 'MorAlc']

# Shorter names used in the delivered file.
column_renames = {
    'Burst': 'Brst',
    'Weekend': 'Wknd',
    'MR_duration_bed_wake': 'MR_duration',
    'MR_duration_bed_wake_minutes': 'MR_duration_minutes',
}

# Select the columns in the requested order, then apply the renames in one chain.
master_df_ordered = master_df.loc[:, column_order].rename(columns=column_renames)

display(master_df_ordered.head())
print(master_df_ordered.columns.tolist())

### Formatting SurvName with granular details

In [None]:
# Append the follow-up variant stored in SurvNamePlus to the generic
# 'Drinking Follow-Ups' survey name, separated by a space.
df_filtered = master_df_ordered[master_df_ordered['SurvName'] == 'Drinking Follow-Ups']

# Only rows that actually have a SurvNamePlus value are rewritten: string + NaN
# is NaN, so a missing suffix would otherwise erase the survey name itself.
rows_with_suffix = df_filtered.index[df_filtered['SurvNamePlus'].notna()]
master_df_ordered.loc[rows_with_suffix, 'SurvName'] += ' ' + df_filtered.loc[rows_with_suffix, 'SurvNamePlus']

In [None]:
# Write the ordered master table to master_df_fp as CSV, without the row index.
master_df_ordered.to_csv(master_df_fp, index = False)

In [None]:
# Write one row per distinct (MR_DrnkDur_original, MR_DrnkDur) pair, with its frequency, to master_vc_df_fp.
# NOTE(review): value_counts drops rows containing missing values by default, so such pairs do not appear in the file - confirm that is intended.
master_df_ordered[['MR_DrnkDur_original', 'MR_DrnkDur']].value_counts().reset_index().to_csv(master_vc_df_fp, index=False)

In [None]:
# master_df_ordered[['MR_DrnkDur_original', 'MR_DrnkDur']].value_counts().reset_index()

In [None]:
# Show 200 randomly chosen rows of the final table (unseeded, so the rows differ between runs).
master_df_ordered.sample(200)

In [None]:
# Distribution of the SurveyCount values, including the '' left on rows no rule assigned.
master_df_ordered['SurveyCount'].value_counts()

In [None]:
# First 100 rows that are either Morning Reports or 'After Your 1st Drink' surveys.
master_df_ordered[master_df_ordered['SurvName'].isin(['Morning Reports', 'After Your 1st Drink'])].head(100)

In [None]:
# Five randomly chosen 'After Your 1st Drink' rows (unseeded).
master_df_ordered[master_df_ordered['SurvName'] == 'After Your 1st Drink'].sample(5)

In [None]:
# Cross-tabulation check: how often each (RespType, SurveyCount, SurvName) combination occurs.
master_df_ordered[['RespType', 'SurveyCount', 'SurvName']].value_counts().reset_index()

In [None]:
# Frequency of each response type in the final table.
master_df_ordered['RespType'].value_counts()