### Import dependecies

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
import traceback
import re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

### Initialize File Paths

In [2]:
input_file_path = '/Users/sauravyadav/Documents/Repos/Datasets/Drinking_Surveys/output/data_draft.csv'
output_file_path = '/Users/sauravyadav/Documents/Repos/Datasets/Drinking_Surveys/output/data_draft_v2.csv'

In [3]:
df = pd.read_csv(input_file_path)

### TO DO
- Create Missed Survey Records
    - For every Survey, there should be 12 follow up surveys

### 1. Create Missed Survey Records

In [4]:
df.head(2)

Unnamed: 0,SubID,Burst,Weekend,Day,RespID,RespType,UserID,SurvName,SurvType,InitDate,InitTime,SubDate,SubTime,TrigDate,TrigTime,UI_DrnkFin,UI_DrnkNum,DrnkNum,Plsur,Rliev,IntxNow,MorAlc,enrgz,excit,sedat,slotht,slug,up,crntloc,othrloc,wthothr,NumOth,OthTyp,OthWho,EnjInt,ActCld,ActAcc,OthCld,OthAcc,Vrtl,AlnLke,AlnGd,WshOth,lnly,rjct,incl,acpt,vrtint,vrtoth,vrtwho,vrtenj,vrtcld,VrtAcc,VrtOthCld,VrtOthAcc,NotInt,WshInt,UI_NtInt_lnly,UI_NtInt_rjct,UI_NtInt_incl,UI_NtInt_acpt,said,spent,impt,spur
0,2001,1,1,Thu,5ab44c822c9b2f5baaa6aa80,Submission,5a8381835893ca7070656612,After Your 1st Drink,Assessment,2018-03-22,17:52:06,2018-03-22,20:38:26,2018-03-22,05:00:00,1.0,1.0,,59.0,34.0,22.0,35.0,37.0,33.0,54.0,42.0,40.0,28.0,1.0,CONDITION_SKIPPED,1.0,3.0,3.0,CONDITION_SKIPPED,62.0,64.0,34.0,74.0,30.0,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,CONDITION_SKIPPED,0.0,0.0,9.0,6.0
1,2001,1,1,Thu,d0274600-d886-4a59-9c8e-8bcde51902de,Missed,5a8381835893ca7070656612,Drinking Follow-Ups,,2018-03-22,21:08:26,2018-03-22,21:18:26,2018-03-22,21:08:26,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Helper functions to create a new row

In [5]:
def get_next_date_time_day(prev_datetime_str: str):
    new_datetime = datetime.strptime(prev_datetime_str, '%Y-%m-%d %H:%M:%S') + timedelta(minutes=30)

    new_date_str = new_datetime.strftime('%Y-%m-%d')
    new_time_str = new_datetime.strftime('%H:%M:%S')
    new_day = new_datetime.strftime('%a')
    
    return new_date_str, new_time_str, new_day
    
def get_next_row(prev_row, isFirst = False):

    if isFirst:
        prev_datetime_str = prev_row['SubDate'] + ' ' + prev_row['SubTime']
    else:
        prev_datetime_str = prev_row['TrigDate'] + ' ' + prev_row['TrigTime']
    
    new_date_str, new_time_str, new_day = get_next_date_time_day(prev_datetime_str)
    
    
    new_row = {'SubID' : prev_row['SubID'], 'Burst' : prev_row['Burst'],
                  'Weekend' : prev_row['Weekend'], 'Day' : new_day,
                'RespType' : 'Missed (Synthetic)', 'UserID' : prev_row['UserID'], 'SurvName' : 'Drinking Follow-Ups',
                'TrigDate': new_date_str, 'TrigTime' : new_time_str}
    
    return new_row
    

### Iterative technique

In [6]:

# expected_trigger_time = None
# prev_row = None
# i = 0
# row_list = []
# session_row_count = 0

# while i < len(df):
#     row = df.iloc[i]

#     if row['SurvName'] != 'After Your 1st Drink' and session_row_count >= 13:
#         print(f'check: {i}')
        
#     # First of the session OR Last Session Over
#     if row['SurvName'] == 'After Your 1st Drink' or session_row_count == 13:
        
#         # Finish previous session
#         while session_row_count > 0 and session_row_count < 13:
#             new_row = get_next_row(prev_row)
#             row_list.append(new_row)
#             prev_row = new_row
#             session_row_count += 1


#         # if row['SurvName'] == 'After Your 1st Drink' and row['RespType'] == 'Missed':
#         #     i += 1
#         #     continue
            
#         expected_trigger_time = datetime.strptime(row['SubTime'], '%H:%M:%S') + timedelta(minutes=30)
#         i += 1
#         session_row_count = 1
#         prev_row = row
#         row_list.append(row.to_dict())
#         continue
        

#     # if record is present
#     elif datetime.strptime(row['TrigTime'], '%H:%M:%S') - expected_trigger_time > timedelta(minutes=5):
#         new_row = get_next_row(prev_row)
#         row_list.append(new_row)
#         prev_row = new_row
#         session_row_count += 1
#         if session_row_count > 12 :
#             print(f'{i} and {len(row_list)-1}')

#     else:
#         prev_row = row
#         row_list.append(row)
#         session_row_count += 1
#         i += 1
    
#     expected_trigger_time = expected_trigger_time + timedelta(minutes=30)


### Based on Number Matching

In [7]:
row_list = []
info = []
session_start_df = df[df['SurvName'] == 'After Your 1st Drink']

for index, row in tqdm(session_start_df.iterrows(), total=len(session_start_df)):

    if row['RespType'] == 'Missed':
        row_list.append(row.to_dict())
        # print(f'Check for missed first: {index}')
        continue

    row_list.append(row.to_dict())
    follow_up_survey_count = 0
    dummy_row_count = 0
    next_row = get_next_row(row, True) #follow_up_survey_count == 0)
    
    while follow_up_survey_count < 12:
        
        next_row_in_df = df[(df['SubID'] == row['SubID']) & (df['TrigDate'] == next_row['TrigDate']) & (df['TrigTime'] == next_row['TrigTime'])]
        if  len(next_row_in_df) == 1:
            next_row = next_row_in_df.iloc[0].to_dict()
            row_list.append(next_row)    
        elif len(next_row_in_df) == 0:
            dummy_row_count += 1
            row_list.append(next_row)
        else:
            # print(f'Check for duplicate submission time: {next_row_in_df.iloc[0].name}')
            submitted_survey_df = next_row_in_df[next_row_in_df['RespType'] == 'Submission']
            if len(submitted_survey_df)>0:
                next_row = submitted_survey_df.iloc[0].to_dict()
            else:
                next_row = next_row_in_df.iloc[0].to_dict()
            row_list.append(next_row)

        next_row =  get_next_row(next_row, False)
        follow_up_survey_count += 1

    if dummy_row_count > 0:
        info.append({'SubID': row['SubID'], 'Rows Added': dummy_row_count, 'TrigTime': row['TrigTime']})

100%|███████████████████████████████████████| 4416/4416 [02:01<00:00, 36.40it/s]


In [8]:
res_df = pd.DataFrame(row_list).reset_index(drop=True)
info_df = pd.DataFrame(info).reset_index(drop=True)

In [9]:
def df_num(df):
    
    session_count = len(df[ (df['SurvName'] == 'After Your 1st Drink') & (df['RespType'] != 'Missed')])
    missed_first_count = len(df[(df['SurvName'] == 'After Your 1st Drink') & (df['RespType'] == 'Missed')] )
    
    print(f'Session Count: {session_count}')
    follow_up_survey_count = len(df[df['SurvName'] == 'Drinking Follow-Ups'])
    print(f'Follow_up_survey_count: {follow_up_survey_count}')
    
    print(f'Required rows: {session_count*12 - follow_up_survey_count}')

In [10]:
df_num(df)
df_num(res_df)

Session Count: 3990
Follow_up_survey_count: 39537
Required rows: 8343
Session Count: 3990
Follow_up_survey_count: 47880
Required rows: 0


### New Column: Drinking Bin

In [11]:
res_df['Drinking_bin'] = res_df.apply(lambda x: 1 if ((pd.notnull(x['DrnkNum']) and x['DrnkNum'] > 1)  or (x['SurvName'] == 'After Your 1st Drink' and x['RespType'] != 'Missed' and x['RespType'] != 'Missed (Synthetic)')) else (0 if (pd.notnull(x['DrnkNum']) and x['DrnkNum'] == 1 and x['RespType'] != 'Missed'and x['RespType'] != 'Missed (Synthetic)') else ''), axis = 1)

### New Columns: 'DrnkNum_combine', 'DrnkNum_cumulative'

In [12]:
res_df['DrnkNum_combine'] = res_df.apply(lambda x: x['DrnkNum'] - 1 if (pd.notnull(x['DrnkNum']) and x['DrnkNum'] > 1 and x['RespType'] != 'Missed' and x['SurvName'] == 'Drinking Follow-Ups')  else (x['UI_DrnkNum'] if (x['SurvName'] == 'After Your 1st Drink' and x['RespType'] != 'Missed')  else (0 if (pd.notnull(x['DrnkNum']) and x['DrnkNum'] == 1 and x['RespType'] != 'Missed' and x['RespType'] != 'Missed (Synthetic)') else None)), axis = 1)
res_df['DrnkNum_combine'] = res_df['DrnkNum_combine'].astype('Int64')

In [13]:
grouped = res_df.groupby(['SubID', 'Weekend', 'Burst'])
res_df['DrnkNum_cumulative'] = None

for group_name, group_df in grouped:
    cumulativeDrinkNum = 0
    for index, row in group_df.iterrows():
        
        if row['SurvName'] == 'After Your 1st Drink':
            cumulativeDrinkNum = 0

        if row['RespType'] == 'Missed'  or row['RespType'] == 'Missed (Synthetic)':
            continue
            
        cumulativeDrinkNum += 0 if pd.isna(row['DrnkNum_combine']) else row['DrnkNum_combine']
        res_df.at[index, 'DrnkNum_cumulative'] = cumulativeDrinkNum

In [14]:
res_df = res_df[['SubID','Burst', 'Weekend', 'Day', 'RespID', 'RespType', 'UserID', 'SurvName', 'SurvType',
       'InitDate', 'InitTime', 'SubDate', 'SubTime', 'TrigDate', 'TrigTime', 'Drinking_bin',
       'UI_DrnkFin', 'UI_DrnkNum', 'DrnkNum', 'DrnkNum_combine', 'DrnkNum_cumulative', 'Plsur', 'Rliev', 'IntxNow',
       'MorAlc', 'enrgz', 'excit', 'sedat', 'slotht', 'slug', 'up', 'crntloc',
       'othrloc', 'wthothr', 'NumOth', 'OthTyp', 'OthWho', 'EnjInt', 'ActCld',
       'ActAcc', 'OthCld', 'OthAcc', 'Vrtl', 'AlnLke', 'AlnGd', 'WshOth',
       'lnly', 'rjct', 'incl', 'acpt','vrtint', 'vrtoth',
        'vrtwho', 'vrtenj', 'vrtcld', 'VrtAcc', 'VrtOthCld',
       'VrtOthAcc', 'NotInt', 'WshInt', 'UI_NtInt_lnly', 'UI_NtInt_rjct',
       'UI_NtInt_incl', 'UI_NtInt_acpt', 'said', 'spent', 'impt', 'spur']]

In [15]:
res_df.to_csv(output_file_path, index=False)