### Import dependencies

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
import traceback
import re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas settings: lift the row/column display limits so frames
# are rendered without truncation, and turn off the chained-assignment warning.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

### Initialize File Paths

In [2]:
# All inputs/outputs live in one directory. It defaults to the original
# location but can be redirected without editing the notebook by setting the
# DRINKING_SURVEYS_DATA_DIR environment variable before starting the kernel.
DATA_DIR = os.environ.get(
    'DRINKING_SURVEYS_DATA_DIR',
    '/Users/sauravyadav/Documents/Repos/Datasets/Drinking_Surveys/output',
)

# Earlier Excel export of the coded data (no longer read): data_draft_v2_coded.xlsx
input_file_path = os.path.join(DATA_DIR, 'data_draft_v2_coded_plus.csv')   # coded draft (carries the Overlap column)
og_input_file_path = os.path.join(DATA_DIR, 'data_draft_v2.csv')           # original draft
output_file_path = os.path.join(DATA_DIR, 'data_draft_v3.csv')             # written at the end of the notebook

In [3]:
# Read the coded draft and the original draft (the coded data used to be
# loaded from an Excel export via pd.read_excel).
df = pd.read_csv(input_file_path)
og_df = pd.read_csv(og_input_file_path)

# Copy SurvNamePlus over from the original draft (aligned on the row index).
df['SurvNamePlus'] = og_df['SurvNamePlus']

# Re-render the trigger and submission timestamps as plain strings:
# dates as YYYY-MM-DD, times as HH:MM:SS.
for prefix in ('Trig', 'Sub'):
    date_col = f'{prefix}Date'
    time_col = f'{prefix}Time'
    df[date_col] = pd.to_datetime(df[date_col]).dt.strftime('%Y-%m-%d')
    df[time_col] = pd.to_datetime(df[time_col], format='%H:%M:%S').dt.strftime('%H:%M:%S')


In [4]:
# Columns used to match a record in the coded frame to the original draft.
merge_keys = ['SubID', 'TrigDate', 'TrigTime', 'SubDate', 'SubTime']

# Keep every row of the original draft and attach the coded `Overlap` value
# where the key columns match (left join); `df` is rebound to the result.
overlap_lookup = df[['Overlap'] + merge_keys]
df = og_df.merge(overlap_lookup, on=merge_keys, how='left').reset_index(drop=True)

### Checking that every submitted start session has 12 corresponding follow-up records

In [5]:
def check_session_completeness(df, followups_per_session=12):
    """Check that each subject has the expected number of follow-up records.

    For every ``SubID`` the number of submitted 'After Your 1st Drink' surveys
    (``RespType == 'Submission'``) is counted as ``session_count``. A subject is
    flagged complete when the total number of 'Drinking Follow-Ups' rows (of any
    ``RespType``) equals ``session_count * followups_per_session``.

    Intermediate tables are shown with ``display`` (expected to be provided by
    the IPython/Jupyter environment) and the subjects that have no submitted
    start survey are printed as "Missing users".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'SubID', 'RespType' and 'SurvName'.
    followups_per_session : int, default 12
        Number of follow-up records expected for each submitted start survey.

    Returns
    -------
    pandas.DataFrame
        One row per subject with at least one submitted start survey, with
        columns 'SubID', 'session_count', one count column per remaining
        survey name (including 'Drinking Follow-Ups') and the boolean
        'is_complete'.
    """
    start_survey = 'After Your 1st Drink'
    followup_survey = 'Drinking Follow-Ups'

    # Submitted start surveys per subject.
    is_submitted_start = (df['RespType'] == 'Submission') & (df['SurvName'] == start_survey)
    session_count_df = (
        df.loc[is_submitted_start]
        .groupby('SubID')
        .size()
        .reset_index(name='session_count')
    )

    display(session_count_df.head())

    # Rows per subject and survey name, pivoted to one column per survey.
    # size().reset_index(name=...) fixes the count column's name explicitly
    # instead of relying on how value_counts() names its result.
    survey_counts = df.groupby(['SubID', 'SurvName']).size().reset_index(name='count')
    freq_df = survey_counts.pivot_table(index='SubID', columns='SurvName', values='count', fill_value=0)
    freq_df.columns.name = None
    freq_df = freq_df.reset_index()
    # The start survey is already summarised in session_count; tolerate frames
    # that contain no such rows at all.
    freq_df = freq_df.drop(columns=[start_survey], errors='ignore')
    if followup_survey not in freq_df.columns:
        # No follow-up rows anywhere: treat as zero rather than raising KeyError.
        freq_df[followup_survey] = 0

    display(freq_df.head())

    # Inner join: subjects without a submitted start survey drop out here and
    # are reported below as missing.
    match_df = pd.merge(session_count_df, freq_df, on='SubID')
    expected_followups = match_df['session_count'] * followups_per_session
    match_df['is_complete'] = expected_followups == match_df[followup_survey]
    display(match_df['is_complete'].value_counts().reset_index())

    # Computed outside the f-string: reusing the same quote character inside
    # an f-string is a SyntaxError before Python 3.12.
    missing_users = set(df['SubID']) - set(match_df['SubID'])
    print(f'Missing users: {missing_users}')
    return match_df

In [6]:
# Run the completeness check on the merged frame, before any overlap filtering.
a = check_session_completeness(df)

Unnamed: 0,SubID,session_count
0,2001,13
1,2002,12
2,2004,21
3,2005,9
4,2006,4


Unnamed: 0,SubID,Drinking Follow-Ups
0,2001,156.0
1,2002,144.0
2,2004,252.0
3,2005,108.0
4,2006,48.0


Unnamed: 0,is_complete,count
0,True,390


Missing users: {2772, 2548}


## Removing Overlapping sessions

In [7]:
# Drop the rows whose Overlap value is 2; rows with a missing Overlap are kept
# because NaN compares as "not equal".
# NOTE(review): presumably Overlap == 2 marks the overlapping session to discard — confirm against the coding scheme.
keep_mask = df['Overlap'].ne(2)
out_df = df[keep_mask]

# Spot-check three random rows of the filtered frame.
out_df.sample(n=3)

Unnamed: 0,SubID,Burst,Weekend,Day,RespID,RespType,UserID,SurvName,SurvNamePlus,SurvType,InitDate,InitTime,SubDate,SubTime,TrigDate,TrigTime,Drinking_bin,UI_DrnkFin,UI_DrnkNum,DrnkNum,DrnkNum_combine,DrnkNum_cumulative,Plsur,Rliev,IntxNow,MorAlc,enrgz,excit,sedat,slotht,slug,up,crntloc,othrloc,wthothr,NumOth,OthTyp,OthWho,EnjInt,ActCld,ActAcc,OthCld,OthAcc,Vrtl,AlnLke,AlnGd,WshOth,lnly,rjct,incl,acpt,vrtint,vrtoth,vrtwho,vrtenj,vrtcld,VrtAcc,VrtOthCld,VrtOthAcc,NotInt,WshInt,UI_NtInt_lnly,UI_NtInt_rjct,UI_NtInt_incl,UI_NtInt_acpt,said,spent,impt,spur,Overlap
23438,2347,2,4,Sat,f23c4d29-801e-40e6-a1be-699c8c3e6a36,Missed,5cc8b2d44cd892706d0c4745,Drinking Follow-Ups,Long,,2019-11-23,01:04:00,2019-11-23,01:14:00,2019-11-23,01:04:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
40958,2645,2,4,Fri,,Missed (Synthetic),6245e59ee65dd6d4327d3e49,Drinking Follow-Ups,Short,,,,,,2022-10-14,22:04:18,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
5393,2068,2,3,Fri,e5039e3b-712f-4053-8f43-a62041d7ac7f,Missed,5ac6ab160dfaec05de407bad,Drinking Follow-Ups,Short,,2019-01-11,00:45:25,2019-01-11,00:55:25,2019-01-11,00:45:25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0


In [8]:
# Repeat the completeness check on the frame with the Overlap == 2 rows removed.
b = check_session_completeness(out_df)

Unnamed: 0,SubID,session_count
0,2001,13
1,2002,12
2,2004,16
3,2005,9
4,2006,4


Unnamed: 0,SubID,Drinking Follow-Ups
0,2001,156.0
1,2002,144.0
2,2004,192.0
3,2005,108.0
4,2006,48.0


Unnamed: 0,is_complete,count
0,True,390


Missing users: {2772, 2548}


In [9]:
# Subjects whose follow-up count does not match their session count.
b.loc[~b['is_complete']]

Unnamed: 0,SubID,session_count,Drinking Follow-Ups,is_complete


In [10]:
# Compare the row/column counts for subject 2018 before and after the overlap filter.
for frame in (df, out_df):
    subject_rows = frame[frame['SubID'] == 2018]
    print(subject_rows.shape)

(247, 70)
(156, 70)


In [11]:
# df[df['SubID'] == 2018]

In [12]:
# Write the filtered frame to output_file_path as CSV, leaving out the DataFrame index.
out_df.to_csv(output_file_path, index = False)

In [13]:
# Show the first rows of the filtered frame whose 'Burst' value is missing.
out_df[out_df['Burst'].isna()].head()

Unnamed: 0,SubID,Burst,Weekend,Day,RespID,RespType,UserID,SurvName,SurvNamePlus,SurvType,InitDate,InitTime,SubDate,SubTime,TrigDate,TrigTime,Drinking_bin,UI_DrnkFin,UI_DrnkNum,DrnkNum,DrnkNum_combine,DrnkNum_cumulative,Plsur,Rliev,IntxNow,MorAlc,enrgz,excit,sedat,slotht,slug,up,crntloc,othrloc,wthothr,NumOth,OthTyp,OthWho,EnjInt,ActCld,ActAcc,OthCld,OthAcc,Vrtl,AlnLke,AlnGd,WshOth,lnly,rjct,incl,acpt,vrtint,vrtoth,vrtwho,vrtenj,vrtcld,VrtAcc,VrtOthCld,VrtOthAcc,NotInt,WshInt,UI_NtInt_lnly,UI_NtInt_rjct,UI_NtInt_incl,UI_NtInt_acpt,said,spent,impt,spur,Overlap
