In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
import traceback
import re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas settings: show every row and column when a frame is
# displayed, and turn off the chained-assignment (SettingWithCopy) check.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [None]:
# Directory that holds the input CSV files. Set the MASTER_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# location is used, so the resulting paths are unchanged.
datasets_dir = os.environ.get('MASTER_DATA_DIR', '/Users/sauravyadav/Documents/Repos/Datasets')

# Master survey dataset read by the notebook.
master_df_fp = os.path.join(datasets_dir, 'master_data_draft.csv')

# Path for the distribution dataset (not referenced in the cells visible here).
distribution_df_fp = os.path.join(datasets_dir, 'master_data_distribution_draft.csv')

In [None]:
# Load the master dataset from the CSV at master_df_fp into the working DataFrame.
df = pd.read_csv(master_df_fp)

In [None]:
# Preview the first three rows of the loaded data.
df.head(3)

### Total number of participants

In [None]:
# Number of distinct participant IDs (SubID) in the dataset.
df['SubID'].nunique()

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Replace missing SurveyCount values with the sentinel -1 so they can be matched
# by an equality test; the category mapping further down labels -1 as
# 'missed (not enough info)'.
df['SurveyCount'] = df['SurveyCount'].fillna(-1)

## Distribution by SurvName

In [None]:
# Number of records for each SurveyCount code.
df['SurveyCount'].value_counts()

In [None]:
# SurveyCount code -> category label, kept as pairs so each code sits next to
# its label. The order matters: `conditions` and `values` are matched by position.
count_labels = [
    (1, 'submission'),
    (-1, 'missed (not enough info)'),
    (0, 'missed (penalize)'),
    (2, "missed (don't penalize)"),
]

# One boolean mask per code, in the same order as the labels.
conditions = [df['SurveyCount'] == code for code, _ in count_labels]

# Labels corresponding to each mask.
values = [label for _, label in count_labels]

In [None]:
# Label every row by its SurveyCount code. Rows that match no condition get
# None (a missing value). The previous default of np.nan has no common dtype
# with the string labels: depending on the NumPy version, np.select either
# raises TypeError or stores the text 'nan', which would then be counted as a
# category of its own.
df['SurveyCategory'] = np.select(conditions, values, default=None)

# Record count per (survey name, category); groupby drops rows whose key is missing.
dist_df = df.groupby(['SurvName', 'SurveyCategory']).size().reset_index(name='RecordCount')

# Categories as rows, survey names as columns.
pivot_df = dist_df.pivot(index='SurveyCategory', columns='SurvName', values='RecordCount').reset_index()

# A combination that never occurs has no row in dist_df; show it as 0.
pivot_df = pivot_df.fillna(0)
display(pivot_df)

## Distribution: Participants who submitted at least 1 survey per weekend

In [None]:
# Weekend groups to summarise: each single weekend, consecutive pairs, and all six together.
weekend_groups = [[1],[2], [1,2] ,[3],[4],[3,4],[5],[6],[5,6], [1,2,3,4,5,6]]
total_pars = df.SubID.nunique()
res_list = []

# The submission filter does not depend on the weekend group, so apply it once
# instead of re-filtering the full frame on every iteration.
submissions_df = df.loc[df['RespType'] == 'Submission', ['SubID', 'Brst', 'Wknd', 'RespType']]

for wgp in tqdm(weekend_groups):
    temp_df = submissions_df[submissions_df['Wknd'].isin(wgp)]
    wgp_readable = ','.join(map(str, wgp))

    # 'Burst' is read from the first matching row. Guard the lookup: .iloc[0]
    # raises IndexError when a weekend group has no submissions at all.
    burst = temp_df['Brst'].iloc[0] if len(temp_df) else np.nan
    res_dict = {'Burst' : burst, 'Weekend' : wgp_readable}

    # Number of submissions per participant within this weekend group.
    temp_df = temp_df.groupby(['SubID']).size().reset_index(name='Count')

    # Participants absent from temp_df made no submission in these weekends.
    res_dict['Number of Pars (0)'] = total_pars - temp_df['SubID'].nunique()
    res_dict['Number of Pars (>=1)'] = temp_df[temp_df['Count']>=1]['SubID'].nunique()
    res_dict['Number of Pars (>=1 & <10)'] = temp_df[(temp_df['Count']>=1) & (temp_df['Count']<10)]['SubID'].nunique()
    res_dict['Number of Pars (>=10)'] = temp_df[temp_df['Count'] >= 10]['SubID'].nunique()
    res_list.append(res_dict)

# One row per weekend group.
res_dist_df = pd.DataFrame(res_list)

display(res_dist_df)
    
  


## Distribution: Participants who submitted at least 1 survey per weekend, by survey type

In [None]:
# Same weekend groups as the per-weekend summary, now broken down by survey type.
weekend_groups = [[1], [2], [1, 2], [3], [4], [3, 4], [5], [6], [5, 6], [1, 2, 3, 4, 5, 6]]
total_pars = df['SubID'].nunique()
res_list2 = []

for wgp in tqdm(weekend_groups):
    # Keep only submitted surveys that fall inside the current weekend group.
    temp_df = df[['SubID', 'Brst', 'Wknd', 'RespType', 'SurvName']]
    is_submission = temp_df['RespType'] == 'Submission'
    in_weekend_group = temp_df['Wknd'].isin(wgp)
    temp_df = temp_df[is_submission & in_weekend_group]
    wgp_readable = ','.join(str(wknd) for wknd in wgp)

    # One row per (participant, survey type) holding the number of submissions.
    temp_df = temp_df.groupby(['SubID', 'SurvName']).size().reset_index(name='Count')

    # Only survey types with at least one submission in this group produce a row.
    for survname in temp_df['SurvName'].unique():
        temp_survey_df = temp_df[temp_df['SurvName'] == survname]
        submission_counts = temp_survey_df['Count']
        res_dict = {
            'Weekend': wgp_readable,
            'SurvName': survname,
            'Number of Pars (0)': total_pars - temp_survey_df['SubID'].nunique(),
            'Number of Pars (>=1)': temp_survey_df.loc[submission_counts >= 1, 'SubID'].nunique(),
            'Number of Pars (>=1 & <10)': temp_survey_df.loc[
                (submission_counts >= 1) & (submission_counts < 10), 'SubID'
            ].nunique(),
            'Number of Pars (>=10)': temp_survey_df.loc[submission_counts >= 10, 'SubID'].nunique(),
        }
        res_list2.append(res_dict)

# One row per (weekend group, survey type).
res_dist2_df = pd.DataFrame(res_list2)

display(res_dist2_df)

In [26]:
# Show the first row of the per-weekend, per-survey summary.
res_dist2_df.head(1)

Unnamed: 0,Weekend,SurvName,Number of Pars (0),Number of Pars (>=1),Number of Pars (>=1 & <10),Number of Pars (>=10)
0,1,After Your 1st Drink,10,383,383,0


In [28]:
# Wide -> long: one row per (Weekend, SurvName, participant-count category).
melted_df = pd.melt(
    res_dist2_df,
    id_vars=['Weekend', 'SurvName'],
    value_vars=[
        'Number of Pars (0)',
        'Number of Pars (>=1)',
        'Number of Pars (>=1 & <10)',
        'Number of Pars (>=10)',
    ],
    var_name='Number of Pars Category',
    value_name='Count',
)

# Combined '<weekend group>_<survey name>' key that becomes the pivot's column label.
melted_df = melted_df.assign(
    Weekend_SurvName=melted_df['Weekend'].astype(str) + '_' + melted_df['SurvName']
)

# Long -> wide again: count categories as rows, one column per weekend/survey key.
pivot_df = melted_df.pivot(
    index='Number of Pars Category',
    columns='Weekend_SurvName',
    values='Count',
)
pivot_df = pivot_df.reset_index()

display(pivot_df)


Weekend_SurvName,Number of Pars Category,"1,2,3,4,5,6_After Your 1st Drink","1,2,3,4,5,6_Daytime Surveys","1,2,3,4,5,6_Drinking Follow-Ups Long","1,2,3,4,5,6_Drinking Follow-Ups Short","1,2,3,4,5,6_Morning Reports","1,2_After Your 1st Drink","1,2_Daytime Surveys","1,2_Drinking Follow-Ups Long","1,2_Drinking Follow-Ups Short","1,2_Morning Reports",1_After Your 1st Drink,1_Daytime Surveys,1_Drinking Follow-Ups Long,1_Drinking Follow-Ups Short,1_Morning Reports,2_After Your 1st Drink,2_Daytime Surveys,2_Drinking Follow-Ups Long,2_Drinking Follow-Ups Short,2_Morning Reports,"3,4_After Your 1st Drink","3,4_Daytime Surveys","3,4_Drinking Follow-Ups Long","3,4_Drinking Follow-Ups Short","3,4_Morning Reports",3_After Your 1st Drink,3_Daytime Surveys,3_Drinking Follow-Ups Long,3_Drinking Follow-Ups Short,3_Morning Reports,4_After Your 1st Drink,4_Daytime Surveys,4_Drinking Follow-Ups Long,4_Drinking Follow-Ups Short,4_Morning Reports,"5,6_After Your 1st Drink","5,6_Daytime Surveys","5,6_Drinking Follow-Ups Long","5,6_Drinking Follow-Ups Short","5,6_Morning Reports",5_After Your 1st Drink,5_Daytime Surveys,5_Drinking Follow-Ups Long,5_Drinking Follow-Ups Short,5_Morning Reports,6_After Your 1st Drink,6_Daytime Surveys,6_Drinking Follow-Ups Long,6_Drinking Follow-Ups Short,6_Morning Reports
0,Number of Pars (0),3,0,6,4,1,6,4,13,10,4,10,4,22,16,5,51,16,71,68,20,71,60,88,84,61,88,65,110,103,64,103,65,124,127,70,88,73,95,94,72,103,75,117,115,75,113,81,135,127,79
1,Number of Pars (>=1 & <10),171,19,71,56,57,387,36,158,131,389,383,146,278,267,388,342,144,262,254,373,322,38,162,138,332,305,120,234,232,329,290,136,229,221,323,305,28,165,144,321,290,113,232,235,318,280,135,211,211,314
2,Number of Pars (>=1),390,393,387,389,392,387,389,380,383,389,383,389,371,377,388,342,377,322,325,373,322,333,305,309,332,305,328,283,290,329,290,328,269,266,323,305,320,298,299,321,290,318,276,278,318,280,312,258,266,314
3,Number of Pars (>=10),219,374,316,333,335,0,353,222,252,0,0,243,93,110,0,0,233,60,71,0,0,295,143,171,0,0,208,49,58,0,0,192,40,45,0,0,292,133,155,0,0,205,44,43,0,0,177,47,55,0


In [34]:
# Show only three survey types, ordered weekend group by weekend group. Column
# labels follow the '<weekend group>_<survey name>' pattern used by the pivot.
columns_to_keep = ['Number of Pars Category'] + [
    f'{weekend_label}_{survey_name}'
    for weekend_label in ('1', '2', '1,2', '3', '4', '3,4', '5', '6', '5,6', '1,2,3,4,5,6')
    for survey_name in ('Morning Reports', 'Daytime Surveys', 'After Your 1st Drink')
]
display(pivot_df[columns_to_keep])

Weekend_SurvName,Number of Pars Category,1_Morning Reports,1_Daytime Surveys,1_After Your 1st Drink,2_Morning Reports,2_Daytime Surveys,2_After Your 1st Drink,"1,2_Morning Reports","1,2_Daytime Surveys","1,2_After Your 1st Drink",3_Morning Reports,3_Daytime Surveys,3_After Your 1st Drink,4_Morning Reports,4_Daytime Surveys,4_After Your 1st Drink,"3,4_Morning Reports","3,4_Daytime Surveys","3,4_After Your 1st Drink",5_Morning Reports,5_Daytime Surveys,5_After Your 1st Drink,6_Morning Reports,6_Daytime Surveys,6_After Your 1st Drink,"5,6_Morning Reports","5,6_Daytime Surveys","5,6_After Your 1st Drink","1,2,3,4,5,6_Morning Reports","1,2,3,4,5,6_Daytime Surveys","1,2,3,4,5,6_After Your 1st Drink"
0,Number of Pars (0),5,4,10,20,16,51,4,4,6,64,65,88,70,65,103,61,60,71,75,75,103,79,81,113,72,73,88,1,0,3
1,Number of Pars (>=1 & <10),388,146,383,373,144,342,389,36,387,329,120,305,323,136,290,332,38,322,318,113,290,314,135,280,321,28,305,57,19,171
2,Number of Pars (>=1),388,389,383,373,377,342,389,389,387,329,328,305,323,328,290,332,333,322,318,318,290,314,312,280,321,320,305,392,393,390
3,Number of Pars (>=10),0,243,0,0,233,0,0,353,0,0,208,0,0,192,0,0,295,0,0,205,0,0,177,0,0,292,0,335,374,219
