In [182]:
from zipfile import ZipFile
import io
import os
import glob
from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import statistics as stats

###### Loneliness

In [45]:
path = '/Users/kristiancanler/Documents/Data/dartmouth_dataset/dataset/survey/LonelinessScale.csv'
loneliness_data = pd.read_csv(path)

# Printing unique values for survey questions
pd.unique(loneliness_data.iloc[:, 2:].values.ravel('K'))

array(['Sometimes', 'Rarely', 'Often', 'Never'], dtype=object)

In [46]:
# Creating copy of dataframe so rerunning this cell doesnt throw errors
loneliness = loneliness_data.copy()

# Lowering all strings
for col in loneliness.iloc[:, 2:]:
    loneliness[col] = loneliness[col].str.lower()

# Translating string responses to ordinal codes
loneliness[loneliness == 'never'] = 0
loneliness[loneliness == 'rarely'] = 1
loneliness[loneliness == 'sometimes'] = 2
loneliness[loneliness == 'often'] = 3

# Naming values to be flipped and pairing with reversed questions
flip_vars = {'1. I feel in tune with the people around me':
                 '1. I do not feel in tune with the people around me',
             '4. I do not feel alone':
                 '4. I feel alone',
             '5. I feel part of a group of friends':
                 '5. I do not feel part of a group of friends',
             '6. I have a lot in common with the people around me':
                 '6. I do not have a lot in common with the people around me',
             '9. I am an outgoing person':
                 '9. I am not an outgoing person',
             '10. There are people I feel close to':
                 '10. There are not any people I feel close to',
             '15. I can find companionship when I want it':
                 '15. I cannot find companionship when I want it',
             '16. There are people who really understand me':
                 '16. There are not people who really understand me',
             '19. There are people I can talk to':
                 '19. There are not any people I can talk to',
             '20. There are people I can turn to':
                 '20. There are not any people I can turn to'}

codebook = {
    'loneliness': {0: 'never',
                   1: 'rarely',
                   2: 'sometimes',
                   3: 'often',
                   4: 'always'}
}

# Flipping values
for var in flip_vars.keys():
    # Never to Always
    loneliness.loc[loneliness[var]==0, var] = 4
    # Rarely to Often
    loneliness.loc[loneliness[var]==1, var] = 3
    # Sometimes stays Sometimes
    # Often to Rarely
    loneliness.loc[loneliness[var]==3, var] = 1
    
# Creating feature which is average or sum of all answers
loneliness['loneliness_avg'] = loneliness.iloc[:, 2:].mean(axis=1)
loneliness['loneliness_sum'] = loneliness.iloc[:, 2:].sum(axis=1)

In [209]:
all_uid = pd.Series()
for df in [loneliness, stress, depression,
            flourishing, panas]:
    all_uid = all_uid.append(df['uid'])
    
all_uid = list(all_uid.unique())

def diff(df):
    return [uid for uid in all_uid if uid not in df.uid.to_list()]

missing = {
    'loneliness': diff(loneliness),
    'stress': diff(stress),
    'depression': diff(depression),
    'flourishing': diff(flourishing),
    'panas' : diff(panas)
}
missing
print(all_uid)

['u00', 'u01', 'u02', 'u03', 'u04', 'u05', 'u07', 'u08', 'u09', 'u10', 'u12', 'u13', 'u14', 'u15', 'u16', 'u17', 'u18', 'u19', 'u20', 'u22', 'u23', 'u24', 'u27', 'u30', 'u31', 'u32', 'u33', 'u34', 'u35', 'u36', 'u39', 'u42', 'u43', 'u44', 'u45', 'u46', 'u47', 'u49', 'u50', 'u51', 'u52', 'u53', 'u56', 'u57', 'u58', 'u59', 'u54']


###### Stress

In [47]:
path = '/Users/kristiancanler/Documents/Data/dartmouth_dataset/dataset/survey/PerceivedStressScale.csv'
stress_data = pd.read_csv(path)

# Printing unique values for survey questions
pd.unique(stress_data.iloc[:, 2:].values.ravel('K'))

array(['Sometime', 'Fairly often', 'Almost never', 'Very often', 'Never',
       nan], dtype=object)

In [49]:
# Creating copy of dataframe so rerunning this cell doesnt throw errors
stress = stress_data.copy()

# Lowering all strings
for col in stress.iloc[:, 2:]:
    stress[col] = stress[col].str.lower()

# Adding entry to codebook
codebook['stress'] = {
    0: 'never',
    1: 'almost never',
    2: 'sometime',
    3: 'fairly often',
    4: 'very often'
}

# Translating the answers to ordinal values
stress[stress == 'never'] = 0
stress[stress == 'almost never'] = 1
stress[stress == 'sometime'] = 2
stress[stress == 'fairly often'] = 3
stress[stress == 'very often'] = 4

flip_vars = {'4. In the last month, how often have you felt confident about your ability to handle your personal problems?':
                 '4. In the last month, how often have you not felt confident about your ability to handle your personal problems?',
             '5. In the last month, how often have you felt that things were going your way?':
                 '5. In the last month, how often have you felt that things were not going your way?',
             '7. In the last month, how often have you been able to control irritations in your life?':
                 '7. In the last month, how often have you not been able to control irritations in your life?',
             '8. In the last month, how often have you felt that you were on top of things?':
                 '8. In the last month, how often have you felt that you were not on top of things?'
            }

# Renaming flipped features. This needs to happen before flipping the values
stress.rename(flip_vars, axis=1, inplace=True)

# Flipping values for flip features
for var in list(flip_vars.values()):
    stress.loc[stress[var]==0, var] = 4
    stress.loc[stress[var]==1, var] = 3
    stress.loc[stress[var]==3, var] = 1
    stress.loc[stress[var]==4, var] = 0
    
# Creating feature which is average or sum of all answers
stress['stress_avg'] = stress.iloc[:, 2:].mean(axis=1)
stress['stress_sum'] = stress.iloc[:, 2:].sum(axis=1)

###### Depression

In [55]:
path = '/Users/kristiancanler/Documents/Data/dartmouth_dataset/dataset/survey/PHQ-9.csv'
depression_data = pd.read_csv(path)

# Printing unique values for survey questions
pd.unique(depression_data.iloc[:, 2:].values.ravel('K'))

array(['Not at all', 'Several days', 'More than half the days',
       'Nearly every day', 'Not difficult at all', 'Very difficult',
       'Somewhat difficult', nan, 'Extremely difficult'], dtype=object)

In [56]:
# Creating copy of dataframe so rerunning this cell doesnt throw errors
depression = depression_data.copy()

# Lowering all strings
for col in depression.iloc[:, 2:]:
    depression[col] = depression[col].str.lower()
    
# Adding entry to codebook
codebook['depression'] = {
    0: 'not at all / not difficult at all',
    1: 'several days / somewhat difficult',
    2: 'more than half the days / very difficult',
    3: 'nearly every day / extremely difficult'
}

# Creating numeric equivalents for ordinal variables
depression[depression == 'not at all'] = 0
depression[depression == 'several days'] = 1
depression[depression == 'more than half the days'] = 2
depression[depression == 'nearly every day'] = 3
depression[depression == 'not difficult at all'] = 0
depression[depression == 'somewhat difficult'] = 1
depression[depression == 'very difficult'] = 2
depression[depression == 'extremely difficult'] = 3

# This question is so long it makes the visualization hard to format. I'll just add
# a (+) as a reminder the original question is longer
depression.rename({'Moving or speaking so slowly that other people could have noticed. Or the opposite being so figety or restless that you have been moving around a lot more than usual':
                   'Moving or speaking so slowly that other people could have noticed. (+)'},
             axis=1,
             inplace=True)

# Creating mean and sum features
depression['depression_avg'] = depression.iloc[:, 2:].mean(axis=1)
depression['depression_sum'] = depression.iloc[:, 2:].sum(axis=1)

###### Flourishing

In [57]:
# More info on flourishing scale: https://ggsc.berkeley.edu/images/uploads/The_Flourishing_Scale.pdf

path = '/Users/kristiancanler/Documents/data/dartmouth_dataset/dataset/survey/FlourishingScale.csv'
flourishing = pd.read_csv(path)

# Creating mean and sum features
flourishing['flourishing_avg'] = flourishing.iloc[:, 2:].mean(axis=1)
flourishing['flourishing_sum'] = flourishing.iloc[:, 2:].sum(axis=1)

###### PANAS

In [81]:
# More info on panas scale: https://ogg.osu.edu/media/documents/MB%20Stream/PANAS.pdf
path = '/Users/kristiancanler/Documents/data/dartmouth_dataset/dataset/survey/panas.csv'
panas = pd.read_csv(path)

# Lowering column names
panas.columns = panas.columns.str.lower()

# Creating mean and sum features for positive and negative affect
pos_affect = ['interested', 'strong', 'enthusiastic', 'proud', 'alert', 'inspired',
              'determined', 'attentive', 'active']
panas['panas_pos_avg'] = panas.loc[:, pos_affect].mean(axis=1)
panas['panas_pos_sum'] = panas.loc[:, pos_affect].sum(axis=1)

neg_affect = ['distressed', 'upset', 'guilty', 'scared', 'hostile',
              'irritable', 'nervous', 'jittery', 'afraid']
panas['panas_neg_avg'] = panas.loc[:, neg_affect].mean(axis=1)
panas['panas_neg_sum'] = panas.loc[:, neg_affect].sum(axis=1)

panas['panas_net_avg'] = panas.panas_pos_avg - panas.panas_neg_avg
panas['panas_net_sum'] = panas.panas_pos_sum - panas.panas_neg_sum

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


# Generating Single Dataframe

# Generating Single Student-wise Dataframe

In [167]:
pivot = pd.pivot_table(
    panas[['uid', 'type',
          'panas_pos_avg', 'panas_pos_sum',
          'panas_neg_avg', 'panas_neg_sum',
          'panas_net_avg', 'panas_net_sum']],
   index='uid',
   columns='type'
)

In [212]:
def merge_dfs(type_):

        return panas.loc[
                panas.type==type_,
                ['uid',
                 'panas_pos_avg', 'panas_pos_sum',
                 'panas_neg_avg', 'panas_neg_sum',
                 'panas_net_avg', 'panas_net_sum']
        ].merge(
            flourishing.loc[
                flourishing.type==type_,
                ['uid',
                 'flourishing_avg', 'flourishing_sum']
            ],
            on='uid',
            how='outer'
        ).merge(
            depression.loc[
                depression.type==type_,
                ['uid',
                 'depression_avg', 'depression_sum']
            ],
            on='uid',
            how='outer'
        ).merge(
            stress.loc[
                stress.type==type_,
                ['uid',
                 'stress_avg', 'stress_sum']
            ],
            on='uid',
            how='outer'
        ).merge(
            loneliness.loc[
                loneliness.type==type_,
                ['uid',
                 'loneliness_avg', 'loneliness_sum']
            ],
            on='uid',
            how='outer'
        )

pre_survey_uid = merge_dfs('pre')
post_survey_uid = merge_dfs('post')
for df in [pre_survey_uid, post_survey_uid]:
    df.uid = df.uid.str.split('u').str.get(1).astype(int)

In [170]:
# Adding dfs with id indexed out
avg_survey = pre.iloc[:, 1:].add(post.iloc[:, 1:], fill_value=0)

# Producing average of the dfs by then dividing the df
# elementwise with .applymap
avg_survey = avg_survey.applymap(lambda x: x / 2)

# Adding id back in an returning it to beginning of df
avg_survey['uid'] = pre.uid.copy()
avg_survey = avg[avg.columns.to_list()[-1:] + avg.columns.to_list()[:-1]]

In [172]:
path = '/Users/kristiancanler/Documents/Data/dartmouth_dataset/dataset/my_dfs/'

dfs = {'pre_survey.csv': pre_survey,
       'post_survey.csv': post_survey,
       'avg_survey.csv': avg_survey}

for key in dfs:
    output_file = os.path.join(path, key)
    dfs[key].to_csv(output_file, index=False, header=True)