# This is a template. 
Please make a copy of this notebook to generate your own pulse survey data source.

In [1]:
from sklearn.pipeline import Pipeline, FeatureUnion
from pathlib import Path

# Load in custom transformers

In [2]:
%run cleaning_transformers.ipynb

In [3]:
%run singleselect_counter_transformers.ipynb

In [4]:
%run multiselect_counter_transformers.ipynb

# Define variables

## Fall 2020 PS#1 (REPLACE with your own pulse survey)

In [5]:
# Configuration cell: every constant the pipeline cells below read is defined here.
# NOTE(review): `pd` is used below but is not imported in this notebook's import cell;
# presumably it is brought into the namespace by the `%run` cells above — confirm.
############# REPLACE  ###########
# folder paths; replace with your own
PATH_TO_RAW_DATA_FOLDER = Path('../pulse_survey_archived_data/pulse_survey_raw_data')
# NOTE(review): within this cell, PATH_TO_CONTENT_FOLDER is only referenced by the commented-out
# QUESTION_DESC alternative further down.
PATH_TO_CONTENT_FOLDER = Path('../pulse_survey_content')

# pulse survey num; replace when running a different pulse survey
# (SEMESTER and SURVEY_NUM are also used to name the exported CSV in the last cell)
SEMESTER = 'fa20'
SURVEY_NUM = 1
##################################

# raw data; change filename to f'{SEMESTER}_ps{SURVEY_NUM}_raw_data.csv'
RAW_SURVEY = pd.read_csv(PATH_TO_RAW_DATA_FOLDER/f'pulse_survey_{SURVEY_NUM}_raw_data.csv')

# data cleaning variables
# The *_COL names below are the raw-export column names; the cleaning pipeline renames each of them
# to a fixed standard name, so only these values need to change when the export uses different headers.
COLUMNS_TO_REMOVE = ['Duration', 'RecordedDate'] ## may need to add:'PHQ2SCORE', 'GAD2SCORE', 'PHQ2', 'GAD2'
UNGRAD_GRAD_COL = 'TYPE' ## may need to replace
RESIDENCY_COL = 'Derived Residency Desc' ## may need to replace
ENTRY_STATUS_COL = 'Entry Status Desc' ## may need to replace
ETH_LEVEL1_COL = 'Ucb Level1 Ethnic Rollup Desc' ## may need to replace
ETH_LEVEL2_COL = 'Ucb Level2 Ethnic Rollup Desc' ## may need to replace
# values passed to ReplaceValuesTransformer in the cleaning pipeline (both numeric and string forms are listed)
VALUES_TO_NULLIFY = [-99, '-99', -1, '-1', -999, '-999', 'Not selected'] ## may need to replace
############# OPTIONAL: use ONLY if Reporting College cols look like a stem id #############
# rename reporting college columns to avoid them getting treated as a question
RAW_SURVEY = RAW_SURVEY.rename(columns={'REPORTCOLLEGE1':'Reporting College - First Plan',
                                        'REPORTCOLLEGE2':'Reporting College - Second Plan',
                                        'REPORTCOLLEGE3':'Reporting College - Third Plan'})
############################################################################################
# every column whose name contains 'Reporting College' (computed after the optional rename above)
COLLEGE_COLS = RAW_SURVEY.columns[RAW_SURVEY.columns.str.contains('Reporting College')]
# columns combined into the 'Multiple Ethnicities' column by the cleaning pipeline
MULTI_ETH_COLS = ['African American / Black',
                  'Asian / Asian American',
                  'Hispanic / Latinx',
                  'International',
                  'American Indian / Alaska Native',
                  'Pacific Islander',
                  'Southwest Asian / North African',
                  'White / Caucasian',
                  'No Response']

# counting variables
# The row labelled 0 of the raw export is kept as a one-row dataframe of question descriptions and
# everything after the first row is treated as response data.
# NOTE(review): presumably the first row of the export holds the question text — confirm for new exports.
QUESTION_DESC = RAW_SURVEY.loc[[0]] #pd.read_csv(PATH_TO_CONTENT_FOLDER/f'ps{SURVEY_NUM}_question_desc.csv')
DATA = RAW_SURVEY[1:] #RAW_SURVEY.copy()
# Order matters: the multi-select pipeline slices this list as [:-2] and [-2:], so the last two
# entries ('Reporting College', 'Multiple Ethnicities') are handled separately from the rest.
DEMOGRAPHIC_COLUMNS = ['Undergrad Grad',
                       'Derived Residency Desc',
                       'Entry Status Desc',
                       'Ucb Level1 Ethnic Rollup Desc',
                       'Ucb Level2 Ethnic Rollup Desc',
                       'Low-income Status',
                       'First Gen College',
                       'Person Gender Desc',
                       'Reporting College',
                       'Multiple Ethnicities']
SINGLE_SELECT_STEM_IDS = [] ## column names, add all single-select IDs
MULTI_SELECT_STEM_IDS = ['COVID_INFO'] ## may need to change

# Construct Pipeline

## Data cleaning

In [6]:
# Cleaning stage of the full pipeline. Steps run in the order listed, so the value-relabelling and
# column-generating steps below operate on the already-renamed standard column names.
cleaning_pipeline = Pipeline([
    # drop null responses, remove duplicates and columns, make all missing/irrelevant values nan
    ('null rows remover', RemoveNullRowsTransformer()),
    ('values nullifier', ReplaceValuesTransformer(values_to_replace=VALUES_TO_NULLIFY)),
    ('duplicates remover', RemoveFirstDuplicateTransformer()),
    ('irrelevant columns remover', RemoveColumnsTransformer(columns_to_remove=COLUMNS_TO_REMOVE)),
    # rename column names: map the raw-export column names from the config cell onto the standard
    # names used by DEMOGRAPHIC_COLUMNS (a rename to the same name when the config value already matches)
    ('undergrad grad col renamer', RenameColumnTransformer(UNGRAD_GRAD_COL, 'Undergrad Grad')),
    ('residency col renamer', RenameColumnTransformer(RESIDENCY_COL, 'Derived Residency Desc')),
    ('entry status col renamer', RenameColumnTransformer(ENTRY_STATUS_COL, 'Entry Status Desc')),
    ('ethnic lvl1 col renamer', RenameColumnTransformer(ETH_LEVEL1_COL, 'Ucb Level1 Ethnic Rollup Desc')),
    ('ethnic lvl2 col renamer', RenameColumnTransformer(ETH_LEVEL2_COL, 'Ucb Level2 Ethnic Rollup Desc')),
    # rename dataframe values
    # NOTE(review): which existing values get mapped to 'U', 'G' and 'First-year' is decided inside
    # RelabelColumnTransformer (defined in cleaning_transformers.ipynb) — verify there.
    ('undergrad value renamer', RelabelColumnTransformer(column_to_relabel='Undergrad Grad', new_label='U')),
    ('grad value renamer', RelabelColumnTransformer(column_to_relabel='Undergrad Grad', new_label='G')),
    ('first-year entry value renamer', RelabelColumnTransformer(column_to_relabel='Entry Status Desc', new_label='First-year')),
    # replace ADVANCED STANDING with NaN for all grad students
    ('advanced standing grad nullifier', ReplaceStringWithNaNTransformer(standing_col='Entry Status Desc')),
    # create columns for double counting demographics & mental health scores
    ('reporting clg col generator', UniqueStringListTransformer(columns_to_list=COLLEGE_COLS, unique_col_list='Reporting College')),
    ('multiple eth col generator', UniqueStringListTransformer(columns_to_list=MULTI_ETH_COLS, unique_col_list='Multiple Ethnicities')),
    # NOTE(review): presumably PHQ2/GAD2 are the combined MHLTH item scores and DEPRESSION/ANXIETY are
    # derived yes/no flags — confirm against AddColumnsTransformer.
    ('depression col generator', AddColumnsTransformer(column_1='MHLTH1', column_2='MHLTH2', new_column='PHQ2', binary_column='DEPRESSION')),
    ('anxiety col generator', AddColumnsTransformer(column_1='MHLTH3', column_2='MHLTH4', new_column='GAD2', binary_column='ANXIETY'))
])

## Single-select counts

In [7]:
def build_counts_transformer(demographic, question_cols, question_desc, is_first_df, multiselect_stem_ids=None):
    """
    Helper that builds the counts transformer for one demographic category.

    The returned Pipeline 1) creates a counts dataframe (QuestionCounter) and
    2) adds question texts (IdDescColumnsAdder).

    Parameters
    ----------
    demographic : str
        Demographic column name. It is passed to QuestionCounter together with
        'Undergrad Grad', and its lower-cased form names the first pipeline step.
    question_cols : list
        Question columns forwarded to QuestionCounter.
    question_desc : object
        Question descriptions forwarded to IdDescColumnsAdder.
    is_first_df : bool
        Forwarded to IdDescColumnsAdder as `use_feature_union`.
    multiselect_stem_ids : list, optional
        Multi-select stem ids forwarded to QuestionCounter. Defaults to a new
        empty list on every call.

    Returns
    -------
    Pipeline
        Two-step pipeline: the counter followed by the description adder.
    """
    # A literal `[]` default would be one list object shared by every call (and by every
    # QuestionCounter that keeps a reference to it), so create a fresh list per call instead.
    if multiselect_stem_ids is None:
        multiselect_stem_ids = []
    counts_transformer = Pipeline(steps=[
                             ('{} counter'.format(demographic.lower()), 
                                  QuestionCounter([demographic, 'Undergrad Grad'], question_cols, multiselect_stem_ids)),
                             ('description columns adder', 
                                  IdDescColumnsAdder(question_desc, use_feature_union=is_first_df))
                         ])
    return counts_transformer

def _single_demographic_branch(position, demographic):
    """
    Build the (name, Pipeline) FeatureUnion entry for one demographic category.

    `position` is the category's index in DEMOGRAPHIC_COLUMNS; only the entry at
    position 0 is built with its `use_feature_union` / `is_first_df` flags set to True.
    """
    is_first = position == 0
    # make double count dataframe (will only evoke changes for reporting college)
    double_counter = DoubleCountDataframeTransformer(demographic)
    # complete data source for a single demographic category: counts side by side with totals
    counts = build_counts_transformer(demographic, SINGLE_SELECT_STEM_IDS, QUESTION_DESC, is_first, MULTI_SELECT_STEM_IDS)
    totals = TotalsCounter([demographic, 'Undergrad Grad'], SINGLE_SELECT_STEM_IDS, MULTI_SELECT_STEM_IDS, use_feature_union=is_first)
    counts_and_totals = FeatureUnion([
        ('counts transformer', counts),
        ('totals counter', totals)
    ])
    # transpose single demog category data source for FeatureUnion
    transposer = DataframeTransposer()
    branch = Pipeline(steps=[
        ('double count dataframe transformer', double_counter),
        ('counts and totals counter', counts_and_totals),
        ('dataframe transposer', transposer)
    ])
    return ('{} transformer'.format(demographic.lower()), branch)

# Data source (counts and totals) for every demographic category, concatenated by FeatureUnion.
# Note that this is transposed (ie. very wide instead of very long); output is numpy arrays
single_select_pipeline = FeatureUnion([
    _single_demographic_branch(position, demographic)
    for position, demographic in enumerate(DEMOGRAPHIC_COLUMNS)
])

## Multi-select counts

In [8]:
def combine_pipelines(stem_ids):
    """
    Build one multi-select counting pipeline per stem id.

    Returns a list of tuples:
    - first element is a string describing which stem id the pipeline is for
    - second element is the Pipeline for the corresponding stem id

    Reads the module-level QUESTION_DESC and DEMOGRAPHIC_COLUMNS. The last two entries of
    DEMOGRAPHIC_COLUMNS are passed to CountTotal/DemoTotals separately from the others.
    """
    def _pipeline_for(stem_id):
        # counts and demographic totals, merged on the shared identifying columns
        counts = CountTotal(stem_id, QUESTION_DESC, DEMOGRAPHIC_COLUMNS[:-2], DEMOGRAPHIC_COLUMNS[-2:])
        totals = DemoTotals(stem_id, DEMOGRAPHIC_COLUMNS[:-2], DEMOGRAPHIC_COLUMNS[-2:])
        merge_keys = ["Demographic Value", "Demographic Category", 'Undergrad Grad', 'Question Item Id']
        merged = MergeDataFrames(counts, totals, merge_keys, 'inner')
        data_source_builder = Pipeline([
            ('make counts and demographic totals dataframe', merged)
        ])
        return Pipeline(steps=[
            ('make data source dataframe', data_source_builder),
            ('reorder columns', ColumnsReindexer(use_feature_union=False)),
            ('dataframe transposer', DataframeTransposer())
        ])

    return [('{} counter'.format(stem_id), _pipeline_for(stem_id)) for stem_id in stem_ids]

In [9]:
# Multi-select stage: one FeatureUnion branch per stem id in MULTI_SELECT_STEM_IDS,
# wrapped in a single-step Pipeline.
_multi_select_union = FeatureUnion(combine_pipelines(MULTI_SELECT_STEM_IDS))
select_all_pipeline = Pipeline(steps=[('select all transformer', _multi_select_union)])

# Full Pipeline

In [10]:
# End-to-end pipeline: clean the raw responses, run the single-select and multi-select counting
# pipelines side by side, then transpose, convert the array to a dataframe and drop low counts.
_data_source_steps = [
    ('data cleaning pipeline', cleaning_pipeline),
    ('data source creator', FeatureUnion([
        ('single-select counting transformer', single_select_pipeline),
        ('multi-select counting transformer', select_all_pipeline)
    ])),
    ('dataframe transposer', DataframeTransposer()),
    ('array to dataframe transformer', ArrToDataframeTransformer()),
    ('drop low counts', DropLowCounts('Demographic Value Total', 'Demographic Value Total, by Undergrad Grad', 'Count'))
]
data_source_pipeline = Pipeline(steps=_data_source_steps)

# Create Data Source

## PS#1 (REPLACE with your own pulse survey)

In [11]:
%%time
# Fit and apply the full pipeline (cleaning, counting, low-count filtering) to the responses in DATA.
ps1_data_source = data_source_pipeline.fit_transform(DATA)

CPU times: user 9.8 s, sys: 275 ms, total: 10.1 s
Wall time: 10.1 s


In [12]:
# Display the generated data source for a visual check before it is exported.
ps1_data_source

Unnamed: 0,Question Stem Id,Question Item Id,Demographic Category,Demographic Value,Undergrad Grad,Question Response,Count,Question Item,Question Stem,Demographic Value Total,"Demographic Value Total, by Undergrad Grad",Question Stem Total,Question Item Total
0,INSTRUCT_HELP_1,INSTRUCT_HELP_1,Undergrad Grad,G,G,Up-to-date laptop,1201,Up-to-date laptop,Which of the following would improve your experience with remote instruction as a student,1201,1201,4161,4161
1,INSTRUCT_HELP_1,INSTRUCT_HELP_1,Undergrad Grad,U,U,Up-to-date laptop,2960,Up-to-date laptop,Which of the following would improve your experience with remote instruction as a student,2960,2960,4161,4161
2,INSTRUCT_HELP_2,INSTRUCT_HELP_2,Undergrad Grad,G,G,Wi-Fi hotspot,1401,Wi-Fi hotspot,Which of the following would improve your experience with remote instruction as a student,1401,1401,5556,5556
3,INSTRUCT_HELP_2,INSTRUCT_HELP_2,Undergrad Grad,U,U,Wi-Fi hotspot,4155,Wi-Fi hotspot,Which of the following would improve your experience with remote instruction as a student,4155,4155,5556,5556
4,INSTRUCT_HELP_3,INSTRUCT_HELP_3,Undergrad Grad,G,G,Webcamera,672,Webcamera,Which of the following would improve your experience with remote instruction as a student,672,672,2078,2078
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9986,COVID_INFO,COVID_INFO_8,Person Gender Desc,Nonbinary,U,None of the above,2,None of the above,Which of the following do you find helpful to get UC Berkeley information about COVID-19 updates and changes (Check all that apply.),56,56,14996,691
9987,COVID_INFO,COVID_INFO_8,Person Gender Desc,Transgender Man/Trans Man,U,None of the above,-1,None of the above,Which of the following do you find helpful to get UC Berkeley information about COVID-19 updates and changes (Check all that apply.),-1,-1,14996,691
9988,COVID_INFO,COVID_INFO_8,Person Gender Desc,Transgender Woman/Trans Woman,U,None of the above,-1,None of the above,Which of the following do you find helpful to get UC Berkeley information about COVID-19 updates and changes (Check all that apply.),-1,-1,14996,691
9989,COVID_INFO,COVID_INFO_8,Person Gender Desc,Woman,G,None of the above,8,None of the above,Which of the following do you find helpful to get UC Berkeley information about COVID-19 updates and changes (Check all that apply.),6077,241,14996,691


In [13]:
# Export the data source as pulse_survey_data_source/<SEMESTER>_ps<SURVEY_NUM>_data_source.csv.
# The output folder is created first (no-op when it already exists) because to_csv does not
# create missing directories and would otherwise fail on a fresh checkout.
output_folder = Path('pulse_survey_data_source')
output_folder.mkdir(parents=True, exist_ok=True)
ps1_data_source.to_csv(output_folder/f'{SEMESTER}_ps{SURVEY_NUM}_data_source.csv', index=False, quotechar='"')