In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import Pipeline, FeatureUnion

### QuestionCounter

In [2]:
class QuestionCounter(BaseEstimator, TransformerMixin):
    """
    A custom Pipeline transformer class used to generate counts of a set of questions by a single given demographic.

    ...

    Attributes
    ----------
    demographics : list of str
        a list of demographic column names to break down the counts by
    question_columns : list of str or pandas.Index of str, default=None
        a list of column names representing the questions we want counts of.
        None or an empty list means the question columns are detected from the dataframe passed to fit().
        Always stored as a pandas.Index.
    multiselect_stem_ids : list of str, default=None
        a list of question stem IDs of multi-select questions. None is treated as an empty list.
    stem_id_dict : dict
        a dictionary with question stem IDs as keys and their corresponding list of question item IDs as values.
        Empty until fit() is called.
    use_feature_union : bool, default=False
        whether or not the result is input to FeatureUnion. Stores column names into 1st row of dataframe if True.

    Methods
    -------
    fit(self, X, y=None)
        Builds stem_id_dict (and, when none were given, question_columns) from the columns of X
    transform(self, X, y=None)
        Returns a dataframe with counts for each demographic value, for each single-select question item
    get_question_stem_ids(self)
        Returns a list of question stem IDs
    """
    # Column names made only of capitals, digits, "." and "_" that do not end in "_TEXT" (open-text columns).
    _QUESTION_COLUMN_PATTERN = r'^(?!.*_TEXT$)[A-Z\.0-9_]+$'
    # Trailing "_<digits>" suffix that separates a question item ID from its question stem ID.
    _ITEM_SUFFIX_PATTERN = r'_\d+(?!\.)$'

    def __init__(self, demographics, question_columns=None, multiselect_stem_ids=None, use_feature_union=False):
        self.demographics = demographics
        # None replaces the former mutable [] defaults, which were shared between every instance.
        # NOTE(review): scikit-learn's estimator conventions recommend storing constructor arguments
        # verbatim; the pandas.Index conversion is kept so the public attribute keeps its type.
        self.question_columns = pd.Index([] if question_columns is None else question_columns)
        self.multiselect_stem_ids = [] if multiselect_stem_ids is None else multiselect_stem_ids
        self.stem_id_dict = dict()
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        """Builds the stem_id_dict lookup from the columns of the survey dataframe X.

        Parameters
        ----------
        X : pandas DataFrame of survey responses (one column per question item)
        y : ignored

        Returns
        -------
        self
        """
        self.stem_id_dict = self._build_stem_id_dict(X, self.multiselect_stem_ids)
        return self

    def _build_stem_id_dict(self, survey_df, multiselect_stems):
        """Populates question_columns; Creates the stem_id_dict dictionary with keys as question stem ids, values as corresponding question item ids
        """
        if len(self.question_columns) == 0:
            # excludes open-text columns
            is_question = survey_df.columns.str.contains(self._QUESTION_COLUMN_PATTERN, regex=True)
            self.question_columns = survey_df.columns[is_question].str.strip()

        item_ids = self.question_columns.tolist()
        if not item_ids:
            return {}

        # stem ID = item ID with its trailing "_<digits>" suffix removed (unchanged when there is no suffix)
        stem_ids = (
            pd.Series(item_ids, dtype=object)
            .str.split(self._ITEM_SUFFIX_PATTERN, regex=True)
            .str[0]
            .tolist()
        )

        stem_id_dict = {}
        for item_id, stem_id in zip(item_ids, stem_ids):
            if stem_id in stem_id_dict:
                # stem already registered (eg. 2nd, 3rd, ... item of a multi-select question)
                continue
            if stem_id in multiselect_stems:
                # Literal prefix match: a regex/substring match would also pick up the items of
                # other stems that merely contain this one (eg. "Q1" inside "Q10_1").
                item_prefix = stem_id + '_'
                stem_id_dict[stem_id] = [
                    candidate for candidate in item_ids
                    if candidate == stem_id or candidate.startswith(item_prefix)
                ]
            else:
                # single-select question item = question stem
                stem_id_dict[item_id] = [item_id]
        return stem_id_dict

    def transform(self, X, y=None):
        """Counts the responses to every single-select question, broken down by the demographics.

        Parameters
        ----------
        X : pandas DataFrame of survey responses containing the demographic and question columns
        y : ignored

        Returns
        -------
        Pandas DataFrame
            One row per demographic value(s) / question response combination, with the "Question Stem Id",
            "Question Item Id", "Demographic Category", "Demographic Value", "Question Response" and
            "Count" columns. If use_feature_union is True the column names are stored in the 1st row instead.
        """
        single_select_stem_ids = [
            stem_id for stem_id in self.get_question_stem_ids() if len(self.stem_id_dict[stem_id]) == 1
        ]
        counts_data_source = self._count_all_questions(X, self.demographics, single_select_stem_ids)
        if self.use_feature_union:  # store column names in 1st row of dataframe
            counts_data_source = pd.DataFrame([counts_data_source.columns] + counts_data_source.values.tolist())
        return counts_data_source

    @staticmethod
    def _add_id_columns(count_df, item_id, stem_id=None):
        """Adds question item and question stem columns to dataframe
        """
        if stem_id is None:
            stem_id = item_id
        count_df.insert(0, 'Question Item Id', item_id)
        count_df.insert(0, 'Question Stem Id', stem_id)
        return count_df

    @staticmethod
    def _count_responses(survey_df, demographics, stem_id):
        """Counts number of responses for a given question, broken down by given demographic(s)
        """
        # only for non-select-all-that-apply questions
        item_id = stem_id
        count_df = survey_df.groupby(list(demographics) + [item_id]).size().to_frame('Count')
        # include rows where there are 0 responses; for uniformity purposes w/ demog total count
        count_df = count_df.unstack(fill_value=0).stack().reset_index()
        count_df = count_df.rename(columns={item_id: 'Question Response'})
        if count_df['Question Response'].dtype == 'object':
            count_df['Question Response'] = count_df['Question Response'].str.strip()
        return count_df

    def _count_all_questions(self, survey_df, demographics, question_stem_ids):
        """Creates a dataframe with number of responses, by demographics, for each single-select question (stem)
        """
        # Duplicates happen when the demographic category is the same as the secondary/additional
        # demographic breakdown; keep the first occurrence of each name, in order.
        unique_demographics = list(dict.fromkeys(demographics))
        has_duplicate_demographics = len(unique_demographics) != len(demographics)

        per_question_counts = []
        for stem_id in question_stem_ids:
            count_df = self._count_responses(survey_df, unique_demographics, stem_id)
            ###########
            # TO-DO: make it compatible to barriers to discovery
            demographic_category = unique_demographics[0]
            count_df.insert(0, 'Demographic Category', demographic_category)
            count_df = count_df.rename(columns={demographic_category: 'Demographic Value'})
            if has_duplicate_demographics:
                count_df.insert(2, demographic_category, count_df['Demographic Value'].copy())
            ###########
            count_df = self._add_id_columns(count_df, item_id=stem_id, stem_id=stem_id)
            per_question_counts.append(count_df)

        if not per_question_counts:
            return pd.DataFrame()
        # a single concat instead of re-copying the accumulated frame on every iteration
        return pd.concat(per_question_counts, ignore_index=True)

    def get_question_stem_ids(self):
        """Returns a list of question stem IDs
        """
        return list(self.stem_id_dict)

### IdDescColumnsAdder

In [3]:
class IdDescColumnsAdder(BaseEstimator, TransformerMixin):
    """
    A custom Pipeline transformer class used to add to an existing dataframe question stem and item descriptions (ie. "Question Stem" and "Question Item" columns).

    ...

    Attributes
    ----------
    question_descriptions : DataFrame
        a single-row dataframe where column names are question (item) IDs and cell values are the question text
        NOTE: Columns should match columns of pulse survey data file.
              For demographic category columns, cell values should reflect the Demographic Category names used in final data source.
              Other columns can just match column names.
    use_feature_union : bool, default=True
        whether or not the result is input to FeatureUnion. Stores column names into 1st row of dataframe if True.

    Methods
    -------
    fit(self, X, y=None)
    transform(self, X, y=None)
        Returns a copy of the dataframe that was passed into it (the "X" variable), with 2 columns ("Question Stem" & "Question Item") added to it
    """
    # Separator between the question stem and the question item inside a question description:
    # a "?" followed by either a "select/check all that apply" marker or a plain " - ".
    _STEM_ITEM_SEPARATOR = r'\?(?:\s?(?:Select all that apply\. - Selected Choice -|(?:Check|Select) all that apply\. -)\s?|\s?-\s?)'

    def __init__(self, question_descriptions, use_feature_union=True):
        self.question_descriptions = pd.DataFrame(question_descriptions)
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        """Does nothing; present for Pipeline compatibility.
        """
        return self

    def _split_description(self, item_id):
        """Returns the (stem, item) texts of the description of the given question item.

        Falls back to (item_id, '') when there is no usable description for item_id, and to
        (description, '') when the description has no stem/item separator.
        """
        try:
            # n=1: split on the first separator only, so a description holding several separators
            # still yields a stem and an item. iloc[0]: the descriptions live in the first row,
            # whatever its index label is.
            parts = (
                self.question_descriptions[item_id]
                .str.split(self._STEM_ITEM_SEPARATOR, n=1, regex=True)
                .iloc[0]
            )
        except (KeyError, IndexError):
            # no description column for this item (or no description row at all)
            return item_id, ''
        if not pd.api.types.is_list_like(parts):
            # missing (NaN) description
            return item_id, ''
        if len(parts) < 2:
            # question only has a stem
            return parts[0], ''
        return parts[0], parts[1]

    def transform(self, X, y=None):
        """Adds the "Question Item" and "Question Stem" description columns.

        Parameters
        ----------
        X : pandas DataFrame with "Question Item Id" and "Question Stem Id" columns
        y : ignored

        Returns
        -------
        Pandas DataFrame
            A copy of X with the 2 description columns added. If use_feature_union is True the column
            names are stored in the 1st row instead.
        """
        # work on a copy so the caller's dataframe is not modified as a side effect
        X = X.copy()
        for item_id in X['Question Item Id'].unique():
            is_item = X['Question Item Id'] == item_id
            stem_id = X.loc[is_item, 'Question Stem Id'].unique()[0]
            stem, item = self._split_description(item_id)

            X.loc[is_item, 'Question Item'] = item.strip()
            X.loc[X['Question Stem Id'] == stem_id, 'Question Stem'] = stem.strip()
        if self.use_feature_union:  # store column names in 1st row of dataframe
            X = pd.DataFrame([X.columns] + X.values.tolist())
        return X

### TotalsCounter

In [4]:
class TotalsCounter(BaseEstimator, TransformerMixin):
    """
    A custom Pipeline transformer class used to generate the total response counts (per question item, per question stem
    and per demographic value) of a set of questions.

    ...

    Attributes
    ----------
    demographics : list of str
        a list of demographic column names to break down the counts by
    question_columns : list of str or pandas.Index of str, default=None
        a list of column names representing the questions we want counts of.
        None or an empty list means the question columns are detected from the dataframe passed to fit().
        Always stored as a pandas.Index.
    multiselect_stem_ids : list of str, default=None
        a list of question stem IDs of multi-select questions. None is treated as an empty list.
    stem_id_dict : dict
        a dictionary with question stem IDs as keys and their corresponding list of question item IDs as values.
        Empty until fit() is called.
    use_feature_union : bool, default=True
        whether or not the result is input to FeatureUnion. Stores column names into 1st row of dataframe if True.

    Methods
    -------
    fit(self, X, y=None)
        Builds stem_id_dict (and, when none were given, question_columns) from the columns of X
    transform(self, X, y=None)
        Returns a dataframe with columns showing each question item total count, question stem total count, each demographic value's total
        count (per question item), and demographic value total by a demographic breakdown (eg. demographic value total, by undergrad/grad)
        NOTE: Row ordering should be the same as dataframes generated by QuestionCounter
    get_question_stem_ids(self)
        Returns a list of question stem IDs
    """
    # Column names made only of capitals, digits, "." and "_" that do not end in "_TEXT" (open-text columns).
    _QUESTION_COLUMN_PATTERN = r'^(?!.*_TEXT$)[A-Z\.0-9_]+$'
    # Trailing "_<digits>" suffix that separates a question item ID from its question stem ID.
    _ITEM_SUFFIX_PATTERN = r'_\d+(?!\.)$'

    def __init__(self, demographics, question_columns=None, multiselect_stem_ids=None, use_feature_union=True):
        self.demographics = demographics
        # None replaces the former mutable [] defaults, which were shared between every instance.
        # NOTE(review): scikit-learn's estimator conventions recommend storing constructor arguments
        # verbatim; the pandas.Index conversion is kept so the public attribute keeps its type.
        self.question_columns = pd.Index([] if question_columns is None else question_columns)
        self.multiselect_stem_ids = [] if multiselect_stem_ids is None else multiselect_stem_ids
        self.stem_id_dict = dict()
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        """Builds the stem_id_dict lookup from the columns of the survey dataframe X.

        Parameters
        ----------
        X : pandas DataFrame of survey responses (one column per question item)
        y : ignored

        Returns
        -------
        self
        """
        self.stem_id_dict = self._build_stem_id_dict(X, self.multiselect_stem_ids)
        return self

    def _build_stem_id_dict(self, survey_df, multiselect_stems):
        """Populates question_columns; Creates the stem_id_dict dictionary with keys as question stem ids, values as corresponding question item ids
        """
        if len(self.question_columns) == 0:
            # excludes open-text columns
            is_question = survey_df.columns.str.contains(self._QUESTION_COLUMN_PATTERN, regex=True)
            self.question_columns = survey_df.columns[is_question].str.strip()

        item_ids = self.question_columns.tolist()
        if not item_ids:
            return {}

        # stem ID = item ID with its trailing "_<digits>" suffix removed (unchanged when there is no suffix)
        stem_ids = (
            pd.Series(item_ids, dtype=object)
            .str.split(self._ITEM_SUFFIX_PATTERN, regex=True)
            .str[0]
            .tolist()
        )

        stem_id_dict = {}
        for item_id, stem_id in zip(item_ids, stem_ids):
            if stem_id in stem_id_dict:
                # stem already registered (eg. 2nd, 3rd, ... item of a multi-select question)
                continue
            if stem_id in multiselect_stems:
                # Literal prefix match: a regex/substring match would also pick up the items of
                # other stems that merely contain this one (eg. "Q1" inside "Q10_1").
                item_prefix = stem_id + '_'
                stem_id_dict[stem_id] = [
                    candidate for candidate in item_ids
                    if candidate == stem_id or candidate.startswith(item_prefix)
                ]
            else:
                # single-select question item = question stem
                stem_id_dict[item_id] = [item_id]
        return stem_id_dict

    def transform(self, X, y=None):
        """Computes the total columns for every single-select question.

        Parameters
        ----------
        X : pandas DataFrame of survey responses containing the demographic and question columns
        y : ignored

        Returns
        -------
        Pandas DataFrame
            Only the columns whose name contains "Total", one row per demographic value(s) / question
            response combination. If use_feature_union is True the column names are stored in the 1st row instead.
        """
        single_select_stem_ids = [
            stem_id for stem_id in self.get_question_stem_ids() if len(self.stem_id_dict[stem_id]) == 1
        ]
        total_counts = self._count_all_questions(X, self.demographics, single_select_stem_ids)
        if self.use_feature_union:  # store column names in 1st row of dataframe
            total_counts = pd.DataFrame([total_counts.columns] + total_counts.values.tolist())
        return total_counts

    def _count_total(self, survey_df, demographics, stem_id):
        """Counts total number of responses for a given question item/stem and given demographic groups
        """
        # calculate Question Item Total, Question Stem Total, Demographic Totals
        item_id = self.stem_id_dict[stem_id][0]
        # Group by each demographic once (first occurrence, in order). This is the same de-duplication
        # QuestionCounter applies, so rows line up wherever the repeated name sits in the list.
        unique_demographics = list(dict.fromkeys(demographics))
        total_df = survey_df.groupby(unique_demographics + [item_id]).size().to_frame('Count')
        # include rows where there are 0 responses; for uniformity purposes w/ demog total count
        total_df = total_df.unstack(fill_value=0).stack().reset_index()
        if total_df.shape[0] == 0:
            return pd.DataFrame()

        demographic_category = demographics[0]
        total_df['Demographic Value Total'] = total_df.groupby(demographic_category)['Count'].transform('sum')
        # 1 or more additional breakdowns (eg. entry status, by undergrad/grad OR entry status, by undergrad/grad by year)
        for demog in demographics[1:]:
            if demographic_category == demog:
                demographic_groups = [demographic_category]
            else:
                demographic_groups = [demographic_category, demog]
            total_df[f'Demographic Value Total, by {demog}'] = (
                total_df.groupby(demographic_groups)['Count'].transform('sum')
            )
        if len(demographics) > 2:
            combined_label = ' & '.join(unique_demographics[1:])
            total_df[f'Demographic Value Total, by {combined_label}'] = (
                total_df.groupby(unique_demographics)['Count'].transform('sum')
            )

        total_df = total_df.rename(columns={item_id: 'Question Response'})
        total_df['Question Stem Total'] = survey_df[stem_id].dropna().shape[0]
        total_df['Question Item Total'] = survey_df[item_id].dropna().shape[0]  # only for non-select-all-that-apply questions
        total_cols = total_df.columns[total_df.columns.str.contains('Total')]
        return total_df[total_cols]  # only return total columns

    def _count_all_questions(self, survey_df, demographics, question_stem_ids):
        """Creates a dataframe with total number of responses, by demographics, for each single-select question (stem)
        """
        per_question_totals = []
        for stem_id in question_stem_ids:
            total_df = self._count_total(survey_df, demographics, stem_id)
            if not total_df.empty:
                per_question_totals.append(total_df)

        if not per_question_totals:
            return pd.DataFrame()
        # a single concat instead of re-copying the accumulated frame on every iteration
        return pd.concat(per_question_totals, ignore_index=True)

    def get_question_stem_ids(self):
        """Returns a list of question stem IDs
        """
        return list(self.stem_id_dict)

### DoubleCountDataframeTransformer

In [5]:
class DoubleCountDataframeTransformer(BaseEstimator, TransformerMixin):
    """
    A custom Pipeline transformer class that duplicates rows so that a respondent holding several values of a
    demographic is counted once per value ("double counting").
    ...

    Attributes
    ----------
    demographic : str
        name of the demographic column to double count. Must be a column of the dataframe given to transform().
        NOTE: A double counting demographic is a column whose cells hold lists of the values to count separately.
              When the column holds no such lists there is nothing to duplicate and the rows come back unchanged.

    Methods
    -------
    fit(self, X, y=None)
        Does nothing; returns self
    transform(self, X, y=None)
        Returns X with one row per value of the demographic (rows are identical except for that column)
    """
    def __init__(self, demographic):
        self.demographic = demographic

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        """Double counts ("explodes") the rows of X on the self.demographic column.

        Parameters
        ----------
        X : dataframe with a column name matching self.demographic

        Returns
        -------
        Pandas DataFrame
            the exploded dataframe, with its index renumbered from 0
        """
        # a column without lists of values has nothing to explode, so its rows are kept as they are
        exploded_rows = X.explode(self.demographic, ignore_index=True)
        return exploded_rows

### ArrToDataframeTransformer

In [6]:
class ArrToDataframeTransformer(TransformerMixin):
    """
    A custom Pipeline transformer class that turns a 2D numpy array, whose first row holds the column names,
    into a Pandas DataFrame.
    ...

    Attributes
    ----------
    None

    Methods
    -------
    fit(self, X, y=None)
        Does nothing; returns self
    transform(self, X)
        Returns a dataframe named after the first row of X and holding the remaining rows
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Builds a dataframe from the array, using its first row as the column names

        Parameters
        ----------
        X : 2D numpy array

        Returns
        -------
        Pandas DataFrame
            every row of X except the first one, indexed from 0
        """
        column_names = X[0, :]
        with_header_row = pd.DataFrame(X, columns=column_names)
        # the first row only carried the column names
        data_rows = with_header_row.iloc[1:]
        return data_rows.reset_index(drop=True)

### DataframeTransposer

In [7]:
class DataframeTransposer(TransformerMixin):
    """
    A custom Pipeline transformer class that swaps the rows and columns of a DataFrame or 2D numpy array
    ...

    Attributes
    ----------
    None

    Methods
    -------
    fit(self, X, y=None)
        Does nothing; returns self
    transform(self, X)
        Returns the transpose of the "X" variable
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Returns the transpose of a dataframe or a 2D array

        Parameters
        ----------
        X : 2D numpy array or pandas dataframe

        Returns
        -------
        2D numpy array or pandas dataframe
        """
        transposed = X.T
        return transposed