# Multi-select Counter Transformers

**These classes create a dataframe of counts for multi-select questions ONLY; the resulting dataframe is later combined with the single-select dataframe.**

In [1]:
import numpy as np
import pandas as pd
import sys
import os
pd.set_option("display.max_column", None)
pd.set_option("display.max_colwidth", None)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

import numpy as np
import pandas as pd
from copy import deepcopy
import re

## CountTotal

In [2]:
class CountTotal(BaseEstimator, TransformerMixin):
    """Count responses to one multi-select question stem, broken down by demographics.

    ``fit`` collects the survey columns whose name matches ``stem_id`` (and is
    not followed by ``TEXT``). ``transform`` emits one row per
    (question item, demographic category, demographic value, Undergrad Grad,
    question response) with its ``Count`` plus question stem/item totals.

    Parameters
    ----------
    stem_id : str
        Question stem id. It is interpolated into a regular expression that is
        matched against the column names of ``X``.
    question_desc : pandas.DataFrame
        Question descriptions, one column per question item id.
        NOTE(review): ``fit`` rewrites this frame in place.
    demo_cat : list of str
        Single-count demographic columns.
    double_demo_cat : list of str
        Double-count demographic columns. ``X`` is exploded on each of them
        before counting, so they are expected to hold list-like values.
    use_feature_union : bool, default False
        Whether this transformer is used in a FeatureUnion. When True the
        column names are stored in the first row of the returned frame.
    """

    def __init__(self, stem_id, question_desc, demo_cat, double_demo_cat, use_feature_union=False):
        self.stem_id = stem_id
        self.item_ids = []
        self.question_desc = question_desc
        # list of all single count demographics columns
        self.demo_cat = demo_cat
        # list of all double count demographics columns
        self.double_demo_cat = double_demo_cat
        # whether or not this transformer is used in a FeatureUnion
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        """Clean the question descriptions and collect the item ids of ``self.stem_id``.

        Every column of ``self.question_desc`` is replaced by its ``to_string()``
        rendering with ' - Selected Choice' and every '0' character removed.

        NOTE(review): the '0' removal presumably targets the row label that
        ``to_string()`` prepends, but it also strips any other "0" from the
        question text, and the frame passed to ``__init__`` is modified in
        place on every call to ``fit``. Confirm before relying on the text.
        """
        for col in self.question_desc.columns:
            self.question_desc[col] = self.question_desc[col].to_string().replace(' - Selected Choice', '').replace('0', '')
        # columns that contain the stem id and have no "TEXT" after it
        self.item_ids = X.columns[X.columns.str.contains(rf"{self.stem_id}(?!.*TEXT)")]
        return self

    def transform(self, X):
        """Return the counts table for the fitted question items.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw survey data. The code reads the columns in ``self.item_ids``,
            ``self.demo_cat``, ``self.double_demo_cat``, 'Undergrad Grad' and
            'ResponseId'.

        Returns
        -------
        pandas.DataFrame
            Long-format counts. If ``use_feature_union`` is True the column
            names are stored in the first row instead of the header.
        """
        def count_helper(df, fields, column, key, original_df = None):
            """HELPER: recursively get counts with multi-level break_down
            Args:
            df: dataframe with raw survey data
            fields ([str]):  breakdown variables, metrics of interest (demographic details, etc.)
            column (str): the question/column to be counted
            key (str): unique identifier to count with (usually ResponseId)
            original_df: un-filtered dataframe of the top-level call; when given,
                the unique responses / categories are taken from it so every
                recursion level produces the same set of rows and columns
            """
            field = fields[0]
            # indices = unique responses to question in "column"
            indices = df[column].unique() if original_df is None else original_df[column].unique()
            # columns = unique demographic categories in field
            columns = df[field].unique() if original_df is None else original_df[field].unique()

            # base case
            if len(fields) == 1:
                # x == x is False only for NaN, so missing values are left out
                result = pd.DataFrame(columns=[x for x in columns if x==x],
                                      index=[x for x in indices if x==x])
                for index in indices:
                    if index!=index or index not in df[column].unique(): continue
                    curr_index = df.loc[df[column] == index]
                    for col in columns:
                        if col!=col or col == " " or col not in df[field].unique(): continue
                        temp = curr_index.loc[df[field] == col]
                        count = temp[key].count()
                        result.at[index, col] = count
                return result.sort_index(axis=1).sort_index(axis=0).fillna(0)

            # recurse: one sub-table per value of the first field
            d = {}
            for col in columns:
                if col!=col or col == " ": continue
                curr = df.loc[df[fields[0]] == col]
                d[col] = count_helper(curr, fields[1:], column, key, (original_df if original_df is not None else df))

            # the dict keys become the outer level of the column MultiIndex
            temp = pd.concat(d, axis=1)
            return temp.sort_index(axis=1).sort_index(axis=0).fillna(0)

        def double_count_helper(df, double_count_separator="<->"):
            """HELPER: account for multiple-selection data points
            DO NOT use directly. Called when double_count is set to True in count_by_fields
            Args:
            df: dataframe with counts (e.g: result of count_by_fields())
            double_count_separator (str): separator used in datafile between individual choices

            Return:
            new dataframe with no multiple-selections - results separated and double counted into respective columns
            """
            result = df.copy()
            # separator with a non-whitespace character on each side; raw
            # strings keep the backslash-S class out of string-escape processing
            # (the separator is used as a regex as-is, it is not escaped)
            reg = r'\S' + double_count_separator + r'\S'

            # grabs all columns in results (does not care about separation)
            cols = result.columns.levels[0] if isinstance(result.columns, pd.MultiIndex) else result.columns

            for col in cols:
                col_cp = str(col)
                for match in re.findall(reg, str(col)):
                    # swap the separator for an internal marker, keeping the
                    # characters that surround it
                    col_cp = col_cp.replace(match, match[0]+"<<~>>"+match[-1])
                separated_col = col_cp.split("<<~>>")
                if len(separated_col) > 1:
                    for item in separated_col:
                        if item in result.columns:
                            result[item] = result[col] + result[item]
                        else:
                            # append new separated column to data frame
                            # first add name of column to dataframe with MultiIndex.from_arrays
                            # NOTE(review): uses new_col.shape[1], so result[col] is
                            # assumed to be a DataFrame (MultiIndex columns)
                            new_col = result[col]
                            name = [item for i in range(new_col.shape[1])]
                            new_col.columns = pd.MultiIndex.from_arrays([name, new_col.columns])

                            # then concat
                            result = pd.concat([result, new_col], axis=1)
                    result = result.drop(col, axis=1)
            return result

        def count_by_fields(df, fields, column, key,
                    double_count=False, double_count_separator="<->"):
            """HELPER: break down survey dataframes by the given metrics
            Args:
            df: dataframe with raw survey data
            fields ([str]):  breakdown variables, metrics of interest (demographic details, etc.)
            column (str): the question/column to be counted
            key (str): unique identifier to count with (usually ResponseId)
            double_count (bool): whether to split combined columns with double_count_helper
            double_count_separator (str): separator forwarded to double_count_helper

            Return:
            dataframe of counts with the unique values of "column" as index and
            the unique values of "fields" as columns
            """
            result = count_helper(df, fields, column, key).fillna(0)

            if double_count:
                result = double_count_helper(result, double_count_separator)

            return result

        def data_studio_pivot(df, title, index_name,
                      column_names, value_name,
                      remove_low_counts=True, min_cell_count=10):
            """HELPER: create a pivot table to be used as data sources
            Args:
            df: dataframe with counts (e.g: result of count_by_fields())
            title (str): common identifier for the pivot table (e.g: the common question)
            index_name (str): what to title the indexes of the original dataframe (e.g: "responses")
            column_names (List[str]): what to title the columns' names in the original dataframe (e.g: demographic)
            value_name (str): what to title the values of the original dataframe (e.g: "counts", "percentages")
            remove_low_counts (bool): whether to replace every value of a column whose sum is
                at most min_cell_count with -1 (default True)
            min_cell_count (int): column sum at or below which the column is replaced (default 10)

            Return:
            new pivot table (as a dataframe), one row per (index, column) cell of df
            """
            if isinstance(df.columns, pd.MultiIndex):
                assert df.columns.nlevels == len(column_names)
            result = pd.DataFrame(columns=["Title", index_name] + column_names + [value_name])
            i = 0
            for col in df.columns:
                remove = remove_low_counts and np.sum(df[col]) <= min_cell_count
                for index in df.index:
                    val = df.at[index,col] if not remove else -1
                    df_dict = {"Title": title, index_name: index, value_name: val}
                    if isinstance(df.columns, pd.MultiIndex):
                        # the comprehension has its own i; the row counter is untouched
                        df_dict.update({column_names[i]: col[i] for i in range(len(column_names))})
                    else:
                        df_dict.update({column_names[0]: col})
                    result = pd.concat([result, pd.DataFrame(df_dict, index=[i])])
                    i+=1
            return result

        def add_cols(sub_df, item_id):
            '''HELPER: Adds [Question Item Id, Question Stem Id, Question Stem, Question Item] columns to create_counts_df output
            ------------------------------------------
            sub_df : df created from create_counts_df
            item_id : ONE string for item id

            '''
            sub_df['Question Item Id'] = [item_id for i in range(len(sub_df.index))]
            # stem id = item id without an optional trailing "_<digits>"
            sub_df['Question Stem Id'] = sub_df['Question Item Id'].str.replace(r'(_\d+)?$', '', regex=True)

            # description text is "<stem> - <item> [- ...]"
            question_stem = self.question_desc[item_id][0].split(" - ")[0]
            question_item = self.question_desc[item_id][0].split(" - ")[1:]
            question_item = " ".join(question_item)
            sub_df['Question Stem'] = [question_stem for i in range(len(sub_df.index))]
            sub_df['Question Item'] = [question_item for i in range(len(sub_df.index))]
            return sub_df


        def create_counts_df(raw_survey, stem_id, demo_cat = self.demo_cat):
            '''Counts for each demographic
            ---------------------------------------
            raw_survey: raw survey df (excluding questions)
            stem_id: ONE string for stem id (not used by the body; the items come from self.item_ids)
            demo_cat: demographic columns to break the counts down by

            '''
            df = pd.DataFrame()
            for item in self.item_ids:
                for category in demo_cat:
                    df1 = data_studio_pivot(
                        count_by_fields(raw_survey, [category, "Undergrad Grad"],
                                        item, "ResponseId", double_count=True,
                                        double_count_separator="<->"),
                                        " ", "Question Response", ["Demographic Value", "Undergrad Grad"],
                                        "Count",
                                        remove_low_counts=False
                                        )
                    df1['Demographic Category'] = [category for i in range(len(df1.index))]
                    df1 = add_cols(df1, item)
                    df = pd.concat([df, df1], ignore_index = True).drop(columns = ['Title'])
            return df

        def add_stem_item_total(raw_survey, counts_df, stem_ids):
            '''Adds 'Question Item Total' and 'Question Stem Total' columns to counts_df
            ---------------------------------------
            raw_survey: raw survey df
            counts_df: df created from create_counts_df
            stem_ids: ONE stem id or a list of stem ids

            '''
            # stem total = number of rows with a truthy value in any item column of the stem
            def questionstemtotal(df, stem_ids):
                qtotal = {}
                for stem_id in stem_ids:
                    total = df[df.columns[df.columns.str.contains(rf"{stem_id}(?!.*TEXT)")]].any(axis=1).sum()
                    qtotal[stem_id] = total
                return qtotal


            # itemtotal is a dictionary of question id to id total
            # (number of non-null values in the item column)
            # returns a dictionary
            def questionitemtotal(df, stem_ids):
                itemtotal = {}
                for stem_id in stem_ids:
                    for item in df.columns[df.columns.str.contains(rf"{stem_id}(?!.*TEXT)")]:
                        itemtotal[item] = df[item].dropna().shape[0]
                return itemtotal

            if not isinstance(stem_ids, list):
                stem_ids = [stem_ids]
            q_stemtotal_dict = questionstemtotal(raw_survey, stem_ids)
            q_itemtotal_dict = questionitemtotal(raw_survey, stem_ids)
            counts_df['Question Item Total'] = counts_df['Question Item Id'].map(q_itemtotal_dict)
            counts_df['Question Stem Total'] = counts_df['Question Stem Id'].map(q_stemtotal_dict)
            return counts_df

        counts_data_source = create_counts_df(X, self.stem_id)
        counts_data_source = add_stem_item_total(X, counts_data_source, self.stem_id)

        ## ROSE EDIT: Concat to counts_data_source double counting columns (ex: Reporting College, Multiple Ethnicities) ##
        double_counts_data_source = pd.DataFrame()
        for demo in self.double_demo_cat:
            # one row per listed value, so each value is counted separately
            explode_df = X.explode(demo).reset_index(drop=True)
            dbl_data_source = create_counts_df(explode_df, self.stem_id, [demo])
            dbl_data_source = add_stem_item_total(explode_df, dbl_data_source, self.stem_id)
            double_counts_data_source = pd.concat([double_counts_data_source, dbl_data_source], ignore_index=True)

        counts_data_source = pd.concat([double_counts_data_source, counts_data_source], ignore_index = True)

        # get rid of grad rows if it's an undergrad only question, and vice versa
        population = X.loc[X[self.item_ids].any(axis=1), 'Undergrad Grad'].unique()
        if 'U' in population and 'G' not in population:
            counts_data_source = counts_data_source[counts_data_source['Undergrad Grad']=='U']
        if 'G' in population and 'U' not in population:
            counts_data_source = counts_data_source[counts_data_source['Undergrad Grad']=='G']

        # get rid of rows with count 0 that don't make sense (eg. undergrad "Masters")
        ## ROSE EDIT: Values for Undergrad Grad is not always U or G ##
        for demog in self.demo_cat:
            if demog != 'Undergrad Grad':
                u_demog = X.loc[X['Undergrad Grad']=='U', demog].unique()
                g_demog = X.loc[X['Undergrad Grad']=='G', demog].unique()
                # keep rows of other categories, and rows whose value actually
                # occurs for that Undergrad Grad group in X
                other_category = counts_data_source['Demographic Category']!=demog
                valid_u = (counts_data_source['Demographic Value'].isin(u_demog))&(counts_data_source['Undergrad Grad']=='U')
                valid_g = (counts_data_source['Demographic Value'].isin(g_demog))&(counts_data_source['Undergrad Grad']=='G')
                counts_data_source = counts_data_source[other_category|valid_u|valid_g]

        # get rid of rows where "Demographic Value" = 'G' but "Undergrad Grad" is "U", and vice versa
        counts_data_source = counts_data_source[(counts_data_source['Demographic Category']!='Undergrad Grad')|
                                                (counts_data_source['Demographic Value']==counts_data_source['Undergrad Grad'])]
        counts_data_source = counts_data_source.reset_index(drop=True)

        if self.use_feature_union: # store column names in 1st row of dataframe
            counts_data_source = pd.DataFrame([counts_data_source.columns] + counts_data_source.values.tolist())

        return counts_data_source

## DemoTotals

In [3]:
class DemoTotals(CountTotal):
    """Compute demographic totals for the respondents of one multi-select question stem.

    For every demographic value the output carries the number of respondents
    who selected at least one item of the stem ('Demographic Value Total') and
    the same number split by 'Undergrad Grad'
    ('Demographic Value Total, by Undergrad Grad'), repeated once per question
    item id.

    Parameters
    ----------
    stem_id : str
        Question stem id; interpolated into a regular expression that is
        matched against the column names of ``X``.
    demo_cat : list of str
        Single-count demographic columns.
    double_demo_cat : list of str
        Double-count demographic columns. ``X`` is exploded on each of them
        before counting, so they are expected to hold list-like values.
    use_feature_union : bool, default False
        Whether this transformer is used in a FeatureUnion. When True the
        column names are stored in the first row of the returned frame.
    """

    def __init__(self, stem_id, demo_cat, double_demo_cat, use_feature_union=False):
        # NOTE: CountTotal.__init__ is not called; this class takes no question_desc
        # select-all-that-apply question stem id
        self.stem_id = stem_id
        self.item_ids = []
        # list of all single count demographics columns
        self.demo_cat = demo_cat
        # list of all double count demographics columns
        self.double_demo_cat = double_demo_cat
        # whether or not this transformer is used in a FeatureUnion
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        '''Create list of all item_ids for self.stem_id'''
        # columns that contain the stem id and have no "TEXT" after it
        self.item_ids = X.columns[X.columns.str.contains(rf"{self.stem_id}(?!.*TEXT)")]
        return self

    def transform(self, X):
        """Return the demographic totals table, one copy per fitted item id.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw survey data. The code reads the columns in ``self.item_ids``,
            ``self.demo_cat``, ``self.double_demo_cat`` and 'Undergrad Grad'.

        Returns
        -------
        pandas.DataFrame
            Demographic totals with a 'Question Item Id' column. If
            ``use_feature_union`` is True the column names are stored in the
            first row instead of the header.
        """

        def demototal(raw_survey, demo_cat = self.demo_cat):
            '''Dataframe with [Demographic Value, 'Demographic Value Total', 'Demographic Category'] columns for one stem id
            ---------------------------------------
            raw_survey: raw survey df (excluding questions)
            demo_cat: demographic columns to total

            '''
            df = pd.DataFrame()
            for category in demo_cat:
                # rows with a truthy value in any item column, counted per
                # category value; the total is the non-null count of the first
                # remaining column (presumably an always-filled one - verify)
                x = raw_survey[raw_survey[self.item_ids].any(axis=1)].groupby(category).count().iloc[:,0].to_frame('Demographic Value Total').reset_index()
                x['Demographic Category'] = [category for i in range(len(x.index))]
                df = pd.concat([df, x], ignore_index = True)
            # every row has a value in exactly one category column (the rest are
            # NaN after the concat), so stack() collapses them into one value per row
            df['Demographic Value'] = df.drop(columns = ['Demographic Value Total', 'Demographic Category']).stack().values
            df = df[['Demographic Value', 'Demographic Value Total', 'Demographic Category']]

            return df


        def demoUGtotal(raw_survey, stem_id, demo_cat = self.demo_cat):
            '''Dataframe with [Demographic Value, 'Demographic Value Total, by Undergrad Grad', 'Undergrad Grad', 'Demographic Category'] columns for one stem id
            ---------------------------------------
            raw_survey: raw survey df (excluding questions)
            stem_id: ONE string for stem id (not used by the body; the items come from self.item_ids)
            demo_cat: demographic columns to total

            '''
            df = pd.DataFrame()
            for category in demo_cat:
                if category != 'Undergrad Grad':
                    x = raw_survey[raw_survey[self.item_ids].any(axis=1)].groupby([category, 'Undergrad Grad']).count().iloc[:,0].to_frame('Demographic Value Total, by Undergrad Grad')
                    # unstack/stack fills the missing (value, Undergrad Grad) combinations with 0
                    x = x.unstack(fill_value=0).stack().reset_index()
                    x['Demographic Category'] = [category for i in range(len(x.index))]
                    df = pd.concat([df, x], ignore_index = True)

            df['Demographic Value'] = df.drop(columns = ['Demographic Value Total, by Undergrad Grad', 'Undergrad Grad', 'Demographic Category']).stack().values
            df = df[['Demographic Value', 'Undergrad Grad', 'Demographic Value Total, by Undergrad Grad', 'Demographic Category']]

            # Add Undergrad Grad rows. The totals do not depend on the loop
            # variable, so they are computed once (with the default demo_cat).
            ug_totals = demototal(raw_survey)
            is_ug_category = ug_totals['Demographic Category']=='Undergrad Grad'
            additional_rows = []
            for val in ['U', 'G']:
                matches = ug_totals.loc[is_ug_category&(ug_totals['Demographic Value']==val), 'Demographic Value Total'].values
                if len(matches) == 0:
                    # no total for this value: nothing to add
                    continue
                ug_value = matches[0]
                # one row per 'Undergrad Grad' label; transform() later drops the
                # rows whose 'Demographic Value' differs from 'Undergrad Grad'
                for ug in ['G', 'U']:
                    ug_demo_counts = {'Demographic Value': val, 'Undergrad Grad': ug, 'Demographic Value Total, by Undergrad Grad': ug_value, 'Demographic Category': 'Undergrad Grad'}
                    additional_rows.append(pd.DataFrame([ug_demo_counts]))

            df = pd.concat([df] + additional_rows, ignore_index=True)

            return df

        demo_only = demototal(X)
        demo_ug = demoUGtotal(X, self.stem_id)
        demos = demo_only.merge(demo_ug, on=["Demographic Value", "Demographic Category"])


        ## ROSE EDIT: add double counting dfs ##
        double_counts_demos = pd.DataFrame()
        for demo in self.double_demo_cat:
            # one row per listed value, so each value is counted separately
            explode_df = X.explode(demo).reset_index(drop=True)
            dbl_demo_only = demototal(explode_df, [demo])
            dbl_demo_ug = demoUGtotal(explode_df, self.stem_id, [demo])
            dbl_demos = dbl_demo_only.merge(dbl_demo_ug, on=["Demographic Value", "Demographic Category"])
            double_counts_demos = pd.concat([double_counts_demos, dbl_demos], ignore_index = True)
        demos = pd.concat([double_counts_demos, demos], ignore_index = True)

        # duplicate for however many item ids there are and concat on top of each other
        demos_all_items = pd.concat([demos]*len(self.item_ids), ignore_index=True)
        id_list = []
        for item in self.item_ids:
            duplicated_items = [item for _ in range(len(demos.index))]
            id_list.extend(duplicated_items)
        demos_all_items['Question Item Id'] = id_list


        # get rid of grad rows if it's an undergrad only question, and vice versa
        population = X.loc[X[self.item_ids].any(axis=1), 'Undergrad Grad'].unique()
        if 'U' in population and 'G' not in population:
            demos_all_items = demos_all_items[demos_all_items['Undergrad Grad']=='U']
        if 'G' in population and 'U' not in population:
            demos_all_items = demos_all_items[demos_all_items['Undergrad Grad']=='G']

        # get rid of rows with count 0 that don't make sense (eg. undergrad "Masters")
        for demog in self.demo_cat:
            if demog != 'Undergrad Grad':
                u_demog = X.loc[X['Undergrad Grad']=='U', demog].unique()
                g_demog = X.loc[X['Undergrad Grad']=='G', demog].unique()
                # keep rows of other categories, and rows whose value actually
                # occurs for that Undergrad Grad group in X
                other_category = demos_all_items['Demographic Category']!=demog
                valid_u = (demos_all_items['Demographic Value'].isin(u_demog))&(demos_all_items['Undergrad Grad']=='U')
                valid_g = (demos_all_items['Demographic Value'].isin(g_demog))&(demos_all_items['Undergrad Grad']=='G')
                demos_all_items = demos_all_items[other_category|valid_u|valid_g]

        # get rid of rows where "Demographic Value" = 'G' but "Undergrad Grad" is "U", and vice versa
        demos_all_items = demos_all_items[(demos_all_items['Demographic Category']!='Undergrad Grad')|
                                          (demos_all_items['Demographic Value']==demos_all_items['Undergrad Grad'])]
        demos_all_items = demos_all_items.reset_index(drop=True)


        if self.use_feature_union: # store column names in 1st row of dataframe
            demos_all_items = pd.DataFrame([demos_all_items.columns] + demos_all_items.values.tolist())

        return demos_all_items

## MergeDataFrames

In [4]:
## merges two dataframes together 
## takes in two CLASSES (Ex: CountTotal(MULTI_SELECT_STEM_IDS[0], QUESTION_DESC, DEMOGRAPHIC_COLUMNS[:-2])), then fit_transforms them, then merges them 
class MergeDataFrames(BaseEstimator, TransformerMixin):
    """Run two transformers on the same input and merge their outputs.

    Parameters
    ----------
    class1, class2 : transformer instances
        Objects exposing ``fit_transform``; their outputs become the left and
        right side of the merge respectively.
    merge_cols : label or list of labels
        Passed to ``pd.merge`` as ``on``.
    join_type : str
        Passed to ``pd.merge`` as ``how``.
    """

    def __init__(self, class1, class2, merge_cols, join_type):
        self.class1 = class1
        self.class2 = class2
        self.merge_cols = merge_cols
        self.join_type = join_type

    def fit(self, X, y=None):
        """Nothing to learn here; both wrapped transformers are fitted in ``transform``."""
        return self

    def transform(self, X):
        """Fit-transform both wrapped transformers on ``X`` and merge the results."""
        left = self.class1.fit_transform(X)
        right = self.class2.fit_transform(X)
        # If class2 is a DemoTotals, `right` can be narrowed here to
        # ['Demographic Value Total', 'Demographic Value Total, by Undergrad Grad',
        #  'Demographic Value', 'Demographic Category', 'Undergrad Grad']
        # before merging; that step is currently disabled.
        return pd.merge(left, right, on=self.merge_cols, how=self.join_type)

## DropLowCounts

In [5]:
class DropLowCounts(BaseEstimator, TransformerMixin):
    """
    Changes counts/totals lower than lowest_count in a dataframe to -1

    ...

    Attributes
    ----------
    lowest_count : int
        Value of lowest count acceptable in dataframe (default 11)

    demo_only_col : string
        String of the column in dataframe that has the demographic total

    demo_ug_col : string
        String of the column in dataframe that has the demographic and U/G total

    count_col : string
        String of the column in dataframe that has the counts

    use_feature_union : bool
        Whether this transformer is used in a FeatureUnion; when True the
        column names are stored in the first row of the returned dataframe


    Methods
    -------
    transform(X)
        Replaces values in demo_only_col, demo_ug_col and count_col with -1 if the demo_only_col is less than lowest_count
        If demo_ug_col has values less than lowest_count, redacts demo and U/G total ONLY

    """
    def __init__(self, demo_only_col, demo_ug_col, count_col, lowest_count=11, use_feature_union=False):
        self.lowest_count = lowest_count
        self.demo_only_col = demo_only_col
        self.demo_ug_col = demo_ug_col
        self.count_col = count_col
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a redacted copy of ``X``; the input dataframe is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain demo_only_col, demo_ug_col and count_col.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with low values replaced by -1. If
            ``use_feature_union`` is True the column names are stored in the
            first row instead of the header.
        """
        # work on a copy so the caller's dataframe is not modified
        X = X.copy()

        # rows whose demographic group is large enough; computed once, before
        # any column (including demo_only_col itself) is overwritten
        enough_in_group = X[self.demo_only_col] >= self.lowest_count

        # if the demographic group is below lowest_count, redact all three columns
        for col in (self.demo_only_col, self.demo_ug_col, self.count_col):
            X[col] = X[col].where(enough_in_group, -1)

        # check for values less than lowest_count in demo_ug --> redact demo & UG total only
        # b/c the other columns could still be used for all respondents, by demographic
        X[self.demo_ug_col] = X[self.demo_ug_col].where(X[self.demo_ug_col] >= self.lowest_count, -1)

        if self.use_feature_union: # store column names in 1st row of dataframe
            X = pd.DataFrame([X.columns] + X.values.tolist())
        return X

## ColumnsReindexer

In [6]:
class ColumnsReindexer(TransformerMixin):
    """Reorder (and restrict) the columns of a dataframe.

    Parameters
    ----------
    columns : list of str, optional
        Desired column order. When omitted or empty, ``fit`` installs the
        default order of the counts table.
    use_feature_union : bool, default False
        Whether this transformer is used in a FeatureUnion; when True the
        column names are stored in the first row of the returned dataframe.
    """

    def __init__(self, columns=None, use_feature_union=False):
        # a fresh list per instance instead of a shared mutable default
        self.ordered_columns = [] if columns is None else columns
        self.use_feature_union = use_feature_union

    def fit(self, X, y=None):
        """Install the default column order if none was given."""
        # len() rather than truthiness so array-like column collections work too
        if len(self.ordered_columns) == 0:
            self.ordered_columns = ['Question Stem Id', 'Question Item Id',
                                    'Demographic Category', 'Demographic Value',
                                    'Undergrad Grad', 'Question Response', 'Count',
                                    'Question Item', 'Question Stem', 'Demographic Value Total',
                                    'Demographic Value Total, by Undergrad Grad',
                                    'Question Stem Total', 'Question Item Total']
        return self

    def transform(self, X):
        """Return ``X`` reindexed to ``self.ordered_columns``."""
        X = X.reindex(columns=self.ordered_columns)
        if self.use_feature_union: # store column names in 1st row of dataframe
            X = pd.DataFrame([X.columns] + X.values.tolist())
        return X

## DataframeTransposer

In [7]:
class DataframeTransposer(TransformerMixin):
    """Stateless transformer that hands back the transpose of its input."""

    def __init__(self):
        """No configuration is needed."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Swap the rows and columns of ``X``.

        Parameters
        ----------
        X : 2D numpy array or pandas dataframe

        Returns
        -------
        2D numpy array or pandas dataframe
            ``X.T``
        """
        transposed = X.T
        return transposed

## ArrToDataframeTransformer

In [8]:
class ArrToDataframeTransformer(TransformerMixin):
    """Stateless transformer that turns a header-carrying array into a dataframe."""

    def __init__(self):
        """No configuration is needed."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Build a dataframe whose column names come from the first row of ``X``.

        Parameters
        ----------
        X : 2D numpy array
            Row 0 holds the column names, the remaining rows hold the data.

        Returns
        -------
        Pandas DataFrame
            The data rows, re-indexed from 0.
        """
        header = X[0, :]
        frame = pd.DataFrame(X, columns=header)
        # row 0 only repeated the header, so it is removed from the data
        without_header = frame.drop(0)
        return without_header.reset_index(drop=True)