In [91]:
import os, pprint, copy, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
from theano import shared
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler
from typing import Set, Tuple, Dict

In [92]:
class Preprocessor:
    """Pipeline to preprocess tabular IID data ready for input to a binary
        classification model.

        After instantiation, run the fit_transform() method on the training
        data, then use the transform() method to preprocess subsequent data.

        I deliberately avoid making the data an attribute of the class. This
        means that we have to pass the DataFrame between functions a lot, but
        it should make pickling the fitted preprocessor object more
        practical."""

    def __init__(self,
                 continuous_scaler=StandardScaler,
                 continuous_fill_value: float = 0.0,
                 categorical_fill_value: str = 'MISSING',
                 verbose: bool = False):
        """
        Args:
            continuous_fill_value: Value to replace missing values in continuous
                variables with, after scaling
            categorical_fill_value: Value to replace missing values in
                categorical variables with
            verbose: If True, prints reports on some parts of the preprocessing
        """
        self.continuous_scaler = continuous_scaler
        self.continuous_fill_value = continuous_fill_value
        self.categorical_fill_value = categorical_fill_value
        self.verbose = verbose
        self.y_name = None
        self.x_names = None
        self.force_categorical = None
        self.columns = None
        self.col_i = None
        self.encoders = {}

    def fit_transform(self,
                      df: pd.DataFrame,
                      y_name: str,
                      force_categorical: Set[str] = set()) -> Tuple[np.ndarray,
                                                                    np.ndarray]:
        """Transforms a table of training data ready for input to a binary
            classification model. Fits encoders for continuous and categorical
            variables in the process.

        Args:
            df: Rows are samples, columns are features. May contain missing
                values.
            y_name: Name of column (in df) containing the binary labels
            force_categorical: Column names for continuous variables to encode
                as categorical

        Returns:
            (features, binary labels)
        """
        self.y_name = y_name
        self.force_categorical = force_categorical
        df = df.reset_index(drop=True)
        self._set_columns(df)
        self._force_continuous_to_categorical()
        self._remove_y_from_categorical()
        df = self._drop_where_label_missing(df)
        df = self._fit_transform_continuous(df)
        df = self._make_continuous_missingness_indicators(df)
        df = self._impute_continuous_missingness(df)
        df = self._cast_categorical_as_str(df)
        df = self._add_missingness_category_for_categorical(df)
        df = self._fit_transform_all_categorical(df)
        self._set_x_names(df)
        self._set_col_i()
        df = self._fit_transform_y(df)
        return df[self.x_names].values, df[self.y_name].values

    def transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Transforms (using already-fitted encoders) table of new data ready
            for input to a binary classification model.

        Args:
            df: Rows are samples, columns are features. Must match the format of
                the DataFrame passed to fit_transform() earlier. May contain
                missing values.

        Returns:
            (features, binary labels)
        """
        df = df.reset_index(drop=True)
        self._pre_transform_sanity_check(df)
        df = self._drop_where_label_missing(df)
        df = self._transform_continuous(df)
        df = self._make_continuous_missingness_indicators(df)
        df = self._impute_continuous_missingness(df)
        df = self._cast_categorical_as_str(df)
        df = self._add_missingness_category_for_categorical(df)
        df = self._transform_all_categorical(df)
        df = self._transform_y(df)
        return df[self.x_names].values, df[self.y_name].values

    def _pre_transform_sanity_check(self, df: pd.DataFrame):
        # TODO: Add some tests here
        pass

    def _set_columns(self, df: pd.DataFrame):
        """Divides DataFrame column names into categorical and continuous."""
        self.columns = {'cat': df.select_dtypes(
                            include=['object']).columns.tolist(),
                        'cont': df.select_dtypes(
                            exclude=['object']).columns.tolist()}

    def _force_continuous_to_categorical(self):
        """Forces categorical encoding of specified continuous variables."""
        for feature in self.force_categorical:
            self.columns['cont'].remove(feature)
            self.columns['cat'].append(feature)

    def _remove_y_from_categorical(self):
        """Removes y_name from self.columns['cat'] to avoid inappropriate
            processing as a categorical feature."""
        self.columns['cat'].remove(self.y_name)

    def _drop_where_label_missing(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drops any rows in the data where the label is missing."""
        pre_drop_size = df.shape[0]
        df = df.dropna(subset=[self.y_name])
        n_dropped = pre_drop_size - df.shape[0]
        if n_dropped and self.verbose:
            print('Dropped {} rows where label missing.'.format(n_dropped))
        return df

    def _fit_transform_continuous(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fits encoders for non-missing values of continuous variables, and
            scales them. Casts to float64."""
        self.encoders['scalers'] = dict()
        for col_name in self.columns['cont']:
            self.encoders['scalers'][col_name] = self.continuous_scaler()
            df.loc[df[col_name].notnull(), col_name] = self.encoders[
                'scalers'][col_name].fit_transform(df.loc[
                    df[col_name].notnull(),
                    col_name].values.reshape(-1, 1).astype('float64'))
        return df

    def _transform_continuous(self, df: pd.DataFrame) -> pd.DataFrame:
        """Scales non-missing values of continuous variables. Casts to
            float64."""
        for col_name in self.columns['cont']:
            if df.loc[df[col_name].notnull(), col_name].shape[0]:
                df.loc[df[col_name].notnull(), col_name] = self.encoders[
                    'scalers'][col_name].transform(df.loc[
                        df[col_name].notnull(),
                        col_name].values.reshape(-1, 1).astype('float64'))
        return df

    def _make_continuous_missingness_indicators(
            self, df: pd.DataFrame) -> pd.DataFrame:
        """For every continous variable, makes separate indicator column for
            each continuous variable, which is 1 where it has a missing value,
            else 0."""
        for col_name in self.columns['cont']:
            missing_indicator = df[col_name].isnull().astype(int)
            if self.verbose:
                n_missing = missing_indicator.sum()
                print('{} has {} missing values.'.format(col_name, n_missing))
            df['{}_MISSING'.format(col_name)] = missing_indicator
        return df

    def _impute_continuous_missingness(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill in missing values for continuous variables."""
        df.loc[:, self.columns['cont']] = df.loc[
            :, self.columns['cont']].fillna(self.continuous_fill_value)
        return df

    def _cast_categorical_as_str(self, df: pd.DataFrame) -> pd.DataFrame:
        """Casts non-missing values of categorical variables as strings, to
            ensure consistent typing once missingness category is added."""
        for col_name in self.columns['cat']:
            if df.loc[df[col_name].notnull(), col_name].shape[0]:
                df.loc[df[col_name].notnull(), col_name] = (
                    df.loc[df[col_name].notnull(), col_name].astype(str))
        return df

    def _add_missingness_category_for_categorical(
            self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill in missing values for categorical variables with a separate
            string, that will end up being encoded as a separate category."""
        df.loc[:, self.columns['cat']] = df.loc[
            :, self.columns['cat']].fillna(self.categorical_fill_value)
        return df

    def _fit_transform_all_categorical(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fits one-hot encoders for all categorical variables, then does the
            encoding. Adds a custom .columns_ attribute to each OHE object
            containing the corresponding DataFrame column names."""
        self.encoders['ohe'] = dict()
        for col_name in self.columns['cat']:
            ohe_var = pd.DataFrame(
                self._fit_transform_single_categorical(df, col_name),
                columns=self.encoders['ohe'][col_name].categories_[0])
            ohe_var.columns = ['{}_{}'.format(
                col_name, category.lstrip().replace(' ', '_').upper())
                for category in ohe_var.columns]
            self.encoders['ohe'][col_name].columns_ = list(ohe_var.columns)
            df = pd.concat((df.drop(col_name, axis=1), ohe_var), axis=1)
        return df

    def _fit_transform_single_categorical(self, df: pd.DataFrame,
                                          col_name: str) -> np.ndarray:
        """Fits a one-hot encoder then does the encoding for a single
            categorical variable."""
        categories = list(df[col_name].unique())
        if self.categorical_fill_value not in categories:
            categories.append(self.categorical_fill_value)
        self.encoders['ohe'][col_name] = OneHotEncoder(
            sparse=False,
            categories=[categories]
        )
        return self.encoders['ohe'][col_name].fit_transform(
            df[col_name].values.reshape(-1, 1))

    def _transform_all_categorical(self, df: pd.DataFrame) -> pd.DataFrame:
        """One-hot encodes the categories for all categorical variables."""
        for col_name in self.columns['cat']:
            ohe_var = pd.DataFrame(
                self.encoders['ohe'][col_name].transform(
                    df[col_name].values.reshape(-1, 1)),
                columns=self.encoders['ohe'][col_name].columns_)
            df = pd.concat((df.drop(col_name, axis=1), ohe_var), axis=1)
        return df

    def _set_x_names(self, df: pd.DataFrame):
        """Gets column names for features."""
        self.x_names = df.columns.tolist()
        self.x_names.remove(self.y_name)
    
    def _set_col_i(self):
        """Assign index to each column."""
        self.col_i = {k: v for k, v in zip(self.x_names,
                                           range(len(self.x_names)))}

    def _fit_transform_y(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fits a LabelBinarizer to y column, then transforms it. Uses
            LabelBinarizer rather than OneHotEncoder to ovoid multiple-column
            encoding for a binary label."""
        self.encoders['lb'] = {self.y_name: LabelBinarizer()}
        df[self.y_name] = self.encoders['lb'][self.y_name].fit_transform(
            df[self.y_name].values)
        return df

    def _transform_y(self, df: pd.DataFrame) -> pd.DataFrame:
        """Binarizes y column."""
        df[self.y_name] = self.encoders['lb'][self.y_name].transform(
            df[self.y_name].values)
        return df

In [93]:
DATA = {'age': '70', 'workclass': 'Federal_gov', 'education': '7th-8th',
 'education-num': '4', 'marital_status': 'Married-spouse-absent',
 'occupation': 'Other-service', 'relationship': 'Other-relative',
 'race': 'Amer-Indian-Eskimo', 'sex': 'Male', 'capital_gain': '1000',
 'capital_loss': '0', 'hours_per_week': '23', 'native_country': ' Puerto-Rico',
 'label': '<=50K'  # dummy label, not used in prediction
}

In [94]:
pickle_in = open('my_dict.pkl', 'rb')
my_dict = pickle.load(pickle_in)

In [95]:
X = shared(my_dict['x_test'])
pp = my_dict['pp']
blr = my_dict['blr']

In [96]:
def convert_input_to_dataframe(user_input: dict) -> pd.DataFrame:
    '''Converts users' input ready for input to preprocessor. We put the dict
        in a list so that pandas makes a dataframe with 1 row, rather than a
        series.

    Args:
        user_input: Values inputted by user into web form.

    Return:
        Converted user input.
    '''
    return pd.DataFrame.from_dict([user_input])

In [97]:
# my_dict['pp'].__dict__

In [98]:
new_input = convert_input_to_dataframe(DATA)

x_test, y_test = pp.transform(new_input)  # new_input is DataFrame with 1 row

X.set_value(x_test)

ppc = pm.sample_ppc(my_dict['trace_blr'], samples=1000, model=my_dict['blr'], vars=[p])

KeyError: 'capital-gain'

In [None]:
predictions = ppc['p'].flatten()
predictions