# Programming Problem


### Overview
One of the key features of AutoML is finding the best predictive model for users’ data, making
lots of data science decisions automatically along the way.

Imagine you are adding a new binary classification model, written in Python, to the AutoML
repository of algorithms.

Choose one of the following models:
- Regularized Logistic Regression (scikit-learn)
- Gradient Boosting Machine (any: scikit-learn, XGBoost or LightGBM)
- Neural Network (Keras), with the architecture of your choice

Write a Python class, using principles of object-oriented design, to wrap the appropriate
estimator with the following functionality.

**Note** that all preprocessing should be done inside of this class.
You can delegate preprocessing to other helper classes if you’d like to, but the user
should be able to supply raw, unencoded features to all methods above.  

### Dataset
It contains a binary classification target for predicting loan defaults: https://drive.google.com/file/d/1rZaXyPd03GJ_xlLP39YGhCD7FdVkhh2o/view?usp=sharing

### Unit Tests
Please write the unit tests to check whether your model:
- is reproducible
- can handle missing values
- can handle new category levels at prediction time
- returns results in the expected format
- other useful unit tests you may think of (if time allows)

### Notes
Assume the data can have numeric and categorical variables. Both your training and prediction functions will take in a two-dimensional pandas DataFrame with a mixture of categorical and numeric variables.

Please adhere to [PEP-8](https://www.python.org/dev/peps/pep-0008) guidelines and follow standard OOP practices.

In [1]:
import numpy as np
import pandas as pd

from typing import (
    Any,
    Dict,
    List,
    Set,
    Tuple
)

In [2]:
class Estimator:
    """Base class for all estimators.

    Declares the common interface (``fit``, ``predict``, ``predict_proba``,
    ``evaluate``, ``tune_parameters``). The implementations here are
    placeholders returning empty or zeroed results; concrete estimators
    override them.
    """

    def fit(self, X: pd.DataFrame, y: np.ndarray) -> None:
        """Fits on training data.

        Parameters
        ----------
        X : pd.DataFrame
            Input features
        y : np.ndarray
            Ground truth labels as a numpy array of 0-s and 1-s.


        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> y = np.array([0, 0, 1])
        >>> self.fit(X, y)
        """

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predicts class labels on new data.

        Parameters
        ----------
        X : pd.DataFrame
            Input features


        Return
        ------
        np.ndarray
            Predicted class labels. The base implementation returns an
            empty array.


        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> self.predict(X)
        np.array([0, 0, 1])
        """
        # np.array() without an argument raises TypeError, so pass an
        # explicit empty sequence.
        return np.array([])

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Predicts the probability of each label.

        Parameters
        ----------
        X : pd.DataFrame
            Input features


        Return
        ------
        np.ndarray
            Predicted probability of each label. The base implementation
            returns an empty array.


        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> self.predict_proba(X)
        np.array([[0.2, 0.8], [0.9, 0.1], [0.5, 0.5]])
        """
        return np.array([])

    def evaluate(self, X: pd.DataFrame, y: np.ndarray) -> Dict[str, float]:
        """Evaluates "under the hood" model.

        Providing features X and Ground truth labels gets
        the value of the following metrics:
            1. `F1-score <https://en.wikipedia.org/wiki/F1_score>`_
            2. `LogLoss <https://en.wikipedia.org/wiki/Loss_functions_for_classification#Logistic_loss>`_

        Parameters
        ----------
        X : pd.DataFrame
            Input features
        y : np.ndarray
            Ground truth labels as a numpy array of 0-s and 1-s.

        Return
        ------
        Dict[str, float]
            Metric values keyed by ``'f1_score'`` and ``'logloss'``. The base
            implementation returns zeros for both.

        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> y = np.array([0, 0, 1])
        >>> self.evaluate(X, y)
        {'f1_score': 0.3, 'logloss': 0.7}
        """
        return {'f1_score': 0.0, 'logloss': 0.0}

    def tune_parameters(self, X: pd.DataFrame, y: np.ndarray) -> Dict[str, Dict]:
        """Tunes parameters of "under the hood" model.

        Finds the best hyperparameters using K-Fold cross-validation for evaluation.
        The user is not required to provide a parameter search space. This Estimator
        picks a search space on its own.

        Parameters
        ----------
        X : pd.DataFrame
            Input features
        y : np.ndarray
            Ground truth labels as a numpy array of 0-s and 1-s.

        Return
        ------
        Dict[str, Dict]
            The best parameters (``'best_parameters'``) and the mean CV
            scores they achieve (``'best_scores'``). The base implementation
            returns no parameters and zeroed scores.

        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> y = np.array([0, 0, 1])
        >>> self.tune_parameters(X, y)
        {
            'best_parameters': {'C': 1.0, 'fit_intercept': False},
            'best_scores': {'f1_score': 0.3, 'logloss': 0.7},
        }
        """
        return {'best_parameters': {}, 'best_scores': {'f1_score': 0.0, 'logloss': 0.0}}


In [3]:
import copy

from dataclasses import dataclass, field, replace

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


@dataclass
class FeatureStats:
    """Summary statistics describing a single feature (column)."""

    name: str
    index: int
    unique_num: int = 0
    missed_num: int = 0

    def reindex(self, index):
        """Return a copy of these stats with ``index`` replaced.

        The instance itself is left untouched.
        """
        moved = replace(self, index=index)
        return moved

    def __str__(self):
        """Render every field as ``field=value``, one field per line."""
        labelled = (
            ('name', self.name),
            ('index', self.index),
            ('unique_num', self.unique_num),
            ('missed_num', self.missed_num),
        )
        return '\n'.join(f'{label}={value}' for label, value in labelled)
    
    
@dataclass
class ExploratoryReport:
    """Summary of a data set's main characteristics.

    ``numeric_features_idx`` and ``categor_features_idx`` are name -> stats
    lookups derived from the two feature lists. They are not constructor
    arguments and take no part in comparison or ``repr``.
    """

    objects_num: int = 0
    numeric_features: List[FeatureStats] = field(default_factory=list)
    categor_features: List[FeatureStats] = field(default_factory=list)
    numeric_features_idx: Dict[str, FeatureStats] = field(
        init=False, compare=False, repr=False, hash=None, default_factory=dict)
    categor_features_idx: Dict[str, FeatureStats] = field(
        init=False, compare=False, repr=False, hash=None, default_factory=dict)

    def __post_init__(self):
        """Build the by-name lookups from the feature lists."""
        by_name_numeric = {}
        for stats in self.numeric_features:
            by_name_numeric[stats.name] = stats
        self.numeric_features_idx = by_name_numeric

        by_name_categor = {}
        for stats in self.categor_features:
            by_name_categor[stats.name] = stats
        self.categor_features_idx = by_name_categor

    def __str__(self):
        """Render the object count and both feature lists, one per line."""
        labelled = (
            ('objects_num', self.objects_num),
            ('numeric_features', self.numeric_features),
            ('categor_features', self.categor_features),
        )
        return '\n'.join(f'{label}={value}' for label, value in labelled)
    
    
class ExploratoryDataAnalyst:
    """Base class for different approaches to analyzing data sets to summarize
       their main characteristics
    """

    @staticmethod
    def _collect_stats(X: pd.DataFrame, columns) -> List[FeatureStats]:
        """Builds one FeatureStats per column name in ``columns``."""
        stats = []
        for name in columns:
            values = X[name]
            stats.append(FeatureStats(
                name=name,
                index=X.columns.get_loc(name),
                # Series.unique() keeps NaN as a value, so a column with
                # missing entries counts NaN as one more unique value.
                unique_num=len(values.unique()),
                # .sum() yields a numpy integer; cast so the field really
                # holds the declared built-in int.
                missed_num=int(values.isna().sum())
            ))
        return stats

    def analyze(self, X: pd.DataFrame, y: np.ndarray) -> ExploratoryReport:
        """Analyzes provided dataset.

        Columns selected by dtype ``number`` are reported as numeric
        features, columns of dtype ``object`` or ``bool`` as categorical
        ones. Columns of any other dtype do not appear in the report.

        Parameters
        ----------
        X: pd.DataFrame
            A collection of objects (objects-features matrix)
        y: np.ndarray
            Target variables (ground truth labels). Not used by this
            implementation.


        Return
        ------
        ExploratoryReport
            Number of rows in ``X`` plus per-feature statistics (position in
            ``X``, number of unique values, number of missing values).


        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> y = np.array([0, 0, 1])
        >>> report = ExploratoryDataAnalyst().analyze(X, y)
        """

        numeric_cols = X.select_dtypes(include=['number']).columns
        categor_cols = X.select_dtypes(include=['object', 'bool']).columns

        return ExploratoryReport(
            objects_num=len(X),
            numeric_features=self._collect_stats(X, numeric_cols),
            categor_features=self._collect_stats(X, categor_cols)
        )
        

class PreprocessingExpert(BaseEstimator, TransformerMixin):
    """Plans and applies a strategy of data preprocessing.

    The planned strategy is a two-step ``Pipeline``: missing values are
    imputed first, then numeric features are scaled and categorical features
    are one-hot encoded.
    """

    def __init__(self,
                 correct_misses_numerics_strategy: str = 'mean',
                 correct_misses_category_strategy: str = 'most_frequent',
                 transform_feat_numerics_strategy: str = 'standardize',
                 transform_feat_category_strategy: str = 'ohe'):
        """Creates auto-preprocessing estimator and transformer which makes preprocessing decisions on its own.

        A falsy value (e.g. ``None``) for any parameter selects its default.

        Parameters
        ----------
        correct_misses_numerics_strategy : one from {'mean', 'median'}, default 'mean'.
            Imputation strategy for missing numeric values.

        correct_misses_category_strategy : one from {'most_frequent'}, default 'most_frequent'.
            Imputation strategy for missing categorical values.

        transform_feat_numerics_strategy : one from {'standardize', 'minmax'}, default 'standardize'.
            Scaling applied to numeric features.

        transform_feat_category_strategy : one from {'ohe'}, default 'ohe'.
            Encoding applied to categorical features. One-hot encoding is
            the only implemented option, so the value is currently not
            consulted when the pipeline is planned.
        """

        # this default parameters map can be changed to reflect input data characteristics after
        # fit method is called
        self._possible_strategies = {
            'correct_misses': {
                'numerics_strategy': ['mean', 'median'],
                'category_strategy': ['most_frequent']
            },
            'transform_feat': {
                'numerics_strategy': ['standardize', 'minmax'],
                'category_strategy': ['ohe']
            }
        }

        self.correct_misses_numerics_strategy = correct_misses_numerics_strategy or 'mean'
        self.correct_misses_category_strategy = correct_misses_category_strategy or 'most_frequent'
        self.transform_feat_numerics_strategy = transform_feat_numerics_strategy or 'standardize'
        self.transform_feat_category_strategy = transform_feat_category_strategy or 'ohe'

    def fit(self, X, y=None, data_report: ExploratoryReport = None):
        """Plans the preprocessing pipeline for ``X`` and fits it.

        Parameters
        ----------
        X : pd.DataFrame
            Raw input features
        y : np.ndarray, optional
            Target values, forwarded to the analyst and to the pipeline
        data_report : ExploratoryReport, optional
            Precomputed description of ``X``. When omitted, ``X`` is analyzed
            with ``ExploratoryDataAnalyst``.

        Return
        ------
        PreprocessingExpert
            This instance
        """
        if data_report is None:
            data_report = ExploratoryDataAnalyst().analyze(X, y)

        self._preprocessing_strategy = self.plan(data_report)
        self._preprocessing_strategy.fit(X, y)

        return self

    def transform(self, X):
        """Applies the fitted preprocessing pipeline to ``X``.

        ``fit`` must have been called before; the pipeline does not exist
        until then.
        """
        return self._preprocessing_strategy.transform(X)

    def get_possible_strategies(self):
        """Returns a deep copy of the map of supported strategies.

        The copy protects the internal map from modification by callers.
        """
        return copy.deepcopy(self._possible_strategies)

    def plan(self, data_report: ExploratoryReport) -> Pipeline:
        """Plans a strategy of dataset preprocessing based on its data report.

        Parameters
        ----------
        data_report : ExploratoryReport
            Dataset description with different statistics


        Return
        ------
        Pipeline
            The suggested strategy represented as Pipeline


        Examples
        --------
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> y = np.array([0, 0, 1])
        >>> dr = ExploratoryDataAnalyst().analyze(X, y)
        >>> self.plan(data_report=dr)
        """

        correct_misses_strategy, adj_report1 = self.plan_missing_values_strategy(data_report=data_report)
        # The second step works on the output of the first one, so it must
        # be planned against the adjusted (re-indexed) report.
        transform_feat_strategy, _ = self.plan_transform_values_strategy(data_report=adj_report1)

        return Pipeline(steps=[
            ('correct_misses', correct_misses_strategy),
            ('transform_feat', transform_feat_strategy)
        ])

    def plan_missing_values_strategy(
            self, data_report: ExploratoryReport) -> Tuple[ColumnTransformer, ExploratoryReport]:
        """This method owns knowledge about how to deal with missing values.

        Plans a strategy of dealing with missing values for dataset based on its data report.
        It tries to guess what are the possible strategies and what is the best strategy
        to apply to such dataset. Different strategies will lead to different output shape,
        so planning also includes adjusting the input data report to correctly handle output
        data in further preprocessing.

        Strategies depend on data statistics, selected model, etc. and can be various:
           - remove objects
           - remove feature
           - remove feature if collinear with other features
           - impute mean, mode
           - impute using KNN
          ...

        The current implementation always imputes, using the strategies
        configured in the constructor.


        Parameters
        ----------
        data_report : ExploratoryReport
            Dataset description with different statistics


        Return
        ------
        Tuple[ColumnTransformer, ExploratoryReport]
            the first tuple component is the suggested strategy
            the second tuple component is the adjusted data report for future use
        """

        numeric_cols = [f.index for f in data_report.numeric_features]
        categor_cols = [f.index for f in data_report.categor_features]

        nums_sz = len(numeric_cols)

        num_nan_miss_vals_imputer = SimpleImputer(
            missing_values=np.nan, strategy=self.correct_misses_numerics_strategy)
        cat_nan_miss_vals_imputer = SimpleImputer(
            missing_values=np.nan, strategy=self.correct_misses_category_strategy)

        # The transformer below emits the numeric block first and the
        # categorical block second, so features are renumbered accordingly:
        # numerics take 0..nums_sz-1, categoricals continue from nums_sz.
        adjusted_report = replace(
            data_report,
            numeric_features=[f.reindex(index=i)
                              for i, f in enumerate(data_report.numeric_features)],
            categor_features=[f.reindex(index=i)
                              for i, f in enumerate(data_report.categor_features, start=nums_sz)]
        )

        ct = ColumnTransformer([
            # these transformers do not change the input shape
            # TODO: open question is what to do if the input shape changed and we
            #       can not rely on data_report in next transformers
            ('num_nan_miss_vals_imputer', num_nan_miss_vals_imputer, numeric_cols),
            ('cat_nan_miss_vals_imputer', cat_nan_miss_vals_imputer, categor_cols)
        ])

        return ct, adjusted_report

    @staticmethod
    def _make_one_hot_encoder() -> OneHotEncoder:
        """Creates a dense-output one-hot encoder that ignores unknown categories."""
        try:
            # Newer scikit-learn releases name the dense-output switch
            # ``sparse_output``.
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older releases only know the ``sparse`` keyword.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def plan_transform_values_strategy(
            self, data_report: ExploratoryReport) -> Tuple[ColumnTransformer, None]:
        """This method owns knowledge about how to transform different features.

        Strategies depend on data statistics and can be various for different feature
        scales - absolute aka numeric, interval, ordinal, nominal aka categorical - and
        admissible set of operations defined for each scale.

        Approaches can vary depending on selected model, data sparsity, number of outliers,
        correlation and others.

        The current implementation scales numeric features according to
        ``transform_feat_numerics_strategy`` and one-hot encodes categorical
        features.


        Parameters
        ----------
        data_report : ExploratoryReport
            Dataset description with different statistics


        Return
        ------
        Tuple[ColumnTransformer, None]
            the first tuple component is the suggested strategy
            the second tuple component is always None: one-hot encoding
            changes the number of columns, so no adjusted report is produced

        Raises
        ------
        ValueError
            If ``transform_feat_numerics_strategy`` is neither 'standardize'
            nor 'minmax'.
        """

        numeric_cols = [f.index for f in data_report.numeric_features]
        categor_cols = [f.index for f in data_report.categor_features]

        if self.transform_feat_numerics_strategy == 'standardize':
            numerics_step = ('num_transform_standardize', StandardScaler(), numeric_cols)
        elif self.transform_feat_numerics_strategy == 'minmax':
            numerics_step = ('num_transform_minmax', MinMaxScaler(), numeric_cols)
        else:
            # An unrecognised value used to fall through to min-max scaling
            # silently; fail loudly instead.
            raise ValueError(
                'Unknown transform_feat_numerics_strategy: '
                f'{self.transform_feat_numerics_strategy!r}; '
                "expected 'standardize' or 'minmax'"
            )

        ct = ColumnTransformer([
            numerics_step,

            # this transformer will change the input shape, so further
            # one can not use categor_cols from data_report
            ('cat_transform_oneh_encode', self._make_one_hot_encoder(), categor_cols)
        ])

        return ct, None
    

In [4]:
from sklearn.metrics import f1_score, log_loss
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import f1_score, make_scorer

    
class LogRegClassifier(Estimator):
    """An automated version of Regularized Logistic Regression based on
       scikit-learn LogisticRegression classifier.

    Raw, unencoded features are accepted everywhere: preprocessing is the
    first step of the wrapped pipeline.
    """

    def __init__(self, random_state: int = None):
        """Basic constructor.

        Parameters
        ----------
        random_state : int, default=None
            Random seed to force reproducibility while testing


        Examples
        --------
        >>> cls = LogRegClassifier(random_state=242)
        >>> X = pd.DataFrame({'feat1': ['a', 'b', 'a'], 'feat2': [1, 2, 3]})
        >>> y = np.array([0, 0, 1])
        >>> cls.fit(X,y)
        """
        self._clf_pipeline = None
        self._expected_objects_features = None
        self._rseed = random_state
        self._epsilon = 1e-14

    def _assert_objects_features(self, X: pd.DataFrame):
        """Checks that ``X`` has exactly the columns seen during training.

        Raises
        ------
        ValueError
            If the estimator has not been trained yet, or if the columns of
            ``X`` differ (in names or order) from the training columns.
        """
        if self._expected_objects_features is None:
            raise ValueError(
                'Estimator is not fitted yet: call fit or tune_parameters first')
        if list(X.columns) != self._expected_objects_features:
            # The expected list is formatted into the message; concatenating
            # it to a str directly would raise TypeError.
            raise ValueError(
                f'Input features do not match expected list {self._expected_objects_features}')

    def _assert_is_binary_target(self, y: np.ndarray):
        """Checks that ``y`` contains exactly two distinct values.

        Raises
        ------
        ValueError
            If the number of distinct values in ``y`` is not two.
        """
        if len(np.unique(y)) != 2:
            raise ValueError('Only Binary classification is supported')

    def _create_base_model(self):
        """Creates the unfitted L2-regularized logistic regression model."""
        return LogisticRegression(
            penalty='l2',
            dual=False,
            tol=1e-4,
            C=1.0,
            solver='lbfgs',
            max_iter=1000,
            n_jobs=-1,
            random_state=self._rseed
        )

    def _create_default_pipeline(self):
        """Creates the unfitted pipeline: preprocessing followed by the model."""
        return Pipeline(steps=[
            ('preprocessings', PreprocessingExpert()),
            ('logit_classify', self._create_base_model())
        ])

    def fit(self, X: pd.DataFrame, y: np.ndarray) -> None:
        """Fits the default pipeline on raw training data.

        The columns of ``X`` are remembered and later enforced on the inputs
        of ``predict``, ``predict_proba`` and ``evaluate``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct values.
        """
        self._assert_is_binary_target(y)

        self._expected_objects_features = list(X.columns)

        data_report = ExploratoryDataAnalyst().analyze(X, y)
        self._clf_pipeline = self._create_default_pipeline()

        # Hand the precomputed report to the preprocessing step so it does
        # not have to analyze X again.
        self._clf_pipeline.fit(X, y, preprocessings__data_report=data_report)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predicts class labels for raw input features.

        Raises
        ------
        ValueError
            If the estimator is not trained or the columns of ``X`` do not
            match the training columns.
        """
        self._assert_objects_features(X)
        return self._clf_pipeline.predict(X)

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """Predicts the probability of each label for raw input features.

        Raises
        ------
        ValueError
            If the estimator is not trained or the columns of ``X`` do not
            match the training columns.
        """
        self._assert_objects_features(X)
        return self._clf_pipeline.predict_proba(X)

    def evaluate(self, X: pd.DataFrame, y_true: np.ndarray) -> Dict[str, float]:
        """Computes F1-score and log loss of the trained model on ``X``.

        Return
        ------
        Dict[str, float]
            Metric values keyed by ``'f1_score'`` and ``'logloss'``.

        Raises
        ------
        ValueError
            If the estimator is not trained, the columns of ``X`` do not
            match the training columns, or ``y_true`` does not contain
            exactly two distinct values.
        """
        self._assert_objects_features(X)
        self._assert_is_binary_target(y_true)

        y_pred = self._clf_pipeline.predict(X)
        y_pred_proba = self._clf_pipeline.predict_proba(X)

        # NOTE(review): the ``eps`` argument of log_loss appears to be
        # deprecated in recent scikit-learn releases - confirm against the
        # version this project pins.
        return {
            'f1_score': f1_score(y_true=y_true, y_pred=y_pred, average='binary'),
            'logloss': log_loss(y_true=y_true, y_pred=y_pred_proba, eps=self._epsilon)
        }

    def tune_parameters(self, X: pd.DataFrame, y: np.ndarray) -> Dict[str, Dict]:
        """Grid-searches preprocessing strategies and the regularization strength.

        Uses a shuffled 10-fold cross-validation. The candidate with the
        best mean ROC AUC is refitted on all of ``X`` and becomes the model
        used by ``predict``, ``predict_proba`` and ``evaluate``.

        Return
        ------
        Dict[str, Dict]
            ``'best_parameters'`` holds the winning parameter combination,
            ``'best_scores'`` its mean cross-validated F1-score and log loss.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct values.
        """
        self._assert_is_binary_target(y)

        self._expected_objects_features = list(X.columns)

        # The map of supported strategies is populated in the constructor,
        # so no fit is needed just to read it.
        possible_strategies = PreprocessingExpert().get_possible_strategies()
        correct_misses = possible_strategies['correct_misses']
        transform_feat = possible_strategies['transform_feat']

        param_grid = [{
            'preprocessings__correct_misses_numerics_strategy': correct_misses['numerics_strategy'],
            'preprocessings__correct_misses_category_strategy': correct_misses['category_strategy'],
            'preprocessings__transform_feat_numerics_strategy': transform_feat['numerics_strategy'],
            'preprocessings__transform_feat_category_strategy': transform_feat['category_strategy'],
            'logit_classify__C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 5, 10, 50, 100, 1000]
        }]

        # n_splits should be adaptive
        kf = KFold(n_splits=10, shuffle=True, random_state=self._rseed)

        # NOTE(review): ``needs_proba`` (make_scorer) and ``eps`` (log_loss)
        # appear to be deprecated in recent scikit-learn releases - confirm
        # against the version this project pins.
        scoring = {
            'AUC': 'roc_auc',
            'f1': make_scorer(f1_score, needs_proba=False, average='binary'),
            'logloss': make_scorer(log_loss, needs_proba=True, eps=self._epsilon)
        }

        self._clf_pipeline = self._create_default_pipeline()

        gs = GridSearchCV(
            self._clf_pipeline,
            param_grid,
            cv=kf,
            n_jobs=-1,
            verbose=1,
            scoring=scoring,
            refit='AUC',
            return_train_score=True
        )

        gs.fit(X, y)
        self._clf_pipeline = gs.best_estimator_

        return {
            'best_parameters': gs.best_params_,
            'best_scores': {
                'f1_score': gs.cv_results_['mean_test_f1'][gs.best_index_],
                'logloss': gs.cv_results_['mean_test_logloss'][gs.best_index_]
            }
        }

# Testing

In [34]:
import pathlib
import ipytest
import pytest

from unittest.mock import patch
from numpy.testing import assert_array_equal
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer


# Configure ipytest so the tests below can be run from inside the notebook.
ipytest.config(rewrite_asserts=True, magics=True)
__file__ = 'experiments.ipynb'


# Load the dataset from the current working directory and split it into the
# feature frame and the 'is_bad' target; the combined frame is dropped once
# both parts exist.
lending_club = pd.read_csv(pathlib.Path.cwd().joinpath('Lending_Club_reduced.csv'))
lending_club_x = lending_club.drop(columns=['is_bad'])
lending_club_y = lending_club['is_bad'].to_numpy()
del lending_club


def assert_raise_message(exception, message, function, *args, **kwargs):
    """Asserts that a call raises ``exception`` with exactly ``message``.

    Parameters
    ----------
    exception : exception class or tuple of exception classes
        Exception type(s) the call is expected to raise.
    message : str
        Expected result of ``str()`` on the raised exception.
    function : callable
        Callable under test, invoked as ``function(*args, **kwargs)``.

    Raises
    ------
    AssertionError
        If the call raises nothing, or raises ``exception`` with a different
        message. Exceptions of other types propagate unchanged.
    """
    try:
        function(*args, **kwargs)
    except exception as e:
        error_message = str(e)
        if message != error_message:
            raise AssertionError(
                f"Error message does not equal the expected string: {message}. "
                f"Observed error message: {error_message}")
    else:
        # Reaching this branch means nothing was raised, which must fail the
        # assertion rather than pass silently.
        if isinstance(exception, tuple):
            names = ' or '.join(exc.__name__ for exc in exception)
        else:
            names = exception.__name__
        function_name = getattr(function, '__name__', repr(function))
        raise AssertionError(f"{names} not raised by {function_name}")


@pytest.fixture
def exploratory_test_report():
    """Small synthetic report: two numeric features followed by two categorical ones.

    Each feature's unique and missing counts are multiples (42 and 66) of its
    position in the report.
    """
    numeric_features = []
    for position in range(2):
        numeric_features.append(FeatureStats(
            name=f'num_test{position}',
            index=position,
            unique_num=42 * position,
            missed_num=66 * position))

    # Categorical features are numbered right after the numeric ones.
    offset = len(numeric_features)
    categor_features = []
    for ordinal in range(2):
        position = offset + ordinal
        categor_features.append(FeatureStats(
            name=f'cat_test{ordinal}',
            index=position,
            unique_num=42 * position,
            missed_num=66 * position))

    return ExploratoryReport(
        objects_num=11,
        numeric_features=numeric_features,
        categor_features=categor_features
    )


@pytest.fixture
def lending_club_data():
    """Lending Club features and target as an ``(X, y)`` pair."""
    features, target = lending_club_x, lending_club_y
    return (features, target)


@pytest.fixture
def lending_club_numeric_features():
    """Expected statistics of the numeric Lending Club columns."""
    # (name, index, unique_num, missed_num)
    rows = (
        ('Id', 0, 10000, 0),
        ('annual_inc', 3, 1902, 1),
        ('debt_to_income', 9, 2585, 0),
        ('delinq_2yrs', 10, 11, 5),
        ('inq_last_6mths', 11, 21, 5),
        ('mths_since_last_delinq', 12, 92, 6316),
        ('mths_since_last_record', 13, 95, 9160),
        ('open_acc', 14, 37, 5),
        ('pub_rec', 15, 5, 5),
        ('revol_bal', 16, 8130, 0),
        ('revol_util', 17, 1028, 26),
        ('total_acc', 18, 76, 5),
        ('collections_12_mths_ex_med', 20, 2, 32),
        ('mths_since_last_major_derog', 21, 3, 0),
    )
    return [
        FeatureStats(name=name, index=index, unique_num=unique_num, missed_num=missed_num)
        for name, index, unique_num, missed_num in rows
    ]
    

@pytest.fixture
def lending_club_categor_features():
    """Expected statistics of the categorical Lending Club columns."""
    # (name, index, unique_num); no categorical column has missing values
    rows = (
        ('emp_length', 1, 14),
        ('home_ownership', 2, 5),
        ('verification_status', 4, 3),
        ('pymnt_plan', 5, 2),
        ('purpose_cat', 6, 27),
        ('zip_code', 7, 720),
        ('addr_state', 8, 50),
        ('initial_list_status', 19, 2),
        ('policy_code', 22, 5),
    )
    return [
        FeatureStats(name=name, index=index, unique_num=unique_num, missed_num=0)
        for name, index, unique_num in rows
    ]
    

@pytest.fixture
def adjusted_data_report_after_correct_misses_step():
    """Expected Lending Club report with features renumbered consecutively.

    Numeric features occupy indices 0-13 and categorical features 14-22.
    """
    # (name, unique_num, missed_num), in report order
    numeric_rows = (
        ('Id', 10000, 0),
        ('annual_inc', 1902, 1),
        ('debt_to_income', 2585, 0),
        ('delinq_2yrs', 11, 5),
        ('inq_last_6mths', 21, 5),
        ('mths_since_last_delinq', 92, 6316),
        ('mths_since_last_record', 95, 9160),
        ('open_acc', 37, 5),
        ('pub_rec', 5, 5),
        ('revol_bal', 8130, 0),
        ('revol_util', 1028, 26),
        ('total_acc', 76, 5),
        ('collections_12_mths_ex_med', 2, 32),
        ('mths_since_last_major_derog', 3, 0),
    )
    categor_rows = (
        ('emp_length', 14, 0),
        ('home_ownership', 5, 0),
        ('verification_status', 3, 0),
        ('pymnt_plan', 2, 0),
        ('purpose_cat', 27, 0),
        ('zip_code', 720, 0),
        ('addr_state', 50, 0),
        ('initial_list_status', 2, 0),
        ('policy_code', 5, 0),
    )

    numeric_features = [
        FeatureStats(name=name, index=position, unique_num=unique_num, missed_num=missed_num)
        for position, (name, unique_num, missed_num) in enumerate(numeric_rows)
    ]
    # Categorical indices continue where the numeric ones stop.
    categor_features = [
        FeatureStats(name=name, index=position, unique_num=unique_num, missed_num=missed_num)
        for position, (name, unique_num, missed_num)
        in enumerate(categor_rows, start=len(numeric_rows))
    ]

    return ExploratoryReport(
        objects_num=10000,
        numeric_features=numeric_features,
        categor_features=categor_features
    )


# Parametrize marks covering each PreprocessingExpert strategy argument,
# including None as a candidate value.
preprocessing_expert_cm_numerics_strategy_params = pytest.mark.parametrize(
    'cm_numerics_strategy', [None, 'mean', 'median'])
preprocessing_expert_cm_category_strategy_params = pytest.mark.parametrize(
    'cm_category_strategy', [None, 'most_frequent'])
preprocessing_expert_tf_numerics_strategy_params = pytest.mark.parametrize(
    'tf_numerics_strategy', [None, 'standardize', 'minmax'])
preprocessing_expert_tf_category_strategy_params = pytest.mark.parametrize(
    'tf_category_strategy', [None, 'ohe'])

# The same marks restricted to explicitly named strategies (no None).
preprocessing_expert_cm_numerics_strategy_params_no_none = pytest.mark.parametrize(
    'cm_numerics_strategy', ['mean', 'median'])
preprocessing_expert_cm_category_strategy_params_no_none = pytest.mark.parametrize(
    'cm_category_strategy', ['most_frequent'])
preprocessing_expert_tf_numerics_strategy_params_no_none = pytest.mark.parametrize(
    'tf_numerics_strategy', ['standardize', 'minmax'])
preprocessing_expert_tf_category_strategy_params_no_none = pytest.mark.parametrize(
    'tf_category_strategy', ['ohe'])


@pytest.fixture
def default_preprocessing_expert_strategies():
    """Strategy map expected from a PreprocessingExpert by default."""
    correct_misses = {
        'numerics_strategy': ['mean', 'median'],
        'category_strategy': ['most_frequent'],
    }
    transform_feat = {
        'numerics_strategy': ['standardize', 'minmax'],
        'category_strategy': ['ohe'],
    }
    return {
        'correct_misses': correct_misses,
        'transform_feat': transform_feat,
    }


@pytest.fixture
def random_state():
    """Fixed random seed shared by the tests."""
    seed = 242
    return seed


In [112]:
import itertools

# Scratch cell: for every combination of preprocessing strategies, run the
# individual scikit-learn transformers by hand and store the resulting matrix
# as a compressed .npz file in the current working directory.
data_report = ExploratoryDataAnalyst().analyze(lending_club_x, lending_club_y)

# Feature names come from the analysis report: pytest fixtures are not meant
# to be called as plain functions.
fn = [f.name for f in data_report.numeric_features]
fc = [f.name for f in data_report.categor_features]

strategy_combinations = itertools.product(
    ['mean', 'median'],         # cm_numerics_strategy
    ['most_frequent'],          # cm_category_strategy
    ['standardize', 'minmax'],  # tf_numerics_strategy
    ['ohe'],                    # tf_category_strategy
)

for (cm_numerics_strategy, cm_category_strategy,
     tf_numerics_strategy, tf_category_strategy) in strategy_combinations:

    t = SimpleImputer(missing_values=np.nan, strategy=cm_numerics_strategy)
    r = t.fit_transform(lending_club_x[fn], lending_club_y)

    t2 = SimpleImputer(missing_values=np.nan, strategy=cm_category_strategy)
    r2 = t2.fit_transform(lending_club_x[fc], lending_club_y)

    try:
        # Newer scikit-learn releases name the dense-output switch
        # ``sparse_output``.
        t3 = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only know the ``sparse`` keyword.
        t3 = OneHotEncoder(sparse=False, handle_unknown='ignore')
    r3 = t3.fit_transform(r2, lending_club_y)

    if tf_numerics_strategy == 'standardize':
        t4 = StandardScaler()
    else:
        t4 = MinMaxScaler()

    r4 = t4.fit_transform(r, lending_club_y)

    # Scaled numeric block first, one-hot encoded categorical block second.
    np.savez_compressed(f'cm__{cm_numerics_strategy}-{cm_category_strategy}__ft__{tf_numerics_strategy}-{tf_category_strategy}.npz', np.hstack((r4, r3)))

In [35]:
def dict_merge(a, b):
    """Merge ``b`` into ``a`` in place and return ``a``."""
    a.update(b)
    return a


def test_feature_stats():
    """FeatureStats exposes its constructor values and renders them one per line."""
    expected_fields = {
        'name': 'test',
        'index': 21,
        'unique_num': 42,
        'missed_num': 66,
    }
    stats = FeatureStats(**expected_fields)

    # Same attribute order as the keyword arguments above.
    for field_name, field_value in expected_fields.items():
        assert getattr(stats, field_name) == field_value

    assert str(stats) == 'name=test\nindex=21\nunique_num=42\nmissed_num=66'
            

def test_feature_stats_reindex():
    """reindex() returns stats that differ from the source and carry the new index."""
    original = FeatureStats(name='test', index=21, unique_num=42, missed_num=66)
    moved = original.reindex(index=33)

    assert original != moved
    assert moved.index == 33
    

def test_exploratory_report(exploratory_test_report):
    """The report fixture exposes its feature lists, name indexes and text form."""
    report = exploratory_test_report

    expected_numeric = [
        FeatureStats(name='num_test0', index=0, unique_num=0, missed_num=0),
        FeatureStats(name='num_test1', index=1, unique_num=42, missed_num=66),
    ]
    expected_categor = [
        FeatureStats(name='cat_test0', index=2, unique_num=84, missed_num=132),
        FeatureStats(name='cat_test1', index=3, unique_num=126, missed_num=198),
    ]

    # Name -> stats lookups built from the lists above.
    expected_numeric_idx = dict((stats.name, stats) for stats in expected_numeric)
    expected_categor_idx = dict((stats.name, stats) for stats in expected_categor)

    expected_text = (
        "objects_num=11\n"
        "numeric_features=[FeatureStats(name='num_test0', index=0, unique_num=0, missed_num=0), "
        "FeatureStats(name='num_test1', index=1, unique_num=42, missed_num=66)]\n"
        "categor_features=[FeatureStats(name='cat_test0', index=2, unique_num=84, missed_num=132), "
        "FeatureStats(name='cat_test1', index=3, unique_num=126, missed_num=198)]"
    )

    assert report.objects_num == 11
    assert report.numeric_features == expected_numeric
    assert report.categor_features == expected_categor
    assert report.numeric_features_idx == expected_numeric_idx
    assert report.categor_features_idx == expected_categor_idx
    assert str(report) == expected_text
    
    
def test_exploratory_data_analist(lending_club_data, lending_club_numeric_features, lending_club_categor_features):
    """analyze() reports 10000 objects and the feature lists supplied by the fixtures."""
    features, target = lending_club_data
    analyst = ExploratoryDataAnalyst()
    report = analyst.analyze(features, target)

    assert report.objects_num == 10000
    assert report.numeric_features == lending_club_numeric_features
    assert report.categor_features == lending_club_categor_features

In [36]:
@preprocessing_expert_tf_numerics_strategy_params
@preprocessing_expert_tf_category_strategy_params
def test_preprocessing_expert_plan_transform_values_strategy(lending_club_data,
                                                             tf_numerics_strategy,
                                                             tf_category_strategy):
    """plan_transform_values_strategy() builds the expected ColumnTransformer.

    The planned transformer is compared through ``repr(get_params())``.
    The expectation table maps an unspecified (``None``) numerics strategy
    onto the same text as 'standardize', and pairs every category strategy
    with the one-hot encoder text. The second returned value must be None.
    """
    features, target = lending_club_data
    report = ExploratoryDataAnalyst().analyze(features, target)

    expert = PreprocessingExpert(transform_feat_numerics_strategy=tf_numerics_strategy,
                                 transform_feat_category_strategy=tf_category_strategy)
    pipeline, dr = expert.plan_transform_values_strategy(data_report=report)

    standardize_repr = (
        "{'n_jobs': None, 'remainder': 'drop', 'sparse_threshold': 0.3, 'transformer_weights': None, 'transformers': [('num_transform_standardize', StandardScaler(copy=True, with_mean=True, with_std=True), [0, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21]), ('cat_transform_oneh_encode', OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,\n"
        "              handle_unknown='ignore', sparse=False), [1, 2, 4, 5, 6, 7, 8, 19, 22])], 'verbose': False, 'num_transform_standardize': StandardScaler(copy=True, with_mean=True, with_std=True), 'cat_transform_oneh_encode': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,\n"
        "              handle_unknown='ignore', sparse=False), 'num_transform_standardize__copy': True, 'num_transform_standardize__with_mean': True, 'num_transform_standardize__with_std': True, 'cat_transform_oneh_encode__categories': 'auto', 'cat_transform_oneh_encode__drop': None, 'cat_transform_oneh_encode__dtype': <class 'numpy.float64'>, 'cat_transform_oneh_encode__handle_unknown': 'ignore', 'cat_transform_oneh_encode__sparse': False}"
    )
    minmax_repr = (
        "{'n_jobs': None, 'remainder': 'drop', 'sparse_threshold': 0.3, 'transformer_weights': None, 'transformers': [('num_transform_minmax', MinMaxScaler(copy=True, feature_range=(0, 1)), [0, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21]), ('cat_transform_oneh_encode', OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,\n"
        "              handle_unknown='ignore', sparse=False), [1, 2, 4, 5, 6, 7, 8, 19, 22])], 'verbose': False, 'num_transform_minmax': MinMaxScaler(copy=True, feature_range=(0, 1)), 'cat_transform_oneh_encode': OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,\n"
        "              handle_unknown='ignore', sparse=False), 'num_transform_minmax__copy': True, 'num_transform_minmax__feature_range': (0, 1), 'cat_transform_oneh_encode__categories': 'auto', 'cat_transform_oneh_encode__drop': None, 'cat_transform_oneh_encode__dtype': <class 'numpy.float64'>, 'cat_transform_oneh_encode__handle_unknown': 'ignore', 'cat_transform_oneh_encode__sparse': False}"
    )

    # Keys are "<numerics strategy>_<category strategy>" as rendered by str().
    expected_by_combo = {
        'None_None': standardize_repr,
        'minmax_None': minmax_repr,
        'None_ohe': standardize_repr,
        'standardize_None': standardize_repr,
        'standardize_ohe': standardize_repr,
        'minmax_ohe': minmax_repr,
    }
    combo_key = '_'.join(map(str, (tf_numerics_strategy, tf_category_strategy)))

    assert type(pipeline) is ColumnTransformer
    assert repr(pipeline.get_params()) == expected_by_combo[combo_key]
    assert dr is None

    
@preprocessing_expert_cm_numerics_strategy_params
@preprocessing_expert_cm_category_strategy_params
def test_preprocessing_expert_plan_missing_values_strategy(lending_club_data,
                                                           adjusted_data_report_after_correct_misses_step,
                                                           cm_numerics_strategy,
                                                           cm_category_strategy):
    """plan_missing_values_strategy() builds the expected imputing ColumnTransformer.

    The planned transformer is compared through ``repr(get_params())``.
    The expectation table maps an unspecified (``None``) numerics strategy
    onto the same text as 'mean'. The method must also return a data report
    that differs from the input one and equals the adjusted-report fixture.
    """
    features, target = lending_club_data
    data_report = ExploratoryDataAnalyst().analyze(features, target)

    expert = PreprocessingExpert(correct_misses_numerics_strategy=cm_numerics_strategy,
                                 correct_misses_category_strategy=cm_category_strategy)
    pipeline, dr = expert.plan_missing_values_strategy(data_report=data_report)

    mean_repr = (
        "{'n_jobs': None, 'remainder': 'drop', 'sparse_threshold': 0.3, 'transformer_weights': None, 'transformers': [('num_nan_miss_vals_imputer', SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='mean', verbose=0), [0, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21]), ('cat_nan_miss_vals_imputer', SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='most_frequent', verbose=0), [1, 2, 4, 5, 6, 7, 8, 19, 22])], 'verbose': False, 'num_nan_miss_vals_imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='mean', verbose=0), 'cat_nan_miss_vals_imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='most_frequent', verbose=0), 'num_nan_miss_vals_imputer__add_indicator': False, 'num_nan_miss_vals_imputer__copy': True, 'num_nan_miss_vals_imputer__fill_value': None, 'num_nan_miss_vals_imputer__missing_values': nan, 'num_nan_miss_vals_imputer__strategy': 'mean', 'num_nan_miss_vals_imputer__verbose': 0, 'cat_nan_miss_vals_imputer__add_indicator': False, 'cat_nan_miss_vals_imputer__copy': True, 'cat_nan_miss_vals_imputer__fill_value': None, 'cat_nan_miss_vals_imputer__missing_values': nan, 'cat_nan_miss_vals_imputer__strategy': 'most_frequent', 'cat_nan_miss_vals_imputer__verbose': 0}"
    )
    median_repr = (
        "{'n_jobs': None, 'remainder': 'drop', 'sparse_threshold': 0.3, 'transformer_weights': None, 'transformers': [('num_nan_miss_vals_imputer', SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='median', verbose=0), [0, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21]), ('cat_nan_miss_vals_imputer', SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='most_frequent', verbose=0), [1, 2, 4, 5, 6, 7, 8, 19, 22])], 'verbose': False, 'num_nan_miss_vals_imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='median', verbose=0), 'cat_nan_miss_vals_imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,\n"
        "              missing_values=nan, strategy='most_frequent', verbose=0), 'num_nan_miss_vals_imputer__add_indicator': False, 'num_nan_miss_vals_imputer__copy': True, 'num_nan_miss_vals_imputer__fill_value': None, 'num_nan_miss_vals_imputer__missing_values': nan, 'num_nan_miss_vals_imputer__strategy': 'median', 'num_nan_miss_vals_imputer__verbose': 0, 'cat_nan_miss_vals_imputer__add_indicator': False, 'cat_nan_miss_vals_imputer__copy': True, 'cat_nan_miss_vals_imputer__fill_value': None, 'cat_nan_miss_vals_imputer__missing_values': nan, 'cat_nan_miss_vals_imputer__strategy': 'most_frequent', 'cat_nan_miss_vals_imputer__verbose': 0}"
    )

    # Keys are "<numerics strategy>_<category strategy>" as rendered by str().
    expected_by_combo = {
        'None_None': mean_repr,
        'median_None': median_repr,
        'mean_None': mean_repr,
        'None_most_frequent': mean_repr,
        'mean_most_frequent': mean_repr,
        'median_most_frequent': median_repr,
    }
    combo_key = '_'.join(map(str, (cm_numerics_strategy, cm_category_strategy)))

    assert type(pipeline) is ColumnTransformer
    assert repr(pipeline.get_params()) == expected_by_combo[combo_key]
    assert dr is not None
    assert dr != data_report
    assert dr == adjusted_data_report_after_correct_misses_step


# TODO probably unsafe test, how to do it better?
# NOTE: stacked patch decorators are applied bottom-up, so the patch nearest
# the function supplies the FIRST mock argument. The order below makes each
# mock argument match the method its name refers to.
@patch.object(PreprocessingExpert, 'plan_missing_values_strategy')
@patch.object(PreprocessingExpert, 'plan_transform_values_strategy')
@preprocessing_expert_cm_numerics_strategy_params
@preprocessing_expert_cm_category_strategy_params
@preprocessing_expert_tf_numerics_strategy_params
@preprocessing_expert_tf_category_strategy_params
def test_preprocessing_expert_plan(plan_transform_values_strategy_mock,
                                   plan_missing_values_strategy_mock,
                                   lending_club_data,
                                   cm_numerics_strategy,
                                   cm_category_strategy,
                                   tf_numerics_strategy,
                                   tf_category_strategy):
    """plan() chains both planning steps into a two-step Pipeline.

    Both planning methods are replaced by mocks that return a bare
    ``FunctionTransformer`` together with the input data report. The test
    checks that each mock was called with ``data_report=data_report``. It
    also checks that the result is a ``Pipeline`` whose parameters list the
    steps 'correct_misses' and 'transform_feat', in that order.
    """
    X, y = lending_club_data
    data_report = ExploratoryDataAnalyst().analyze(X, y)

    plan_transform_values_strategy_mock.return_value = FunctionTransformer(), data_report
    plan_missing_values_strategy_mock.return_value = FunctionTransformer(), data_report

    pe = PreprocessingExpert(correct_misses_numerics_strategy=cm_numerics_strategy,
                             correct_misses_category_strategy=cm_category_strategy,
                             transform_feat_numerics_strategy=tf_numerics_strategy,
                             transform_feat_category_strategy=tf_category_strategy)

    pipeline = pe.plan(data_report=data_report)

    plan_missing_values_strategy_mock.assert_called_with(data_report=data_report)
    plan_transform_values_strategy_mock.assert_called_with(data_report=data_report)

    assert type(pipeline) is Pipeline
    assert repr(pipeline.get_params()) == ("{'memory': None, 'steps': [('correct_misses', FunctionTransformer(accept_sparse=False, check_inverse=True, func=None,\n"
                                           "                    inv_kw_args=None, inverse_func=None, kw_args=None,\n"
                                           "                    validate=False)), ('transform_feat', FunctionTransformer(accept_sparse=False, check_inverse=True, func=None,\n"
                                           "                    inv_kw_args=None, inverse_func=None, kw_args=None,\n"
                                           "                    validate=False))], 'verbose': False, 'correct_misses': FunctionTransformer(accept_sparse=False, check_inverse=True, func=None,\n"
                                           "                    inv_kw_args=None, inverse_func=None, kw_args=None,\n"
                                           "                    validate=False), 'transform_feat': FunctionTransformer(accept_sparse=False, check_inverse=True, func=None,\n"
                                           "                    inv_kw_args=None, inverse_func=None, kw_args=None,\n"
                                           "                    validate=False), 'correct_misses__accept_sparse': False, 'correct_misses__check_inverse': True, 'correct_misses__func': None, 'correct_misses__inv_kw_args': None, 'correct_misses__inverse_func': None, 'correct_misses__kw_args': None, 'correct_misses__validate': False, 'transform_feat__accept_sparse': False, 'transform_feat__check_inverse': True, 'transform_feat__func': None, 'transform_feat__inv_kw_args': None, 'transform_feat__inverse_func': None, 'transform_feat__kw_args': None, 'transform_feat__validate': False}")

    
@preprocessing_expert_cm_numerics_strategy_params
@preprocessing_expert_cm_category_strategy_params
@preprocessing_expert_tf_numerics_strategy_params
@preprocessing_expert_tf_category_strategy_params
def test_preprocessing_expert_get_possible_strategies_without_fit(default_preprocessing_expert_strategies,
                                                                  cm_numerics_strategy,
                                                                  cm_category_strategy,
                                                                  tf_numerics_strategy,
                                                                  tf_category_strategy):
    """An unfitted expert reports the default strategy catalogue for every parameter combination."""
    constructor_kwargs = {
        'correct_misses_numerics_strategy': cm_numerics_strategy,
        'correct_misses_category_strategy': cm_category_strategy,
        'transform_feat_numerics_strategy': tf_numerics_strategy,
        'transform_feat_category_strategy': tf_category_strategy,
    }
    expert = PreprocessingExpert(**constructor_kwargs)

    reported = expert.get_possible_strategies()
    assert reported == default_preprocessing_expert_strategies
    

@preprocessing_expert_cm_numerics_strategy_params
@preprocessing_expert_cm_category_strategy_params
@preprocessing_expert_tf_numerics_strategy_params
@preprocessing_expert_tf_category_strategy_params
def test_preprocessing_expert_get_possible_strategies_immutable(default_preprocessing_expert_strategies,
                                                                cm_numerics_strategy,
                                                                cm_category_strategy,
                                                                tf_numerics_strategy,
                                                                tf_category_strategy):
    """Two get_possible_strategies() results are equal but do not share lists.

    Appending to any strategy list of the first result must make the two
    results unequal, and removing the entry again must restore equality.
    """
    expert = PreprocessingExpert(correct_misses_numerics_strategy=cm_numerics_strategy,
                                 correct_misses_category_strategy=cm_category_strategy,
                                 transform_feat_numerics_strategy=tf_numerics_strategy,
                                 transform_feat_category_strategy=tf_category_strategy)

    first = expert.get_possible_strategies()
    second = expert.get_possible_strategies()

    assert first == default_preprocessing_expert_strategies
    assert second == default_preprocessing_expert_strategies
    assert first is not second
    assert first == second

    # Same probe for each step / feature-kind list, in the order
    # correct_misses (numerics, category) then transform_feat (numerics, category).
    for step in ('correct_misses', 'transform_feat'):
        for kind in ('numerics_strategy', 'category_strategy'):
            options = first[step][kind]

            options.append('test')
            assert first != second

            options.remove('test')
            assert first == second
    
    
@preprocessing_expert_cm_numerics_strategy_params
@preprocessing_expert_cm_category_strategy_params
@preprocessing_expert_tf_numerics_strategy_params
@preprocessing_expert_tf_category_strategy_params
def test_preprocessing_expert_get_possible_strategies_with_fit(default_preprocessing_expert_strategies,
                                                               lending_club_data,
                                                               cm_numerics_strategy,
                                                               cm_category_strategy,
                                                               tf_numerics_strategy,
                                                               tf_category_strategy):
    """After fit(), the expert still reports the default strategy catalogue."""
    features, target = lending_club_data
    report = ExploratoryDataAnalyst().analyze(features, target)

    constructor_kwargs = {
        'correct_misses_numerics_strategy': cm_numerics_strategy,
        'correct_misses_category_strategy': cm_category_strategy,
        'transform_feat_numerics_strategy': tf_numerics_strategy,
        'transform_feat_category_strategy': tf_category_strategy,
    }
    expert = PreprocessingExpert(**constructor_kwargs)
    expert.fit(features, target, data_report=report)

    reported = expert.get_possible_strategies()
    assert reported == default_preprocessing_expert_strategies
        

@preprocessing_expert_cm_numerics_strategy_params_no_none
@preprocessing_expert_cm_category_strategy_params_no_none
@preprocessing_expert_tf_numerics_strategy_params_no_none
@preprocessing_expert_tf_category_strategy_params_no_none
def test_preprocessing_expert_fit_transform(lending_club_data,
                                            cm_numerics_strategy,
                                            cm_category_strategy,
                                            tf_numerics_strategy,
                                            tf_category_strategy):
    """fit() followed by transform() reproduces the stored reference array.

    The reference is read from an .npz file in the working directory whose
    name encodes the four strategy values; its first array ('arr_0') must
    equal the transformed data exactly.
    """
    X, y = lending_club_data
    data_report = ExploratoryDataAnalyst().analyze(X, y)

    pe = PreprocessingExpert(correct_misses_numerics_strategy=cm_numerics_strategy,
                             correct_misses_category_strategy=cm_category_strategy,
                             transform_feat_numerics_strategy=tf_numerics_strategy,
                             transform_feat_category_strategy=tf_category_strategy)

    pe.fit(X, y, data_report=data_report)

    expected_file_name = f'cm__{cm_numerics_strategy}-{cm_category_strategy}__ft__{tf_numerics_strategy}-{tf_category_strategy}.npz'
    # np.load's second positional argument is mmap_mode, not a file mode, so
    # it is named here. The context manager closes the archive after reading.
    with np.load(expected_file_name, mmap_mode='r') as reference:
        expected = reference['arr_0']

    assert_array_equal(pe.transform(X), expected)

In [43]:
@pytest.fixture
def lending_club_categor_features2():
    """Column names that the new-category test overwrites with an unseen value."""
    column_names = [
        'emp_length',
        'home_ownership',
        'verification_status',
        'pymnt_plan',
        'purpose_cat',
        'zip_code',
        'addr_state',
        'initial_list_status',
        'policy_code',
    ]
    return column_names


@pytest.fixture
def new_unique_category():
    """Category value the new-category test writes into the held-out rows."""
    unseen_level = 'aee8eab3aeb0d2f20146ce8a4c1ebbb5'
    return unseen_level


def test_fit_predict(random_state, lending_club_data):
    """predict() on the held-out split reproduces the stored reference labels.

    The data is split with a fixed seed, the classifier is fitted on the
    training part, and its predictions must equal array 'arr_0' of
    'LogRegClassifier_fit_predict.npz' exactly.
    """
    X, y = lending_club_data

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=random_state)

    cls = LogRegClassifier(random_state=random_state)
    cls.fit(X_train, y_train)

    pred = cls.predict(X_test)
    # np.load's second positional argument is mmap_mode, not a file mode, so
    # it is named here. The context manager closes the archive after reading.
    with np.load('LogRegClassifier_fit_predict.npz', mmap_mode='r') as reference:
        expected = reference['arr_0']

    assert_array_equal(pred, expected)
        
    
def test_fit_predict_proba(random_state, lending_club_data):
    """predict_proba() on the held-out split reproduces the stored reference.

    The data is split with a fixed seed, the classifier is fitted on the
    training part, and its probabilities must equal array 'arr_0' of
    'LogRegClassifier_fit_predict_proba.npz' exactly.
    """
    X, y = lending_club_data

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=random_state)

    cls = LogRegClassifier(random_state=random_state)
    cls.fit(X_train, y_train)

    pred_proba = cls.predict_proba(X_test)
    # np.load's second positional argument is mmap_mode, not a file mode, so
    # it is named here. The context manager closes the archive after reading.
    with np.load('LogRegClassifier_fit_predict_proba.npz', mmap_mode='r') as reference:
        expected = reference['arr_0']

    assert_array_equal(pred_proba, expected)

    
def test_evaluate(random_state, lending_club_data):
    """evaluate() on the held-out split returns the pinned f1 and logloss values."""
    features, target = lending_club_data
    split = train_test_split(features, target, test_size=0.33, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    classifier = LogRegClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)

    expected_scores = {'f1_score': 0.21739130434782608, 'logloss': 0.3369127601475267}
    observed_scores = classifier.evaluate(X_test, y_test)

    assert observed_scores == expected_scores

    
def test_tune_parameters(random_state, lending_club_data):
    """tune_parameters() returns the pinned best parameters and scores.

    After tuning on the training split, evaluate() on the held-out split
    must also return the pinned f1 and logloss values.
    """
    features, target = lending_club_data
    split = train_test_split(features, target, test_size=0.33, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    classifier = LogRegClassifier(random_state=random_state)

    tuning_result = classifier.tune_parameters(X_train, y_train)
    held_out_scores = classifier.evaluate(X_test, y_test)

    expected_parameters = {
        'logit_classify__C': 0.1,
        'preprocessings__correct_misses_category_strategy': 'most_frequent',
        'preprocessings__correct_misses_numerics_strategy': 'median',
        'preprocessings__transform_feat_category_strategy': 'ohe',
        'preprocessings__transform_feat_numerics_strategy': 'standardize',
    }
    expected_tuning_scores = {
        'f1_score': 0.15587968850860037,
        'logloss': 0.3573686967964508,
    }

    assert tuning_result == {
        'best_parameters': expected_parameters,
        'best_scores': expected_tuning_scores,
    }

    assert held_out_scores == {'f1_score': 0.14155251141552513, 'logloss': 0.338917581762135}
    

def test_fit_predict_with_new_categories(random_state, new_unique_category, lending_club_data, lending_club_categor_features2):
    """predict_proba() works when the listed columns hold an unseen value.

    The classifier is fitted on the untouched training split. Every column
    named by ``lending_club_categor_features2`` is then overwritten in a
    copy of the held-out split with ``new_unique_category``. The resulting
    probabilities must equal array 'arr_0' of
    'LogRegClassifier_fit_predict_proba_with_new_categories.npz' exactly.
    """
    X, y = lending_club_data

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=random_state)

    cls = LogRegClassifier(random_state=random_state)
    cls.fit(X_train, y_train)

    # Work on a deep copy so the shared fixture data is left untouched.
    X_test = X_test.copy(deep=True)

    for c in lending_club_categor_features2:
        # Replace the whole column, the same way the reference file was
        # generated; `.loc[:, c] = value` may instead try to keep the
        # column's existing dtype, depending on the pandas version.
        X_test[c] = new_unique_category

    pred_proba = cls.predict_proba(X_test)
    # np.load's second positional argument is mmap_mode, not a file mode, so
    # it is named here. The context manager closes the archive after reading.
    with np.load('LogRegClassifier_fit_predict_proba_with_new_categories.npz', mmap_mode='r') as reference:
        expected = reference['arr_0']

    assert_array_equal(pred_proba, expected)

In [44]:
# import hashlib

# l =  [
#         FeatureStats(name='emp_length', index=1, unique_num=14, missed_num=0), 
#         FeatureStats(name='home_ownership', index=2, unique_num=5, missed_num=0), 
#         FeatureStats(name='verification_status', index=4, unique_num=3, missed_num=0), 
#         FeatureStats(name='pymnt_plan', index=5, unique_num=2, missed_num=0), 
#         FeatureStats(name='purpose_cat', index=6, unique_num=27, missed_num=0), 
#         FeatureStats(name='zip_code', index=7, unique_num=720, missed_num=0), 
#         FeatureStats(name='addr_state', index=8, unique_num=50, missed_num=0), 
#         FeatureStats(name='initial_list_status', index=19, unique_num=2, missed_num=0), 
#         FeatureStats(name='policy_code', index=22, unique_num=5, missed_num=0)
#     ]

# cf = [f.name for f in l]
# cf
# X, y = lending_club_x, lending_club_y
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=242)

# for c in cf:
#     X_test[c] = 'aee8eab3aeb0d2f20146ce8a4c1ebbb5'

# cls = LogRegClassifier(random_state=242)
# cls.fit(X_train, y_train)

# pred = cls.predict_proba(X_test)
# np.savez_compressed('LogRegClassifier_fit_predict_proba_with_new_categories.npz', pred)

In [45]:
# Run the tests through ipytest, passing '-qq' as its argument.
ipytest.run('-qq')

................................................................................................................ [ 67%]
.....................................................                                                            [100%]
