In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV, \
    cross_val_score, StratifiedKFold, validation_curve, learning_curve

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, fbeta_score

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set; sns.set_style('whitegrid')
%matplotlib inline  

# display of all columns in df - check if pd option below isn't better
from IPython.display import display
pd.options.display.max_columns = None

### Check data

In [2]:
XX = pd.read_csv('Financial Distress.csv')

In [3]:
XX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3670 entries, 0 to 3669
Columns: 127 entries, Company to x124
dtypes: float64(114), int64(13)
memory usage: 3.6 MB


In [4]:
XX.head(2)

Unnamed: 0,Company,Time,Financial Distress,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49,x50,x51,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61,x62,x63,x64,x65,x66,x67,x68,x69,x70,x71,x72,x73,x74,x75,x76,x77,x78,x79,x80,x81,x82,x83,x84,x85,x86,x87,x88,x89,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124
0,1,1,0.010636,1.281,0.022934,0.87454,1.2164,0.06094,0.18827,0.5251,0.018854,0.18279,0.006449,0.85822,2.0058,0.12546,6.9706,4.6512,0.0501,2.1984,0.018265,0.024978,0.027264,1.4173,9.5554,0.14872,0.66995,214.76,12.641,6.4607,0.043835,0.20459,0.35179,8.3161,0.28922,0.76606,2.5825,77.4,0.026722,1.6307,0.015016,0.005478,0.1273,9.6951,-0.73622,0.98559,0.18016,1.5006,0.026224,7.0513,1174.9,5.3399,0.85128,12.837,0.061737,0.1809,209.87,-0.58255,0.47101,0.1099,0.0,0.0,0.22009,0.13076,0.14952,0.19518,0.1075,1224.5,1.0422,4.892,6.7291,0.5386,104.41,0.49844,2.3224,300.0,0.14653,1.0214,24.402,-47.071,129,1200.0,-0.4623,391.0,2870000.0,8990000000.0,31,31400000000.0,9.98e-09,25.75,0.19693,74.25,38.44,15.93,0.0,0.0,74.25,1,2,0,5,0,0,0.8,7.1241,15.381,3.2702,17.872,34.692,30.087,12.8,7991.4,364.95,15.8,61.476,4.0,36.0,85.437,27.07,26.102,16.0,16.0,0.2,22,0.06039,30,49
1,1,2,-0.45597,1.27,0.006454,0.82067,1.0049,-0.01408,0.18104,0.62288,0.006423,0.035991,0.001795,0.85152,-0.48644,0.17933,4.5764,3.7521,-0.014011,2.4575,0.027558,0.028804,0.041102,1.1801,7.2952,0.056026,0.67048,38.242,12.877,5.5506,0.26548,0.15019,0.41763,9.5276,0.41561,0.81699,2.6033,95.947,0.00758,0.83754,0.027425,0.045434,0.13774,5.6035,-0.64385,1.3019,0.046857,1.0095,0.007864,4.6022,1062.5,3.7389,0.94397,12.881,-0.000565,0.056298,250.14,-0.47477,0.38599,0.36933,0.0,0.0,0.0,-0.042671,-0.051995,-0.063643,-0.042465,-252.83,-0.23795,-2.0869,-0.98939,-0.23212,-10.857,-0.18801,0.90531,100.0,0.4039,1.8484,25.588,88.667,229,1964.0,3.5409,126.0,371000.0,541000000.0,27,724000000.0,5.32e-08,26.78,0.2299,73.22,42.86,15.94,0.0,0.0,73.22,1,2,0,5,0,0,0.6,7.4166,7.105,14.321,18.77,124.76,26.124,11.8,8322.8,0.1896,15.6,24.579,0.0,36.0,107.09,31.31,30.194,17.0,16.0,0.4,22,0.010636,31,50


In [5]:
# make sure there's no missing data in the set
assert XX.isnull().sum().sum() == 0, "NaN present"

### Prepare Data

In [6]:
"""prepare target feature"""

y = np.array(XX['Financial Distress'].values.tolist())
y = np.array([0 if i > -0.50 else 1 for i in y])

# check result
unique, counts = np.unique(y, return_counts=True)
display(np.asarray((unique, counts)).T)

array([[   0, 3535],
       [   1,  135]], dtype=int64)

In [7]:
"""define numerical and categorical features"""

num_features = list(XX.iloc[:, np.r_[0:94, 95, 97, 100:120, 121:123]].columns.values)
cat_features = list(XX.iloc[:, np.r_[94, 96, 98, 99, 120]].columns.values)
cat_indices = [94, 96, 98, 99, 120]

# check results
for feature in cat_features:
    print(feature, ", number of unique values (categories): ", XX['x121'].nunique())

x92 , number of unique values (categories):  37
x94 , number of unique values (categories):  37
x96 , number of unique values (categories):  37
x97 , number of unique values (categories):  37
x118 , number of unique values (categories):  37


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 95,
 97,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 121,
 122]

In [8]:
"""split data into train and test, define datatypes"""

indices = np.arange(y.shape[0])
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(XX, y, indices, stratify=y, test_size=0.3,
                                                                 random_state=42)


### Build Pipeline

In [9]:
"""define column selector class"""

## Select Columns
class MultiColumn(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X[self.columns]

In [10]:
"""assemble pipeline (define function)"""

def build_pipe(X_train, y_train, clf, sampler):
    """Build a pipeline for preprocessing (including oversampling)
    and classification.
    
    ARGUMENTS:
        X_train: training features (df or array)
        y_train: training labels (df or array)
        clf: classifier (sk-learn model object)
        sampler: sampler (imblearn sampling class)
        
    RETURNS:
        full_pipe: pipeline object
    """
    
    full_pipe = Pipeline([
    ('features', FeatureUnion([

        ('cat', Pipeline([
            ('cat_select', MultiColumn(cat_features)),
            ('ohe', OneHotEncoder()),
        ])),

        ('num', Pipeline([
            ('num_select', MultiColumn(num_features)),
            ('scaling', StandardScaler()),
        ])),
    ])),
        ('sample', sampler),
        ('clf', clf)])
    
    return full_pipe

In [35]:
"""initialize  classifier and SMOTENC sampler, build pipeline""" 

rg = LogisticRegression(class_weight = { 0:1, 1:8 }, random_state = 42, solver = 'saga',
                        max_iter=100, n_jobs=-1, intercept_scaling=1, C=0.02, penalty='l1')

sampler = SMOTENC(categorical_features=cat_indices, n_jobs=-1)

full_pipe = build_pipe(X_train, y_train, rg, sampler)

### Fit and Tune

In [36]:
def fit_pipe(X_train, y_train, pipe, scorer, cv=StratifiedKFold(3)):
    """Fit training data to a pipeline with GridSearchCV
    for best parameter tuning.
    
    ARGUMENTS:
        X_train: training features (df or array)
        y_train: training labels (df or array)
        pipe: pipeline (sk-learn pipeline object)
        scorer: evaluation metric for validation
        cv: type of CV, default is StratifiedKFold(3)
        
    RETURNS:
        grid: grid search object
        grid_results: dict with grid search results
    """
    parameters = {'clf__C': [0.001, 0.01, 0.025], 
                  'clf__class_weight':   [{ 0:1, 1:11 }, { 0:1, 1:8 }]}

    cv = GridSearchCV(pipe, param_grid=parameters, scoring=scorer, n_jobs= -1, 
                      cv=cv, error_score='raise', return_train_score=False, verbose=1)

    grid = cv.fit(X_train, y_train) 
    grid_results = grid.cv_results_

    return grid, grid_results

In [37]:
# call the function and evaluate on fbeta score

scorer = make_scorer(fbeta_score, beta=4)
cv = 3

grid, grid_results = fit_pipe(X_train, y_train, full_pipe, scorer, cv=cv)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   25.8s finished
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


- i have renamed cat_indices to cat_features - indices is for naming rows only
- i have integrated the SMOTENC sampler into the pipeline, to make sure the oversampling is at the end because oversampling transforms your dataframe into an np.array and you lose all your feature labels / column names that probably caused the problem