In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.feature_selection import SelectPercentile, SelectorMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, learning_curve, ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier

# Remove pandas' column-count display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# Read the dataset CSV into `df` and preview its first rows.
df = pd.read_csv('../input/company-bankruptcy-prediction/data.csv')
df.head()

# EDA

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Number of rows per value of the 'Bankrupt?' target column.
df['Bankrupt?'].value_counts()

The dataset is highly imbalanced.

In [None]:
# Show every row that contains at least one missing value.
df[df.isnull().any(axis=1)]

# Split

In [None]:
# Hold out 30% of the rows for testing. The target is highly imbalanced, so the
# split is stratified on it: train and test then keep the same class
# proportions instead of leaving the number of rare positives to chance.
X_train, X_test, y_train, y_test = train_test_split(df.drop('Bankrupt?', axis = 1), 
                                                    df['Bankrupt?'], 
                                                    test_size=0.3, 
                                                    random_state=101,
                                                    stratify=df['Bankrupt?'])

# Data preparation

### Interactions and polynomials

In [None]:
class InterPolinomsFeatures(BaseEstimator, TransformerMixin):
    """Add polynomial / interaction terms for every non-``int64`` column.

    Columns whose dtype is ``int64`` are passed through unchanged; all other
    columns are expanded with ``PolynomialFeatures`` (no bias column). The
    result is returned as a ``DataFrame`` whose generated columns are named
    after the source features, and which keeps the index of the input.

    Parameters
    ----------
    degree : int, default=1
        Forwarded to ``PolynomialFeatures(degree=...)``.
    interaction_only : bool, default=False
        Forwarded to ``PolynomialFeatures(interaction_only=...)``.

    Attributes set by ``fit``
    -------------------------
    encoder_inner : the ``PolynomialFeatures`` instance.
    encoder : the ``ColumnTransformer`` wrapping ``encoder_inner``.
    object_columns_dict : maps placeholder tokens ``'x0'``, ``'x1'``, ... to
        the names of the expanded (non-``int64``) columns, in order.
    """

    def __init__(self, degree = 1, interaction_only = False):
        # Only store the hyper-parameters here. The inner estimators are
        # created in ``fit`` so that values changed later through
        # ``set_params`` (which is what GridSearchCV does) are actually used.
        self.degree = degree
        self.interaction_only = interaction_only

    @staticmethod
    def __convert_to_float(X):
        """Return a copy of ``X`` with the generated polynomial columns cast to float."""
        poly_columns = [name for name in X.columns if 'polynomialfeatures__' in name]
        return X.astype({name: float for name in poly_columns})

    def __columns_name_change(self, old_name, placeholders = True):
        """Turn a raw ``ColumnTransformer`` output name into a readable one.

        ``placeholders`` says whether polynomial names are written with
        ``x<i>`` tokens (legacy ``get_feature_names``) that still have to be
        replaced by the real feature names.
        """
        poly_prefix = 'polynomialfeatures__'
        if not old_name.startswith(poly_prefix):
            # Passed-through column: the newer naming API prefixes it with
            # 'remainder__', the legacy API leaves the name untouched.
            if not placeholders and old_name.startswith('remainder__'):
                return old_name[len('remainder__'):]
            return old_name
        name = old_name[len(poly_prefix):]
        if not placeholders:
            return name
        # Single pass with a callback: a feature name that was just inserted
        # is never re-scanned, and it is not interpreted as a regex template.
        return re.sub(
            r'\bx\d+\b',
            lambda match: self.object_columns_dict.get(match.group(0), match.group(0)),
            name
        )

    def fit(self, X, y = None):
        """Build and fit the inner encoder on the DataFrame ``X``; returns ``self``."""
        self.encoder_inner = PolynomialFeatures(
            degree = self.degree,
            interaction_only = self.interaction_only,
            include_bias = False
        )
        self.encoder = make_column_transformer(
            (self.encoder_inner, make_column_selector(dtype_exclude='int64')),
            remainder='passthrough'
        )
        self.encoder.fit(X)
        expanded_columns = X.select_dtypes(exclude='int64').columns
        self.object_columns_dict = {
            f'x{position}': column for position, column in enumerate(expanded_columns)
        }
        # A trailing-underscore attribute lets sklearn's check_is_fitted
        # recognise this estimator as fitted.
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y = None):
        """Return the expanded features as a ``DataFrame`` (``y`` is ignored)."""
        values = self.encoder.transform(X)
        # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back to the legacy call on old versions.
        if hasattr(self.encoder, 'get_feature_names_out'):
            raw_names, placeholders = list(self.encoder.get_feature_names_out()), False
        else:
            raw_names, placeholders = list(self.encoder.get_feature_names()), True
        X_out = pd.DataFrame(values, columns = raw_names, index = getattr(X, 'index', None))
        X_out = self.__convert_to_float(X_out)
        X_out.columns = [self.__columns_name_change(name, placeholders) for name in raw_names]
        return X_out

In [None]:
# Try the transformer on the training features with degree-2 terms and preview the result.
InterPolinomsFeatures(
    degree = 2, 
    interaction_only = False
).fit_transform(X_train).head()

### Feature selection

In [None]:
class Selector(BaseEstimator, TransformerMixin):
    """Keep the best-scoring ``percent`` percent of columns via ``SelectPercentile``.

    Returns a ``DataFrame`` that carries the names of the selected columns and
    the index of the input.

    Parameters
    ----------
    percent : int, default=50
        Forwarded to ``SelectPercentile(percentile=...)``.

    Attributes set by ``fit``
    -------------------------
    selector_inner : the fitted ``SelectPercentile``.
    columns_names : names of the columns that were kept.
    """

    def __init__(self, percent = 50):
        # Store the hyper-parameter only; the inner selector is created in
        # ``fit`` so that a value changed through ``set_params`` (GridSearchCV)
        # is honoured instead of the value given at construction time.
        self.percent = percent

    def fit(self, X, y):
        """Fit the selector on DataFrame ``X`` and target ``y``; returns ``self``."""
        self.selector_inner = SelectPercentile(percentile=self.percent)
        self.selector_inner.fit(X, y)
        self.columns_names = X.columns[self.selector_inner.get_support()]
        # A trailing-underscore attribute lets sklearn's check_is_fitted
        # recognise this estimator as fitted.
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y = None):
        """Return only the selected columns of ``X`` as a ``DataFrame`` (``y`` is ignored)."""
        selected = self.selector_inner.transform(X)
        return pd.DataFrame(
            selected,
            columns = self.columns_names,
            index = getattr(X, 'index', None)
        )

In [None]:
# Try the selector with percent=5 on the training data and preview the result.
Selector(percent = 5).fit_transform(X_train, y_train).head()

### Scaling

In [None]:
class Scaler(BaseEstimator, TransformerMixin):
    """Scale all columns with the scaler chosen by ``mode`` and return a ``DataFrame``.

    Parameters
    ----------
    mode : {'standart', 'minmax', 'none'}, default='minmax'
        ``'standart'`` uses ``StandardScaler``, ``'minmax'`` uses
        ``MinMaxScaler`` and ``'none'`` returns a copy of the input unchanged.

    Raises
    ------
    ValueError
        If ``mode`` is not a key of ``scalers`` (checked at construction and
        again in ``fit``, because ``set_params`` bypasses ``__init__``).

    Attributes set by ``fit``
    -------------------------
    scaler_inner : a fresh fitted scaler, or ``None`` for mode ``'none'``.
    columns_names : column names seen during ``fit`` (not set for ``'none'``).
    """

    # Unfitted prototypes. They are never fitted themselves: ``fit`` clones
    # the selected one so separate Scaler instances do not share fitted state.
    scalers = {
        'standart': StandardScaler(), 
        'minmax'  : MinMaxScaler(),
        'none'    : None
    }

    def __init__(self, mode = 'minmax'):
        self._check_mode(mode)
        self.mode = mode

    @classmethod
    def _check_mode(cls, mode):
        """Raise ``ValueError`` unless ``mode`` is one of the supported names."""
        if mode not in cls.scalers:
            raise ValueError(
                f'Wrong mode name {mode!r}; expected one of {list(cls.scalers)}'
            )

    def fit(self, X, y = None):
        """Create and fit the scaler for the current ``mode``; returns ``self``."""
        # Resolve the scaler here rather than in __init__ so that a mode set
        # through ``set_params`` (GridSearchCV) is the one that gets used.
        self._check_mode(self.mode)
        prototype = self.scalers[self.mode]
        self.scaler_inner = None if prototype is None else clone(prototype)
        if self.scaler_inner is not None:
            self.scaler_inner.fit(X)
            self.columns_names = X.columns
        # A trailing-underscore attribute lets sklearn's check_is_fitted
        # recognise this estimator as fitted, including in mode 'none'.
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y = None):
        """Return the scaled data as a ``DataFrame``, or a plain copy for mode ``'none'``."""
        if self.mode == 'none':
            return X.copy()
        scaled = self.scaler_inner.transform(X)
        return pd.DataFrame(
            scaled,
            columns = self.columns_names,
            index = getattr(X, 'index', None)
        )

In [None]:
# Try the 'minmax' mode on the training features and preview the result.
Scaler(mode = 'minmax').fit_transform(X_train).head()

In [None]:
# Try the 'none' mode, which returns a copy of the input without scaling.
Scaler(mode = 'none').fit_transform(X_train).head()

# Modeling

In [None]:
def plot_learning_curve(estimator, X, y, axes=None, ylim=None, cv=5,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    '''
    Draw three panels for ``estimator`` from one ``learning_curve`` run:
    score vs. number of training examples, fit time vs. number of training
    examples, and score vs. fit time. Ends with ``plt.show()``.

    axes: three matplotlib axes to draw on; a 1x3 figure is created if None.
    ylim: optional (min, max) applied to the y-axis of the first panel.
    cv, n_jobs, train_sizes: forwarded to ``learning_curve``.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
    '''

    def mean_and_std(per_fold):
        # Aggregate over the folds (axis 1), one value per training size.
        return np.mean(per_fold, axis=1), np.std(per_fold, axis=1)

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    score_ax = axes[0]
    score_ax.set_title('Learning curve')
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    sizes, scores_train, scores_cv, times, _ = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )
    train_mean, train_std = mean_and_std(scores_train)
    cv_mean, cv_std = mean_and_std(scores_cv)
    time_mean, time_std = mean_and_std(times)

    # Panel 1: learning curve (shaded +/- one std bands first, then the lines)
    score_ax.grid()
    curves = (
        (train_mean, train_std, "r", "Training score"),
        (cv_mean, cv_std, "g", "Cross-validation score"),
    )
    for centre, spread, colour, _label in curves:
        score_ax.fill_between(sizes, centre - spread, centre + spread,
                              alpha=0.1, color=colour)
    for centre, _spread, colour, label in curves:
        score_ax.plot(sizes, centre, 'o-', color=colour, label=label)
    score_ax.legend(loc="best")

    # Panel 2: n_samples vs fit_times
    time_ax = axes[1]
    time_ax.grid()
    time_ax.plot(sizes, time_mean, 'o-')
    time_ax.fill_between(sizes, time_mean - time_std, time_mean + time_std,
                         alpha=0.1)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 3: fit_time vs score
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(time_mean, cv_mean, 'o-')
    perf_ax.fill_between(time_mean, cv_mean - cv_std, cv_mean + cv_std,
                         alpha=0.1)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")
    plt.show()



def precision_recall(model, X_test, y_test):
    """Plot recall against precision for ``model`` on the given test data.

    The ranking scores come from ``model.decision_function`` when the model
    has one; otherwise the positive-class column of ``model.predict_proba``
    is used, so probability-only classifiers can be plotted too. A marker
    shows the point closest to the default decision threshold (0 for
    decision scores, 0.5 for probabilities). Ends with ``plt.show()``.
    """
    if hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
        default_threshold = 0
    else:
        # Second column = probability of the positive class.
        scores = model.predict_proba(X_test)[:, 1]
        default_threshold = 0.5

    precision, recall, thresholds = precision_recall_curve(y_test, scores)
    closest = np.argmin(np.abs(thresholds - default_threshold))
    plt.plot(precision[closest], recall[closest], 'o', markersize=10,
             label=f"threshold {default_threshold}", fillstyle="none", c='k', mew=2)
    plt.plot(precision, recall, label="precision recall curve")
    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.legend(loc="best")
    plt.show()


def _model_importances(model):
    """Return one importance value per feature, or ``None`` if the model exposes none."""
    if hasattr(model, 'feature_importances_'):
        return np.asarray(model.feature_importances_)
    if hasattr(model, 'coef_'):
        # Absolute coefficients of the first row; atleast_2d also accepts 1-D coef_.
        return np.abs(np.atleast_2d(model.coef_)[0])
    return None


def eval_result(model, X_test, y_test, X_train, y_train, validation = False):
    """Print and plot an evaluation of ``model`` on the test data.

    Always shows the classification report and the confusion matrix. Unless
    ``validation`` is true it additionally shows the 20 most important
    features (when the model exposes ``feature_importances_`` or ``coef_``),
    the precision-recall plot and the learning curves.

    Parameters
    ----------
    model : fitted estimator, ``Pipeline`` or ``GridSearchCV``
        A ``GridSearchCV`` is replaced by its ``best_estimator_``. For a
        ``Pipeline`` the data is run through every step but the last, and the
        last step is the model used for importances and plots.
    X_test, y_test : data the predictions and the precision-recall plot use.
    X_train, y_train : data passed to ``plot_learning_curve``.
    validation : bool, default=False
        When true, only the report and the confusion matrix are produced.
    """
    if isinstance(model, GridSearchCV):
        model = model.best_estimator_

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pred = model.predict(X_test)
        print(classification_report(y_test, pred, target_names = ['No Bankruptcy', 'Bankruptcy']))
        display(pd.DataFrame(confusion_matrix(y_test, pred), 
                         columns = ['Bankruptcy Not Predicted', 'Bankruptcy Predicted'],
                         index = ['No Bankruptcy', 'Bankruptcy']))

        if validation:
            return

        if isinstance(model, Pipeline):
            preprocessing, model = model[:-1], model[-1]
            # A single-step pipeline has no preprocessing to apply.
            if preprocessing.steps:
                X_test = preprocessing.transform(X_test)
                X_train = preprocessing.transform(X_train)

        importances = _model_importances(model)
        if importances is not None:
            # Fall back to positional labels when the data carries no column names.
            variables = getattr(X_test, 'columns', range(len(importances)))
            features = pd.DataFrame({
                'Variable'  : variables,
                'Importance': importances
            }).sort_values('Importance', ascending=False)
            display(features.head(20))

        # The plots are best-effort, but each one is attempted on its own and a
        # failure is reported instead of being swallowed silently.
        try:
            precision_recall(model, X_test, y_test)
        except Exception as exc:
            print(f'Precision-recall plot skipped: {exc!r}')
        try:
            plot_learning_curve(model, X_train, y_train, n_jobs=-1)
        except Exception as exc:
            print(f'Learning-curve plot skipped: {exc!r}')

In [None]:
# Preprocessing chain followed by a classifier step; the 'classifier' entry
# here is a default that the parameter grid below replaces.
pipe = Pipeline(steps=[
    ('interpolinomsfeatures', InterPolinomsFeatures()),
    ('selector', Selector()),
    ('scaler', Scaler()),
    ('classifier', LogisticRegression()),
])
pipe

In [None]:
# Two search spaces over the same pipeline, one per classifier.
# NOTE(review): degree 3 can generate a very large number of columns;
# confirm the search is feasible on the available hardware.
param_grid = [
    # Logistic regression: feature expansion, selection, scaling,
    # class weights for the positive class and regularisation strength.
    {
        'classifier': [LogisticRegression(random_state = 1)],
        'interpolinomsfeatures__interaction_only': [True, False],
        'interpolinomsfeatures__degree': [1, 2, 3],
        'selector__percent': [10, 30, 50, 100],
        'scaler__mode': ['standart', 'minmax'],
        'classifier__class_weight': [
            {0: 1, 1: positive_weight}
            for positive_weight in (4, 5, 6, 7, 8, 10, 12, 15)
        ],
        'classifier__C': [0.1, 0.5, 0.7, 1, 2, 5],
    },
    # XGBoost: all features kept, no scaling, positive-class weight varied.
    {
        'classifier': [XGBClassifier(random_state = 1, eval_metric = 'logloss')],
        'interpolinomsfeatures__interaction_only': [True, False],
        'interpolinomsfeatures__degree': [1, 2, 3],
        'selector__percent': [100],
        'scaler__mode': ['none'],
        'classifier__scale_pos_weight': [1, 2, 3, 4, 5, 7, 8, 9, 10],
    },
]
param_grid

In [None]:
# Fit and apply every pipeline step except the final classifier, then preview the output.
pipe[:-1].fit_transform(X_train, y_train).head()

In [None]:
# Exhaustive search over `param_grid`, scored by macro-averaged F1 with 5-fold CV
# on all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

Best and worst parameter combinations:

In [None]:
# Cross-validation results ordered from best to worst mean test score.
model_results = (
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)

# Four best and four worst parameter combinations.
display(model_results.head(4), model_results.tail(4))

In [None]:
# Report the winning configuration, then evaluate it on the held-out test set.
print(f"Best cross val score: {grid.best_score_}")
print("\nBest params:")
for name, value in grid.best_params_.items():
    print(f'{name}: {value}')
print('\n')
eval_result(grid, X_test, y_test, X_train, y_train)