In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files are present in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

# Начальная информация


In [None]:
# Read the raw dataset; the path is named so it is easy to find and change.
DATA_PATH = '../input/Financial Distress.csv'
data = pd.read_csv(DATA_PATH)

## Начальный обзор данных

In [None]:
# Print the frame's column names, dtypes and non-null counts.
data.info()

Пропусков нет.
Посмотрим информацию по столбцам.

In [None]:
# Per-column summary statistics of the loaded frame.
data.describe()

In [None]:
# Preview the first rows (head() shows five rows by default).
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

## Один из признаков является категориальным
Посмотрим на него подробнее

In [None]:
# Distinct values of the 'x80' column, followed by how many there are.
categories = data['x80'].unique()
print(categories, len(categories), sep='\n')

## Посмотрим как меняются признаки объектов (компаний) с течением времени

In [None]:
# One scatter panel per column after the first three, plotted against 'Time'
# and coloured by 'Company'.
# `height` is the current name of the per-panel size argument: seaborn renamed
# `size` to `height` in 0.9 and later dropped the old keyword.
feature_columns = list(data.columns[3:])
g = sns.PairGrid(data, x_vars='Time', y_vars=feature_columns, hue='Company', height=5)
g = g.map(plt.scatter, alpha=.3)

## Посмотрим распределение периодов сбора данных

In [None]:
# Distribution of the 'Time' column as a density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement for the same kind of plot.
# The trailing semicolon hides the Axes repr in the cell output.
sns.histplot(data['Time'], kde=True, stat='density');

## Построим матрицу корреляции признаков

In [None]:
# Pairwise correlations of every column except 'Company', with the columns
# ordered by descending correlation with 'Financial Distress'.
data_corr = (
    data
    .drop(columns='Company')
    .corr()
    .sort_values(by='Financial Distress', axis=1, ascending=False)
)
data_corr.head(10)

## И отобразим её на тепловой карте

In [None]:
# Draw the correlation matrix as a heatmap on an explicit 20x20 inch figure,
# using a diverging palette.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data_corr, cmap=cmap, ax=ax)

## Создаем вектор ответов для классификации
Согласно описанию данных, компании с уровнем Financial Distress не более -0.5 являются проблемными. Поставим задачу как задачу классификации.

In [None]:
# Rows whose 'Financial Distress' value is at or below this threshold are
# labelled as distressed (class 1); everything else is class 0.
DISTRESS_THRESHOLD = -0.5

# Work on a copy: a plain `data_full = data` only aliases the frame, so the new
# column would silently be added to the raw `data` as well.
data_full = data.copy()
# Vectorised comparison instead of a Python-level iterrows() loop; the labels
# produced are the same.
data_full['Distressed'] = (data_full['Financial Distress'] <= DISTRESS_THRESHOLD).astype(int)
# Show a few distressed rows as a sanity check.
data_full.loc[data_full['Distressed'] == 1, ['Financial Distress', 'Distressed']].head(10)

## Посмотрим как состояние "Distressed" зависит от признаков

In [None]:
# One scatter panel per feature against 'Time', coloured by the binary target.
# The panel list is taken from data_full and skips 'Distressed' itself, so the
# hue variable is not also drawn as a panel.
# `height` is the current name of the per-panel size argument: seaborn renamed
# `size` to `height` in 0.9 and later dropped the old keyword.
feature_columns = [col for col in data_full.columns[3:] if col != 'Distressed']
g = sns.PairGrid(data_full, x_vars='Time', y_vars=feature_columns, hue='Distressed', height=5)
g = g.map(plt.scatter, alpha=.3)

## Разделим данные на обучающую и тестовую выборку.
Обучающая выборка будет использоваться для кросс-валидации и настройки гиперпараметров.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Stratified 70/30 split into a cross-validation part (CV) and a hold-out part (HO).
SSS = StratifiedShuffleSplit(random_state=10, test_size=.3, n_splits=1)
# Features: columns from position 3 up to (not including) the last one,
# without the categorical 'x80'.
X = data_full.iloc[:, 3:-1].drop('x80', axis=1)
y = data_full['Distressed']
# n_splits=1, so the splitter yields exactly one pair of positional index arrays.
train_index, test_index = next(SSS.split(X, y))
print("CV:", train_index, "HO:", test_index)
# The indices are positional, so both X and y are sliced with .iloc; y[train_index]
# is a label lookup and is only correct while the index is the default 0..n-1.
X_cv, X_ho = X.iloc[train_index], X.iloc[test_index]
y_cv, y_ho = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Number of positive (distressed) rows in each part of the split.
print('CV distress:', y_cv.sum(), '\nHO distress:', y_ho.sum())

## Классы очень несбалансированы

In [None]:
# Count how many rows fall into each class of the binary target.
data_full['Distressed'].value_counts()

In [None]:
# Ratio of distressed (1) to non-distressed (0) rows, computed from the data
# instead of the hand-typed literal 136/3536, so it stays correct if the data
# or the threshold changes.
class_counts = data_full['Distressed'].value_counts()
class_counts[1] / class_counts[0]

## Первую серию моделей будем строить для несбалансированных классов без дополнительного feature engineering

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from pprint import pprint

## Случайный лес

In [None]:
# Search space for the random-forest randomized search.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 50)]
# 'auto' was only an alias of 'sqrt' for classifiers (so the old list offered
# one effective option twice) and current scikit-learn rejects it; search
# 'sqrt' against 'log2' instead.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(5, 55, num = 10)]
max_depth.append(None)  # None = no depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 3, 4]
bootstrap = [True, False]
class_weight = ['balanced', None]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'class_weight': class_weight}

In [None]:
# Randomized search over random_grid: 10 sampled configurations, 3-fold CV,
# scored on f1/precision/recall and refitted on the best f1.
# The class_weight given to the base estimator is replaced by whatever
# 'class_weight' value each sampled configuration carries.
rf_clsf = RandomForestClassifier(random_state=10, class_weight='balanced')
rf_random = RandomizedSearchCV(
    estimator=rf_clsf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    random_state=10,
    n_jobs=-1,
    verbose=2,
)
rf_random.fit(X_cv, y_cv)

In [None]:
# Best mean cross-validated score of the refit metric found by the search.
print(rf_random.best_score_)

In [None]:
# The search was created with refit='f1', so best_estimator_ has already been
# retrained on all of (X_cv, y_cv); fitting it again only repeated that work.
best_rf_clsf = rf_random.best_estimator_
best_rf_clsf

In [None]:
# Predict once on the hold-out set and reuse the result for all three metrics
# (printed as: recall, precision, f1).
rf_ho_pred = best_rf_clsf.predict(X_ho)
print(recall_score(y_ho, rf_ho_pred),
      precision_score(y_ho, rf_ho_pred),
      f1_score(y_ho, rf_ho_pred))

In [None]:
# Actual positives, predicted positives and true positives on the hold-out set;
# the prediction is computed once instead of once per printed value.
rf_ho_pred = best_rf_clsf.predict(X_ho)
print(sum(y_ho), sum(rf_ho_pred), sum(y_ho.multiply(rf_ho_pred)))

## Логистическая регрессия

In [None]:
# Search space for the L1-penalised logistic regression grid search.
lr_penalty = ['l1']
lr_class_weight = ['balanced', None]
lr_C = [0.001, 0.01, 0.1, 1, 10]
lr_solver = ['liblinear', 'saga']

lr_grid = dict(
    penalty=lr_penalty,
    class_weight=lr_class_weight,
    C=lr_C,
    solver=lr_solver,
)

In [None]:
# Exhaustive grid search for logistic regression: 3-fold CV, scored on
# f1/precision/recall and refitted on the best f1.
lr_clsf = LogisticRegression(random_state=10, max_iter=1000)
# Build the parameter grid under its own name. This cell rebinds `lr_grid` from
# the dict to the fitted search object, so reading the grid back from `lr_grid`
# made the cell fail when it was run a second time.
lr_param_grid = {'penalty': lr_penalty,
                 'class_weight': lr_class_weight,
                 'C': lr_C,
                 'solver': lr_solver}
lr_grid = GridSearchCV(estimator=lr_clsf,
                       param_grid=lr_param_grid,
                       cv=3,
                       scoring=['f1', 'precision', 'recall'],
                       refit='f1',
                       n_jobs=-1,
                       verbose=2)
lr_grid.fit(X_cv, y_cv)

In [None]:
# Best mean cross-validated score of the refit metric found by the grid search.
lr_grid.best_score_

In [None]:
# The search was created with refit='f1', so best_estimator_ has already been
# retrained on all of (X_cv, y_cv); fitting it again only repeated that work.
best_lr_clsf = lr_grid.best_estimator_
best_lr_clsf

In [None]:
# Predict once on the hold-out set and reuse the result for all three metrics
# (printed as: recall, precision, f1).
lr_ho_pred = best_lr_clsf.predict(X_ho)
print(recall_score(y_ho, lr_ho_pred),
      precision_score(y_ho, lr_ho_pred),
      f1_score(y_ho, lr_ho_pred))

In [None]:
# Actual positives, predicted positives and true positives on the hold-out set;
# the prediction is computed once instead of once per printed value.
lr_ho_pred = best_lr_clsf.predict(X_ho)
print(sum(y_ho), sum(lr_ho_pred), sum(y_ho.multiply(lr_ho_pred)))

## И, наконец, XGBoost

In [None]:
# Search space for the XGBoost randomized search.
xgb_learning_rate = list(np.linspace(start=0.001, stop=0.1, num=10))
xgb_n_estimators = [int(value) for value in np.linspace(start=100, stop=1000, num=10)]
xgb_booster = ['gbtree', 'dart']
xgb_colsample_bytree = [0.4, 0.6, 0.8, 1.0]
xgb_colsample_bylevel = [0.5, 0.75, 1.0]
# Single candidate: negatives / positives among the CV labels.
n_cv_positive = sum(y_cv)
xgb_scale_pos_weight = [(len(y_cv) - n_cv_positive) / n_cv_positive]
xgb_min_child_weight = [1]
xgb_subsample = [0.5, 1.0]

random_grid = dict(
    learning_rate=xgb_learning_rate,
    n_estimators=xgb_n_estimators,
    booster=xgb_booster,
    colsample_bytree=xgb_colsample_bytree,
    colsample_bylevel=xgb_colsample_bylevel,
    scale_pos_weight=xgb_scale_pos_weight,
    min_child_weight=xgb_min_child_weight,
    subsample=xgb_subsample,
)

In [None]:
# Randomized search over random_grid for XGBoost: 10 sampled configurations,
# 3-fold CV, scored on f1/precision/recall and refitted on the best f1.
xgb_clsf = xgb.XGBClassifier(random_state=10)
xgb_random = RandomizedSearchCV(
    estimator=xgb_clsf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    random_state=10,
    n_jobs=-1,
    verbose=2,
)
xgb_random.fit(X_cv, y_cv)

In [None]:
# Best mean cross-validated score of the refit metric found by the search.
xgb_random.best_score_

In [None]:
# The search was created with refit='f1', so best_estimator_ has already been
# retrained on all of (X_cv, y_cv); fitting it again only repeated that work.
best_xgb_clsf = xgb_random.best_estimator_
best_xgb_clsf

In [None]:
# Predict once on the hold-out set and reuse the result for all three metrics
# (printed as: recall, precision, f1).
xgb_ho_pred = best_xgb_clsf.predict(X_ho)
print(recall_score(y_ho, xgb_ho_pred),
      precision_score(y_ho, xgb_ho_pred),
      f1_score(y_ho, xgb_ho_pred))

In [None]:
# Actual positives, predicted positives and true positives on the hold-out set;
# the prediction is computed once instead of once per printed value.
xgb_ho_pred = best_xgb_clsf.predict(X_ho)
print(sum(y_ho), sum(xgb_ho_pred), sum(y_ho.multiply(xgb_ho_pred)))

## Для второй серии моделей закодируем категориальный признак, прошкалируем данные, а также будем оверсэмплить класс, находящийся в меньшинстве

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

In [None]:
# Inputs for the second series: columns from position 3 up to (not including)
# the last one, this time keeping 'x80', plus the binary target.
X, y = data_full.iloc[:, 3:-1], data_full['Distressed']

In [None]:

# One-hot encode the categorical 'x80' column. dtype=float keeps the dummy
# columns numeric on every pandas version.
X_encoded = pd.get_dummies(X, columns=['x80'], prefix='x80_', dtype=float)

# Split BEFORE scaling. SSS was built with n_splits=1, so there is exactly one
# pair of positional index arrays; they are applied with .iloc to X and y alike.
train_index, test_index = next(SSS.split(X_encoded, y))
print("CV:", train_index, "HO:", test_index)
X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Fit the scaler on the training rows only and reuse its statistics for the
# hold-out rows; fitting on the full frame leaked hold-out means and variances
# into the training data.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Oversample the minority class in the training part only. fit_resample is the
# current imbalanced-learn method name; the old fit_sample alias has been removed.
# NOTE(review): the searches that follow cross-validate on this already
# oversampled set, so their best_score_ is probably optimistic; resampling
# inside each fold (an imblearn Pipeline) would avoid that.
sm = SMOTE(random_state=10)
X_train, y_train = sm.fit_resample(X_train, y_train)


In [None]:
# Display the hold-out labels of the second split.
y_test

## Случайный лес после обработки признаков

In [None]:
# Search space for the random forest trained on the preprocessed data.
rf_n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# 'auto' was only an alias of 'sqrt' for classifiers (so the old list offered
# one effective option twice) and current scikit-learn rejects it; search
# 'sqrt' against 'log2' instead.
rf_max_features = ['sqrt', 'log2']
rf_max_depth = [int(x) for x in np.linspace(50, 100, num = 10)]
rf_max_depth.append(None)  # None = no depth limit
rf_min_samples_split = [2, 5, 10]
rf_min_samples_leaf = [1, 2, 3, 4]
rf_bootstrap = [True, False]
rf_class_weight = ['balanced', None]

rf_random_grid = {'n_estimators': rf_n_estimators,
               'max_features': rf_max_features,
               'max_depth': rf_max_depth,
               'min_samples_split': rf_min_samples_split,
               'min_samples_leaf': rf_min_samples_leaf,
               'bootstrap': rf_bootstrap,
               'class_weight': rf_class_weight}

In [None]:
# Randomized search over rf_random_grid on the preprocessed training data:
# 10 sampled configurations, 3-fold CV, refitted on the best f1.
rf_clsf = RandomForestClassifier(random_state=10)
rf_random_2 = RandomizedSearchCV(
    estimator=rf_clsf,
    param_distributions=rf_random_grid,
    n_iter=10,
    cv=3,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    random_state=10,
    n_jobs=-1,
    verbose=2,
)
rf_random_2.fit(X_train, y_train)

In [None]:
# The search was created with refit='f1', so best_estimator_ has already been
# retrained on all of (X_train, y_train); fitting it again only repeated that work.
best_rf_clsf = rf_random_2.best_estimator_
best_rf_clsf

In [None]:
# Best mean CV score of the refit metric, then recall / precision / f1 on the
# hold-out set, then actual / predicted / true positives.
# The hold-out prediction is computed once and reused.
print(rf_random_2.best_score_)
rf_test_pred = best_rf_clsf.predict(X_test)
print(recall_score(y_test, rf_test_pred),
      precision_score(y_test, rf_test_pred),
      f1_score(y_test, rf_test_pred))
print(sum(y_test), sum(rf_test_pred), sum(y_test.multiply(rf_test_pred)))

## Логистическая регрессия после обработки признаков

In [None]:
# Search space for logistic regression on the preprocessed data.
lr_penalty = ['l1', 'l2']
lr_class_weight = ['balanced', None]
lr_C = [0.1, 1, 10, 100]
lr_solver = ['liblinear', 'saga']

lr_grid = dict(
    penalty=lr_penalty,
    class_weight=lr_class_weight,
    C=lr_C,
    solver=lr_solver,
)

In [None]:
# Exhaustive grid search over lr_grid on the preprocessed training data:
# 3-fold CV, scored on f1/precision/recall and refitted on the best f1.
lr_clsf = LogisticRegression(random_state=10, max_iter=2000)
lr_grid_2 = GridSearchCV(
    estimator=lr_clsf,
    param_grid=lr_grid,
    cv=3,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    n_jobs=-1,
    verbose=2,
)
lr_grid_2.fit(X_train, y_train)

In [None]:
# The search was created with refit='f1', so best_estimator_ has already been
# retrained on all of (X_train, y_train); fitting it again only repeated that work.
best_lr_clsf = lr_grid_2.best_estimator_
best_lr_clsf

In [None]:
# Best mean CV score of the refit metric, then recall / precision / f1 on the
# hold-out set, then actual / predicted / true positives.
# The hold-out prediction is computed once and reused.
print(lr_grid_2.best_score_)
lr_test_pred = best_lr_clsf.predict(X_test)
print(recall_score(y_test, lr_test_pred),
      precision_score(y_test, lr_test_pred),
      f1_score(y_test, lr_test_pred))
print(sum(y_test), sum(lr_test_pred), sum(y_test.multiply(lr_test_pred)))

## XGBoost после обработки признаков

In [None]:
# Search space for XGBoost on the preprocessed, oversampled training data.
xgb_learning_rate = [x for x in np.linspace(start = 0.001, stop = 0.1, num = 10)]
xgb_n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
xgb_booster = ['gbtree', 'dart']
xgb_colsample_bytree = [0.4, 0.6, 0.8, 1.0]
xgb_colsample_bylevel = [0.5, 0.75, 1.0]
# negatives / positives, taken from the labels this search is fitted on
# (y_train). The value used to be computed from y_cv, the labels of the first
# split, which does not describe the resampled y_train.
n_train_positive = int(np.sum(y_train))
xgb_scale_pos_weight = [(len(y_train) - n_train_positive) / n_train_positive]
xgb_min_child_weight = [1]
xgb_subsample = [0.5, 1.0]


random_grid = {'learning_rate': xgb_learning_rate,
               'n_estimators': xgb_n_estimators,
               'booster': xgb_booster,
               'colsample_bytree': xgb_colsample_bytree,
               'colsample_bylevel': xgb_colsample_bylevel,
               'scale_pos_weight': xgb_scale_pos_weight,
               'min_child_weight': xgb_min_child_weight,
               'subsample': xgb_subsample}

In [None]:
# Randomized search over random_grid for XGBoost on the preprocessed training
# data: 10 sampled configurations, 3-fold CV, refitted on the best f1.
xgb_clsf = xgb.XGBClassifier(random_state=10)
xgb_random_2 = RandomizedSearchCV(
    estimator=xgb_clsf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    random_state=10,
    n_jobs=-1,
    verbose=2,
)
xgb_random_2.fit(X_train, y_train)

In [None]:
# The search was created with refit='f1', so best_estimator_ has already been
# retrained on all of (X_train, y_train); fitting it again only repeated that work.
best_xgb_clsf = xgb_random_2.best_estimator_
best_xgb_clsf

In [None]:
# Best mean CV score of the refit metric, then recall / precision / f1 on the
# hold-out set, then actual / predicted / true positives.
# The hold-out prediction (on the raw array, as before) is computed once and reused.
print(xgb_random_2.best_score_)
xgb_test_pred = best_xgb_clsf.predict(X_test.values)
print(recall_score(y_test, xgb_test_pred),
      precision_score(y_test, xgb_test_pred),
      f1_score(y_test, xgb_test_pred))
print(sum(y_test), sum(xgb_test_pred), sum(y_test.multiply(xgb_test_pred)))