In [4]:
print (' ~ ~ ~ ~ ~ ~ ~ ~  Logarthmic Regression ~ ~ ~ ~ ~ ~ ~ ~ ')

 ~ ~ ~ ~ ~ ~ ~ ~  Logarthmic Regression ~ ~ ~ ~ ~ ~ ~ ~ 


### Problem

<pre> 
    Model the probability that a business goes bankrupt 
    The data set includes the following variables:
      1. industrial_risk: 0=low risk, 0.5=medium risk, 1=high risk.
      2. management_risk: 0=low risk, 0.5=medium risk, 1=high risk.
      3. financial flexibility: 0=low flexibility, 0.5=medium flexibility, 1=high flexibility.
      4. credibility: 0=low credibility, 0.5=medium credibility, 1=high credibility.
      5. competitiveness: 0=low competitiveness, 0.5=medium competitiveness, 1=high competitiveness.
      6. operating_risk: 0=low risk, 0.5=medium risk, 1=high risk.
      7. class: bankruptcy, non-bankruptcy (target variable).able).

</pre>

### Library imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score 
from sklearn.metrics import brier_score_loss, log_loss, matthews_corrcoef

from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

import time as t

import ydata_profiling as yd

import warnings
warnings.filterwarnings(action='ignore')

ModuleNotFoundError: No module named 'ydata_profiling'

## <font color='#e74c3c'> Read data </font>

In [None]:
df = pd.read_csv('bankruptcy-prevention.csv')
df.head()

### <font color='#1abc9c'> Change the name of the target column from 'class' (Python keyword) to 'category'. </font>

In [None]:
df.columns = ['industrial_risk', 'management_risk', 'financial_flexibility',
        'credibility', 'competitiveness', 'operating_risk', 'category']
df.head()

### <font color='#1abc9c'> Maintaining a deep copy of original data </font>

In [None]:
df_orig = df.copy()
id(df), id(df_orig)

## <font color='#e74c3c'> Pandas Profiling Automated EDA </font>

In [None]:
# profile = yd.ProfileReport(df, title="Report")
# profile

## <font color='#e74c3c'> Data Processing and EDA </font>

In [None]:
df.info()

In [None]:
print(f'Unique categories: {df.category.unique()}')

#### <font color='#1abc9c'> Convert the target column 'category' into numeric </font>

In [None]:
df['category'] = df['category'].apply(lambda x: 0 if x=='non-bankruptcy' else 1)

In [None]:
print(f'Unique categories: {df.category.unique()}')
df.head()

In [None]:
print(df['industrial_risk'].unique())
print(df['management_risk'].unique()) 
print(df['financial_flexibility'].unique())
print(df['credibility'].unique())
print(df['competitiveness'].unique())
print(df['operating_risk'].unique())

#### <font color='#1abc9c'> Check for null or NaN values </font>

In [None]:
df.isna().sum(), df.isnull().sum()

In [None]:
df.describe()

#### <font color='#1abc9c'> Check for duplicate values </font>

In [None]:
print(f'Are there any duplicated rows? {df.duplicated().any()}')

In [None]:
#De-duplicating rows
df.drop_duplicates(subset=None, keep='first', inplace=False)

<hr/><font color='#3636a9'> Since most of the feature values are discrete, there are bound to be duplicates. But this is valid data for individual businesses (significant). So it doesn't make sense to de-duplicate the dataset </font><hr/>

#### <font color='#1abc9c'> Calculate VIF </font>

In [None]:
X = df.drop(columns=['category'])
X = add_constant(X)
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif_data

## <font color='#e74c3c'> Visualization </font>

#### <font color='#1abc9c'> Distribution </font>

In [None]:
df.hist(figsize=(10,8))
plt.show()

In [None]:
fig, ax = plt.subplots(2,3)
fig.set_size_inches(15,8)

sns.distplot(df.industrial_risk, ax=ax[0,0], color='#ef6937')
sns.distplot(df.management_risk, ax=ax[0,1], color='#69bf37')
sns.distplot(df.financial_flexibility, ax=ax[0,2], color='#3769ef')
sns.distplot(df.credibility, ax=ax[1,0], color='#3769ef')
sns.distplot(df.competitiveness, ax=ax[1,1], color='#69bf37')
sns.distplot(df.operating_risk, ax=ax[1,2], color='#ef6937')

plt.show()


 <ul> 
     <font color='676799> 
         <li> Distributions of all features indicate multi-modal distribution.  </li>
         <li> All features have discrete values which seem to be representative values of categories </li>
         <li> 'management_risk' and 'operating_risk' show slight negative skewness while 'financial_flexibility' shows positive skewness. The rest of the features are more or less zero skewed. </li>
     </font>
 </ul>


<hr/>
<ul>
    <i>
        <font color='#3636a9'>
            <li>All features show multi-modal distributions</li>
            <li>All features have discrete values which is indicative of representation for categorical data.</li>
            <li>While 'management_risk' and 'operating_risk' show slightly negative skewness, 'financial_flexibility' is showing slightly positive skewness. The rest of the features are showing more or less zero skewness. </li>
        </font>
    </i>
</ul>
<hr/>

In [None]:
fig, ax = plt.subplots(2,3, figsize=(8,5))

features = ['industrial_risk', 'management_risk', 'financial_flexibility',
        'credibility', 'competitiveness', 'operating_risk']

for i in range(0, len(features)):    
    counts = df[features[i]].value_counts().reset_index()
    sns.barplot(data=counts, x='index', y=features[i], ax=ax[i%2, i%3], width=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Calculate frequencies
df_freq = df.iloc[:,:6].apply(lambda x: x.value_counts()).T.fillna(0)

colors = ["#2c805c", "#dfe7e4", "#ff9f3e"]

df_freq.plot(kind='bar', stacked=True, color=colors)
plt.title('Stacked Bar Chart of Features')
plt.xlabel('Features')
plt.ylabel('Frequency')
plt.show()

#### <font color='#1abc9c'> Correlation </font>

In [None]:
corr = df.corr()
corr

In [None]:
sns.heatmap(corr, annot=True, cmap='viridis')
plt.show()

<hr/>
<ul>
    <i>
        <font color='#3636a9'>
            <li>There is a moderate positive correlation between 'competitiveness' and 'financial_flexibility' and 'credibility'</li>
            <li>There is a medium positive correlation between 'financial_flexibility' and 'credibility'</li>
            <li>The target 'category' is negatively correlated with 'competitiveness', 'financial_flexibility' and 'credibility'. This means that if a business is rated lower of 'competitiveness', 'financial_flexibility' and 'credibility' there are higher chances of the business going bankrupt. </li>
        </font>
    </i>
</ul>
<hr/>

In [None]:
sns.pairplot(data=df, hue='category', palette='Set3')
plt.show()

<hr/>
<ul>
    <i>
        <font color='#676799'> The above pair plots indicate the following:
            <li>Low score on 'competitiveness' invariable leads to the business having a high risk of bankruptcy, regardless of its score on any other features.</li>
            <li>A business has higher risk of bankruptcy with higher score on 'industrial_risk' and lower score on 'credibility'</li>
            <li>So also, a business has a higher risk of going bankrupt when it scores low on both 'management_risk' as well as 'credibility'</li>
        </font>
    </i>
</ul>
<hr/>

#### <font color='#1abc9c'> Determining Outliers </font>

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(10, 2))
fig.subplots_adjust(wspace=0.05, hspace=0.75)

sns.boxplot(x='industrial_risk',data=df,palette='hls',ax=ax[0,0], fliersize=4, flierprops={'marker':'o','markerfacecolor':'c'})
sns.boxplot(x='management_risk',data=df,palette='hls',ax=ax[0,1], fliersize=4, flierprops={'marker':'o','markerfacecolor':'c'})
sns.boxplot(x='financial_flexibility',data=df,palette='hls',ax=ax[0,2], fliersize=4, flierprops={'marker':'o','markerfacecolor':'c'})
sns.boxplot(x='credibility',data=df,palette='hls',ax=ax[1,0], fliersize=4, flierprops={'marker':'o','markerfacecolor':'c'})
sns.boxplot(x='operating_risk',data=df,palette='hls',ax=ax[1,1], fliersize=4, flierprops={'marker':'o','markerfacecolor':'c'})
sns.boxplot(x='competitiveness',data=df,palette='hls',ax=ax[1,2], fliersize=4, flierprops={'marker':'o','markerfacecolor':'c'})

ax[0,0].set_xlabel('industrial_risk', color='#f57345')
ax[0,1].set_xlabel('management_risk', color='#f57345')
ax[0,2].set_xlabel('financial_flexibility', color='#f57345')
ax[1,0].set_xlabel('credibility', color='#f57345')
ax[1,1].set_xlabel('operating_risk', color='#f57345')
ax[1,2].set_xlabel('competitiveness', color='#f57345')

ax[0,0].tick_params(labelsize=8)
ax[0,1].tick_params(labelsize=8)
ax[0,2].tick_params(labelsize=8)
ax[1,0].tick_params(labelsize=8)
ax[1,1].tick_params(labelsize=8)
ax[1,2].tick_params(labelsize=8)

plt.show()

<hr/><font color='#3636a9'> There is no indication of any any outliers in any of the feature values. </font><hr/>

## <font color='#e74c3c'> Data Preparation </font>

### <font color='#1abc9c'> Split dataset into features and target </font>

In [None]:
X = df.iloc[:,:6]
y = df.iloc[:,6:]

print(f'X:\r\n{X.head()}\r\n')
print(f'y:\r\n{y.head()}')

In [None]:
print('X:')
X.info()
print('\r\ny:')
y.info()

### <font color='#1abc9c'> Split each dataset into train and test </font>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#### <font color='#1abc9c'> Inspect the datasets </font>

In [None]:
print(f'X_train:\r\n{X_train.head()}')
print(f'X_test:\r\n{X_test.head()}\r\n')
print('---------------------\r\n')
print(f'y_train:{y_train.head()}')
print(f'y_test:{y_test.head()}')

## <font color='#e74c3c'> Modelling - Logistic Regression </font>

### <font color='#1abc9c'> Fit Model </font>

In [None]:
classifier = LogisticRegression()
model0 = classifier.fit(X_train, y_train)

### <font color='#1abc9c'> Predictions - training data </font>

In [None]:
y_pred_train_0 = model0.predict(X_train)
print(y_pred_train_0)

#### <font color='#1abc9c'> Metrics </font>

In [None]:
confusion_matrix(y_train, y_pred_train_0)

In [None]:
print(classification_report(y_train, y_pred_train_0))

In [None]:
accuracy_score(y_train, y_pred_train_0)

<hr/>
<ul>
    <i>
        <font color='#676799'> A very high accuracy score indicates that the model could be overfitted. Let us see how the model performs with test data.
        </font>
    </i>
</ul>
<hr/>

### <font color='#1abc9c'> Predictions - test data </font>

In [None]:
y_pred_test_0 = model0.predict(X_test)
print(y_pred_test_0)

#### <font color='#1abc9c'> Metrics </font>

In [None]:
confusion_matrix(y_test, y_pred_test_0)

In [None]:
print(classification_report(y_test, y_pred_test_0))

In [None]:
accuracy_score(y_test, y_pred_test_0)

<hr/>
<ul>
    <i>
        <font color='#676799'> The model seems to be performing well with test data too. However, there could be lack of variance which may make the model to perform poorly with new data. Therefore we will use K-Fold cross validation to introduce variance into the model fit. 
        </font>
    </i>
</ul>
<hr/>

## <font color='#e74c3c'> Re-Fit Model with Cross Validation </font>

In [None]:
# Start the timer
start_time = t.time()

classifier_cv = LogisticRegressionCV(cv=5, random_state=42)
model_cv = classifier_cv.fit(X_train, y_train)

# End the timer
end_time = t.time()

elapsed_time = end_time - start_time
print(f"Elapsed time for fitting: {elapsed_time:0.5f} seconds")

### <font color='#1abc9c'> Predictions - training data with CV </font>

In [None]:
# Start the timer
start_time = t.time()

y_pred_train = model_cv.predict(X_train)

# End the timer
end_time = t.time()

elapsed_time = end_time - start_time
print(f"Elapsed time for predictions: {elapsed_time:0.5f} seconds")

print(y_pred_train)

#### <font color='#1abc9c'> Metrics (for model with CV) </font>

In [None]:
confusion_matrix(y_train, y_pred_train)

In [None]:
print(classification_report(y_train, y_pred_train))

In [None]:
accuracy_score(y_train, y_pred_train)

<hr/>
<ul>
    <i>
        <font color='#676799'> There doesn't seem to be much of an improvement in the model after applying cross validation. Let us apply Regularization techniques to find out whether it makes any difference. 
        </font>
    </i>
</ul>
<hr/>

## <font color='#e74c3c'> Apply Lasso Regularization </font>

In [None]:
# Start the timer
start_time = t.time()

classifier_l = LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5, random_state=42)
model_l = classifier_l.fit(X_train, y_train)

# End the timer
end_time = t.time()

elapsed_time = end_time - start_time
print(f"Elapsed time for fitting: {elapsed_time:0.5f} seconds")

### <font color='#1abc9c'> Predictions - training data with Lasso applied </font>

In [None]:
# Start the timer
start_time = t.time()

y_pred_train = model_l.predict(X_train)

# End the timer
end_time = t.time()

elapsed_time = end_time - start_time
print(f"Elapsed time for predictions: {elapsed_time:0.5f} seconds")

print(y_pred_train)

#### <font color='#1abc9c'> Metrics (for model with Lasso applied) </font>

In [None]:
confusion_matrix(y_train, y_pred_train)

In [None]:
print(classification_report(y_train, y_pred_train))

In [None]:
accuracy_score(y_train, y_pred_train)

<hr/>
<ul>
    <i>
        <font color='#676799'> There doesn't seem to be much of an improvement in the model after applying Lasso Regularization. Let us apply Ridge Regularization technique to find out whether it makes any difference. 
        </font>
    </i>
</ul>
<hr/>

## <font color='#e74c3c'> Apply Ridge Regularization </font>

In [None]:
# Start the timer
start_time = t.time()

classifier_r = LogisticRegressionCV(penalty='l2', solver='lbfgs', cv=5, random_state=42)
model_r = classifier_r.fit(X_train, y_train)

# End the timer
end_time = t.time()

elapsed_time = end_time - start_time
print(f"Elapsed time for fitting: {elapsed_time:0.5f} seconds")

### <font color='#1abc9c'> Predictions - training data with Ridge applied </font>

In [None]:
# Start the timer
start_time = t.time()

y_pred_train = model_r.predict(X_train)

# End the timer
end_time = t.time()

elapsed_time = end_time - start_time
print(f"Elapsed time for predictions: {elapsed_time:0.5f} seconds")

print(y_pred_train)

#### <font color='#1abc9c'> Metrics (for model with Lasso applied) </font>

In [None]:
confusion_matrix(y_train, y_pred_train)

In [None]:
print(classification_report(y_train, y_pred_train))

In [None]:
accuracy_score(y_train, y_pred_train)

<hr/>
<ul>
    <i>
        <font color='#676799'> There doesn't seem to be much of an improvement in the model after applying Ridge Regularization. We will accept the original model.
        </font>
    </i>
</ul>
<hr/>

## <font color='#e74c3c'> Decision Metrics </font>

### <font color='#1abc9c'> Precision, Recall, F1 and Accuracy </font>

In [None]:
print(f'The precision score is: {precision_score(y_test, y_pred_test_0)}')
print(f'The recall score is: {recall_score(y_test, y_pred_test_0)}')
print(f'The f1 score is: {f1_score(y_test, y_pred_test_0)}')
print(f'The accuracy score is: {accuracy_score(y_test, y_pred_test_0)}')

### <font color='#1abc9c'> ROC Curve and AUC </font>

In [None]:
print(f'The roc_auc score is: {roc_auc_score(y_test, y_pred_test_0)}')

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred_test_0)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

## <font color='#e74c3c'> Evaluation Metrics </font>

## <font color='#1abc9c'> Brier Score </font>

In [None]:
# Brier Score closer to 0 is better
brier_score = brier_score_loss(y_test, y_pred_test_0)
print(f"The Brier score is {brier_score:.4f}")

## <font color='#1abc9c'> Log-Loss Score </font>

In [None]:
# Values closer to 0 indicate better prediction
lls = log_loss(y_test, y_pred_test_0)
print(f'The log loss score is {lls:0.4f}')

## <font color='#1abc9c'> Matthews Correlation Coefficient (MCC) </font>

In [None]:
# MCC=1: Perfect prediction.
# MCC=0: No better than random.
# MCC=−1: Total disagreement between prediction and observation.
mcc = matthews_corrcoef(y_test, y_pred_test_0)
print(f'The Matthews Correlation Coefficient is {mcc:0.4f}')

## <font color='#e74c3c'> Store the Model </font>

In [None]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(model0, f)