In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
df = pd.read_csv("data.csv")

In [4]:
df.columns = df.columns.str.strip()

In [5]:
df.corr()["Bankrupt?"].abs().sort_values(ascending=False).head(30)

Bankrupt?                                                  1.000000
Net Income to Total Assets                                 0.315457
ROA(A) before interest and % after tax                     0.282941
ROA(B) before interest and depreciation after tax          0.273051
ROA(C) before interest and depreciation before interest    0.260807
Net worth/Assets                                           0.250161
Debt ratio %                                               0.250161
Persistent EPS in the Last Four Seasons                    0.219560
Retained Earnings to Total Assets                          0.217779
Net profit before tax/Paid-in capital                      0.207857
Per Share Net profit before tax (Yuan ¥)                   0.201395
Current Liability to Assets                                0.194494
Working Capital to Total Assets                            0.193083
Net Income to Stockholder's Equity                         0.180987
Borrowing dependency                            

In [6]:
df['Altman_Z_Score'] = (
    1.2 * (df['Working Capital to Total Assets']) +  # A
    1.4 * (df['Retained Earnings to Total Assets']) +  # B
    3.3 * (df['Operating profit/Paid-in capital']) +  # C (EBIT to Total Assets proxy)
    0.6 * (df['Net Value Per Share (A)'] / df['Liability to Equity']) +  # D (Market Value of Equity to Total Liabilities proxy)
    1.0 * (df['Persistent EPS in the Last Four Seasons'] / df['Net worth/Assets'])  # E (Sales to Total Assets proxy)
)

df[['Altman_Z_Score']].head()


Unnamed: 0,Altman_Z_Score
0,2.907602
1,3.151521
2,3.169636
3,2.953751
4,3.100158


In [7]:
selected_columns = ['Bankrupt?','Net Income to Total Assets',
    'ROA(A) before interest and % after tax',
    'Net worth/Assets',
    'Debt ratio %',
    'Persistent EPS in the Last Four Seasons',
    'Net profit before tax/Paid-in capital',
    'Current Liability to Assets',
    'Working Capital to Total Assets',
    'Net Income to Stockholder\'s Equity',
    'Borrowing dependency',
    'Liability to Equity',
    'Net Value Per Share (A)',
    'Operating profit/Paid-in capital',
    'Equity to Long-term Liability',
    'CFO to Assets',"Altman_Z_Score"]
# columns with highest correlation and filtered for redundancy

In [14]:
df_fil = df[selected_columns]

In [16]:
df_fil #unscaled version of the data used for the models

Unnamed: 0,Bankrupt?,Net Income to Total Assets,ROA(A) before interest and % after tax,Net worth/Assets,Debt ratio %,Persistent EPS in the Last Four Seasons,Net profit before tax/Paid-in capital,Current Liability to Assets,Working Capital to Total Assets,Net Income to Stockholder's Equity,Borrowing dependency,Liability to Equity,Net Value Per Share (A),Operating profit/Paid-in capital,Equity to Long-term Liability,CFO to Assets,Altman_Z_Score
0,1,0.716845,0.424389,0.792424,0.207576,0.169141,0.137757,0.147308,0.672775,0.827890,0.390284,0.290202,0.147950,0.095885,0.126549,0.520382,2.907602
1,1,0.795297,0.538214,0.828824,0.171176,0.208944,0.168962,0.056963,0.751111,0.839969,0.376760,0.283846,0.182251,0.093743,0.120916,0.567101,3.151521
2,1,0.774670,0.499019,0.792484,0.207516,0.180581,0.148036,0.098162,0.829502,0.836774,0.379093,0.290189,0.177911,0.092318,0.117922,0.538491,3.169636
3,1,0.739555,0.451265,0.848535,0.151465,0.193722,0.147561,0.098715,0.725754,0.834697,0.379743,0.281721,0.154187,0.077727,0.120760,0.604105,2.953751
4,1,0.795016,0.538432,0.893491,0.106509,0.212537,0.167461,0.110195,0.751822,0.839973,0.375025,0.278514,0.167502,0.096927,0.110933,0.578469,3.100158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0,0.799927,0.539468,0.875382,0.124618,0.216602,0.171111,0.103838,0.817769,0.840359,0.373823,0.279606,0.175045,0.098222,0.112622,0.587178,3.224374
6815,0,0.799748,0.538269,0.900747,0.099253,0.216697,0.171805,0.089901,0.793387,0.840306,0.372505,0.278132,0.181324,0.098572,0.112329,0.569498,3.214767
6816,0,0.797778,0.533744,0.961061,0.038939,0.210929,0.172287,0.024414,0.866047,0.840138,0.369637,0.275789,0.269521,0.100103,0.110933,0.589341,3.480235
6817,0,0.811808,0.559911,0.913021,0.086979,0.228326,0.182498,0.083199,0.832340,0.841084,0.369649,0.277547,0.213392,0.111722,0.110957,0.678338,3.394334


In [18]:
from sklearn.preprocessing import StandardScaler

X = df[selected_columns].drop(columns=['Altman_Z_Score', 'Bankrupt?'])  # Features
y = df['Bankrupt?']  # Target

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)


df_scaled = pd.DataFrame(X_scaled, columns=X.columns)


df_scaled['Altman_Z_Score'] = df['Altman_Z_Score'].values
df_scaled['Bankrupt?'] = y.values

df_scaled.head()


Unnamed: 0,Net Income to Total Assets,ROA(A) before interest and % after tax,Net worth/Assets,Debt ratio %,Persistent EPS in the Last Four Seasons,Net profit before tax/Paid-in capital,Current Liability to Assets,Working Capital to Total Assets,Net Income to Stockholder's Equity,Borrowing dependency,Liability to Equity,Net Value Per Share (A),Operating profit/Paid-in capital,Equity to Long-term Liability,CFO to Assets,Altman_Z_Score,Bankrupt?
0,-2.254317,-2.045798,-1.750845,1.750845,-1.794106,-1.460495,1.126267,-2.393729,-0.861611,0.959784,0.680171,-1.275228,-0.471275,0.558428,-1.247231,2.907602,1
1,-0.309033,-0.311068,-1.075727,1.075727,-0.597379,-0.446793,-0.670363,-1.067133,-0.029804,0.129305,0.240685,-0.250429,-0.548362,0.269947,-0.449376,3.151521,1
2,-0.82051,-0.90842,-1.749724,1.749724,-1.450153,-1.126595,0.148933,0.260402,-0.24982,0.27256,0.679247,-0.380102,-0.599655,0.116638,-0.937979,3.169636,1
3,-1.691222,-1.636209,-0.710131,0.710131,-1.055034,-1.142029,0.159921,-1.496543,-0.392864,0.312459,0.093765,-1.088901,-1.124902,0.261976,0.182559,2.953751,1
4,-0.316008,-0.307745,0.123674,-0.123674,-0.489361,-0.495543,0.388219,-1.055083,-0.029564,0.022791,-0.128014,-0.691067,-0.433757,-0.241268,-0.255245,3.100158,1


In [20]:
np.isinf(df_scaled).sum()

Net Income to Total Assets                 0
ROA(A) before interest and % after tax     0
Net worth/Assets                           0
Debt ratio %                               0
Persistent EPS in the Last Four Seasons    0
Net profit before tax/Paid-in capital      0
Current Liability to Assets                0
Working Capital to Total Assets            0
Net Income to Stockholder's Equity         0
Borrowing dependency                       0
Liability to Equity                        0
Net Value Per Share (A)                    0
Operating profit/Paid-in capital           0
Equity to Long-term Liability              0
CFO to Assets                              0
Altman_Z_Score                             2
Bankrupt?                                  0
dtype: int64

In [22]:
df['Altman_Z_Score'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df.dropna(subset=['Altman_Z_Score'])
print(df_cleaned[['Altman_Z_Score']].isnull().sum())
#df_cleaned is the new user dataframe

Altman_Z_Score    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Altman_Z_Score'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [24]:
# splitting the data into train and test data

In [26]:
X = df_cleaned.drop("Bankrupt?", axis=1)
y = df_cleaned["Bankrupt?"]

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [32]:
# logistic regression done on df_cleaned

In [34]:
from sklearn.linear_model import LogisticRegression

In [36]:
from sklearn.metrics import classification_report, confusion_matrix

log_reg = LogisticRegression(max_iter=10000, class_weight='balanced')
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           0       0.97      0.73      0.83      1324
           1       0.04      0.35      0.07        40

    accuracy                           0.72      1364
   macro avg       0.51      0.54      0.45      1364
weighted avg       0.95      0.72      0.81      1364



In [38]:
df_bankrupt = df_cleaned[df_cleaned['Bankrupt?'] == 1]
df_non_bankrupt = df_cleaned[df_cleaned['Bankrupt?'] == 0]

df_non_bankrupt_under = df_non_bankrupt.sample(len(df_bankrupt), random_state=101)

df_balanced = pd.concat([df_bankrupt, df_non_bankrupt_under])

df_balanced = df_balanced.sample(frac=1, random_state=101).reset_index(drop=True)

X_balanced = df_balanced.drop(columns=['Bankrupt?'])
y_balanced = df_balanced['Bankrupt?']

X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=101)

log_reg_bal = LogisticRegression(max_iter=10000)
log_reg_bal.fit(X_train_bal, y_train_bal)

y_pred_bal = log_reg_bal.predict(X_test_bal)
print(classification_report(y_test_bal, y_pred_bal))
### testing the model with a more balanced dataframe

              precision    recall  f1-score   support

           0       0.60      0.65      0.63        49
           1       0.51      0.46      0.49        39

    accuracy                           0.57        88
   macro avg       0.56      0.56      0.56        88
weighted avg       0.56      0.57      0.56        88



In [40]:
# now training a random forrest

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target variable from df_filtered
X = df_cleaned.drop(columns=['Bankrupt?'])
y = df_cleaned['Bankrupt?']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Initialize the Random Forest model with class_weight='balanced'
rf_model = RandomForestClassifier(n_estimators=100, random_state=101, class_weight='balanced')

# Train the model on the imbalanced training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1324
           1       0.86      0.15      0.26        40

    accuracy                           0.97      1364
   macro avg       0.92      0.57      0.62      1364
weighted avg       0.97      0.97      0.97      1364

Confusion Matrix:
[[1323    1]
 [  34    6]]


In [46]:
# random forrest on the scaled data, df_cleaned

In [48]:
# Separate features and target variable from df_filtered
X = df_cleaned.drop(columns=['Bankrupt?'])
y = df_cleaned['Bankrupt?']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Initialize the Random Forest model with class_weight='balanced'
rf_model = RandomForestClassifier(n_estimators=100, random_state=101, class_weight='balanced')

# Train the model on the imbalanced training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the a
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1324
           1       0.86      0.15      0.26        40

    accuracy                           0.97      1364
   macro avg       0.92      0.57      0.62      1364
weighted avg       0.97      0.97      0.97      1364

Confusion Matrix:
[[1323    1]
 [  34    6]]


In [50]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix

In [51]:
# Define X (features) and y (target) from df_cleaned
X = df_cleaned.drop(columns=['Bankrupt?'])  # Features (scaled data)
y = df_cleaned['Bankrupt?']  # Target (Bankruptcy status)

# Split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Convert y_train and y_test to numpy arrays if needed
y_train = y_train.values if hasattr(y_train, 'values') else y_train
y_test = y_test.values if hasattr(y_test, 'values') else y_test


In [52]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Define the new neural network model
model = Sequential()

# Input layer and first hidden layer with 128 neurons and batch normalization
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())  # Batch normalization to stabilize training
model.add(Dropout(0.4))  # Dropout to prevent overfitting

# Second hidden layer with 64 neurons
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

# Third hidden layer with 32 neurons
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

# Output layer for binary classification
model.add(Dense(1, activation='sigmoid'))

# Step 2: Compile the model with a decaying learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # You can adjust the learning rate if needed
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Step 3: Set class weights to handle the class imbalance
class_weights = {0: 1, 1: 15}  # Moderate class weights to handle imbalance

# Step 4: Train the model
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test), class_weight=class_weights)

# Step 5: Make predictions on the test set
y_pred_nn = model.predict(X_test)
y_pred_nn = (y_pred_nn > 0.5).astype(int)  # Convert probabilities to binary values

# Step 6: Evaluate the model
print(classification_report(y_test, y_pred_nn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nn))


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4703 - loss: 1.2875 - val_accuracy: 0.8805 - val_loss: 0.3982
Epoch 2/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 802us/step - accuracy: 0.6063 - loss: 1.2117 - val_accuracy: 0.7889 - val_loss: 0.4743
Epoch 3/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 790us/step - accuracy: 0.7088 - loss: 0.9346 - val_accuracy: 0.7889 - val_loss: 0.4727
Epoch 4/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 787us/step - accuracy: 0.7292 - loss: 1.0382 - val_accuracy: 0.8358 - val_loss: 0.4644
Epoch 5/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 792us/step - accuracy: 0.7694 - loss: 0.9391 - val_accuracy: 0.8519 - val_loss: 0.4648
Epoch 6/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7772 - loss: 0.9804 - val_accuracy: 0.8372 - val_loss: 0.4638
Epoch 7/100
[1m86/86[0m [32m━━━━━━━

In [53]:
# Assuming we already trained Logistic Regression and Random Forest before, we use their test predictions
log_reg_pred_test = log_reg.predict_proba(X_test)[:, 1]  # Probability of class 1
rf_pred_test = rf_model.predict_proba(X_test)[:, 1]  # Probability of class 1

# Get the predictions from the newly built Neural Network
nn_pred_test = model.predict(X_test).flatten()  # Predictions for class 1 (flatten for compatibility)

# Combine the predictions from the three models into a new test dataset
import numpy as np
X_test_stack = np.column_stack((log_reg_pred_test, rf_pred_test, nn_pred_test))


[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step


In [54]:
# Train the meta-model (Logistic Regression) with class weights to handle class imbalance
meta_model = LogisticRegression(class_weight='balanced')  # Class weight adjustment
meta_model.fit(X_test_stack, y_test)  # Fit the meta-model using the stacked predictions

# Make final predictions on the test set using the meta-model
y_pred_stack = meta_model.predict(X_test_stack)

# Evaluate the stacked model
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_stack))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stack))


              precision    recall  f1-score   support

           0       0.99      0.93      0.96      1324
           1       0.23      0.72      0.35        40

    accuracy                           0.92      1364
   macro avg       0.61      0.83      0.65      1364
weighted avg       0.97      0.92      0.94      1364

Confusion Matrix:
[[1225   99]
 [  11   29]]
