In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, roc_auc_score, recall_score, f1_score


In [2]:
# Load the raw panel, parse the date column, and order it by company and date.
def _load_panel(csv_path):
    """Read the CSV at `csv_path` and return it sorted by (permno, public_date).

    `public_date` is converted to datetimes so the later date-window
    comparisons are chronological rather than string-based.
    """
    panel = pd.read_csv(csv_path, delimiter=',')
    panel['public_date'] = pd.to_datetime(panel['public_date'])
    return panel.sort_values(by=['permno', 'public_date'])


df = _load_panel("alldata36.csv")

In [3]:
# Collapse the panel to one row per company (permno).
# NOTE(review): GroupBy.last() takes the last *non-null* value of each column
# within the group, so a row here can combine values from different dates when
# the final observation has gaps. Confirm that this is intended; a literal
# "last row" would need a different selection.
by_company = df.groupby('permno')
last_entry_df = by_company.last().reset_index()

In [4]:
# Model inputs: the column names selected from the per-company frame, in the
# order in which they become columns of the feature matrices.
features = [
    'capital_ratio', 'equity_invcap', 'debt_invcap', 'totdebt_invcap',
    'at_turn', 'inv_turn', 'pay_turn', 'rect_turn',
    'sale_equity', 'sale_invcap', 'sale_nwc', 'invt_act',
    'rect_act', 'fcf_ocf', 'ocf_lct', 'cash_debt',
    'cash_lt', 'cfm', 'short_debt', 'profit_lct',
    'curr_debt', 'debt_ebitda', 'dltt_be', 'int_debt',
    'int_totdebt', 'lt_debt', 'lt_ppent', 'cash_conversion',
    'cash_ratio', 'curr_ratio', 'quick_ratio', 'accrual',
    'rd_sale', 'adv_sale', 'staff_sale', 'efftax',
    'gprof', 'aftret_eq', 'aftret_equity', 'aftret_invcapx',
    'gpm', 'npm', 'opmad', 'opmbd',
    'pretret_earnat', 'pretret_noa', 'ptpm', 'roa',
    'roce', 'roe', 'de_ratio', 'debt_assets',
    'debt_at', 'debt_capital', 'intcov', 'intcov_ratio',
    'dpr', 'bm', 'capei', 'divyield',
    'evm', 'pcf', 'pe_exi', 'pe_inc',
    'pe_op_basic', 'pe_op_dil', 'ps', 'ptb',
    'peg_1yrforward', 'peg_ltgforward', 'peg_trailing',
]

# Name of the label column the classifiers are trained to predict.
target = 'Bankruptcy'

In [5]:
# Boolean row masks over last_entry_df, keyed on each company's public_date.
def _dated_within(first_day, last_day):
    """Mask of rows whose public_date lies in [first_day, last_day], inclusive."""
    entry_dates = last_entry_df['public_date']
    return (entry_dates >= first_day) & (entry_dates <= last_day)


# NOTE(review): the test window (1970-2020) contains both the training window
# (1970-2010) and the out-of-sample window (2011-2020), so every training row is
# also a test row. Metrics on the "test" split are therefore not hold-out
# estimates; only the out-of-sample split is disjoint from training.
train_mask = _dated_within('1970-01-01', '2010-12-31')
test_mask = _dated_within('1970-01-01', '2020-12-31')
out_sample_mask = _dated_within('2011-01-01', '2020-12-31')

In [6]:
# Materialise the three evaluation frames from their date masks.
# NOTE(review): as the masks are defined, test_df is a superset of train_df and
# out_sample_df rather than a separate hold-out set.
train_df, test_df, out_sample_df = (
    last_entry_df[row_mask]
    for row_mask in (train_mask, test_mask, out_sample_mask)
)

In [7]:
# Convert each split into a NumPy feature matrix and label vector.
def _features_and_labels(frame):
    """Return (frame[features].values, frame[target].values) for one split."""
    return frame[features].values, frame[target].values


X_train, y_train = _features_and_labels(train_df)
X_test, y_test = _features_and_labels(test_df)
X_out, y_out = _features_and_labels(out_sample_df)

In [8]:
# Fill missing feature values with per-column means.
# The means are learned from the training matrix only and then applied
# unchanged to all three splits, so no statistics come from evaluation data.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train, X_test, X_out = (
    imputer.transform(matrix) for matrix in (X_train, X_test, X_out)
)

In [9]:
# Standardise the features. The scaler is fitted on the (imputed) training
# matrix and the same fitted transform is reused for the other splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, X_out_scaled = (
    scaler.transform(matrix) for matrix in (X_train, X_test, X_out)
)

In [10]:
# Rebalance the training classes with SMOTE (seeded for reproducibility).
# Only the training split is resampled; the evaluation cells score the models
# on the un-resampled X_*_scaled arrays.
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X=X_train_scaled, y=y_train)

In [11]:
# Fit the discriminant-analysis (MDA) model, using scikit-learn's
# LinearDiscriminantAnalysis, on the SMOTE-balanced training data.
lda = LDA(solver='svd', n_components=None, tol=1e-4)
lda.fit(X=X_train_resampled, y=y_train_resampled)

LinearDiscriminantAnalysis()

In [12]:
# LDA predictions on the original (un-resampled) training split.
# Column 1 of predict_proba is taken as the positive-class probability; this
# presumes the positive label is the second entry of lda.classes_ (confirm).
y_pred, y_pred_probs = (
    lda.predict(X_train_scaled),
    lda.predict_proba(X_train_scaled)[:, 1],
)

In [13]:
# Score the LDA training predictions: the four label-based metrics use the hard
# predictions, while ROC AUC uses the positive-class probabilities.
train_accuracy, train_precision, train_recall, train_f1 = (
    metric(y_train, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
train_roc_auc = roc_auc_score(y_train, y_pred_probs)

# Report every metric to four decimals, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Precision: {train_precision:.4f}",
    f"Train Recall: {train_recall:.4f}",
    f"Train F1 Score: {train_f1:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    sep="\n",
)

Train Accuracy: 0.7627
Train Precision: 0.4046
Train Recall: 0.7474
Train F1 Score: 0.5250
Train ROC AUC: 0.8188


In [14]:
# Confusion matrix for the LDA training predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[6822 2085]
 [ 479 1417]]


In [15]:
# LDA predictions on the test split (this overwrites the training predictions).
# Column 1 of predict_proba is taken as the positive-class probability; this
# presumes the positive label is the second entry of lda.classes_ (confirm).
y_pred, y_pred_probs = (
    lda.predict(X_test_scaled),
    lda.predict_proba(X_test_scaled)[:, 1],
)


In [16]:
# Score the LDA test predictions: label-based metrics from the hard
# predictions, ROC AUC from the positive-class probabilities.
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
test_roc_auc = roc_auc_score(y_test, y_pred_probs)

# Report every metric to four decimals, one per line.
print(
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1 Score: {test_f1:.4f}",
    f"Test ROC AUC: {test_roc_auc:.4f}",
    sep="\n",
)

Test Accuracy: 0.7626
Test Precision: 0.3904
Test Recall: 0.7380
Test F1 Score: 0.5106
Test ROC AUC: 0.8153


In [17]:
# Confusion matrix for the LDA test predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[8180 2477]
 [ 563 1586]]


In [18]:
# LDA predictions on the out-of-sample split (overwrites the test predictions).
# Column 1 of predict_proba is taken as the positive-class probability; this
# presumes the positive label is the second entry of lda.classes_ (confirm).
y_pred, y_pred_probs = (
    lda.predict(X_out_scaled),
    lda.predict_proba(X_out_scaled)[:, 1],
)

In [19]:
# Score the LDA out-of-sample predictions: label-based metrics from the hard
# predictions, ROC AUC from the positive-class probabilities.
out_accuracy, out_precision, out_recall, out_f1 = (
    metric(y_out, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
out_roc_auc = roc_auc_score(y_out, y_pred_probs)

# Report every metric to four decimals, one per line.
print(
    f"Out Accuracy: {out_accuracy:.4f}",
    f"Out Precision: {out_precision:.4f}",
    f"Out Recall: {out_recall:.4f}",
    f"Out F1 Score: {out_f1:.4f}",
    f"Out ROC AUC: {out_roc_auc:.4f}",
    sep="\n",
)

Out Accuracy: 0.7624
Out Precision: 0.3012
Out Recall: 0.6680
Out F1 Score: 0.4152
Out ROC AUC: 0.7879


In [20]:
# Confusion matrix for the LDA out-of-sample predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_out, y_pred)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[1358  392]
 [  84  169]]


In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline: a generous iteration cap so the solver can
# converge, and a fixed seed for repeatable runs.
log_reg = LogisticRegression(random_state=42, max_iter=10000)

# Fit on the SMOTE-balanced training data.
log_reg.fit(X=X_train_resampled, y=y_train_resampled)


LogisticRegression(max_iter=10000, random_state=42)

In [22]:
# Logistic-regression predictions on the original (un-resampled) training split.
# Column 1 of predict_proba is used as the positive-class probability.
y_pred_lr, y_pred_probs_lr = (
    log_reg.predict(X_train_scaled),
    log_reg.predict_proba(X_train_scaled)[:, 1],
)

In [23]:
# Score the logistic regression against the original training labels.
train_accuracy_lr = accuracy_score(y_train, y_pred_lr)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
train_precision_lr, train_recall_lr, train_f1_lr = (
    metric(y_train, y_pred_lr, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
train_roc_auc_lr = roc_auc_score(y_train, y_pred_probs_lr)

# Report every metric to four decimals, one per line.
print(
    f"Logistic Regression Train Accuracy: {train_accuracy_lr:.4f}",
    f"Logistic Regression Train Precision: {train_precision_lr:.4f}",
    f"Logistic Regression Train Recall: {train_recall_lr:.4f}",
    f"Logistic Regression Train F1 Score: {train_f1_lr:.4f}",
    f"Logistic Regression Train ROC AUC: {train_roc_auc_lr:.4f}",
    sep="\n",
)


Logistic Regression Train Accuracy: 0.7790
Logistic Regression Train Precision: 0.4290
Logistic Regression Train Recall: 0.7838
Logistic Regression Train F1 Score: 0.5545
Logistic Regression Train ROC AUC: 0.8390


In [24]:
# Confusion matrix for the logistic-regression training predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_train, y_pred_lr)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[6929 1978]
 [ 410 1486]]


In [25]:
# Logistic-regression predictions on the test split (overwrites the training
# predictions). Column 1 of predict_proba is the positive-class probability.
y_pred_lr, y_pred_probs_lr = (
    log_reg.predict(X_test_scaled),
    log_reg.predict_proba(X_test_scaled)[:, 1],
)

In [26]:
# Score the logistic regression on the test split.
test_accuracy_lr = accuracy_score(y_test, y_pred_lr)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
test_precision_lr, test_recall_lr, test_f1_lr = (
    metric(y_test, y_pred_lr, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
test_roc_auc_lr = roc_auc_score(y_test, y_pred_probs_lr)

# Report every metric to four decimals, one per line.
print(
    f"Logistic Regression Test Accuracy: {test_accuracy_lr:.4f}",
    f"Logistic Regression Test Precision: {test_precision_lr:.4f}",
    f"Logistic Regression Test Recall: {test_recall_lr:.4f}",
    f"Logistic Regression Test F1 Score: {test_f1_lr:.4f}",
    f"Logistic Regression Test ROC AUC: {test_roc_auc_lr:.4f}",
    sep="\n",
)


Logistic Regression Test Accuracy: 0.7816
Logistic Regression Test Precision: 0.4186
Logistic Regression Test Recall: 0.7748
Logistic Regression Test F1 Score: 0.5435
Logistic Regression Test ROC AUC: 0.8379


In [27]:
# Confusion matrix for the logistic-regression test predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[8344 2313]
 [ 484 1665]]


In [28]:
# Logistic-regression predictions on the out-of-sample split (overwrites the
# test predictions). Column 1 of predict_proba is the positive-class probability.
y_pred_lr, y_pred_probs_lr = (
    log_reg.predict(X_out_scaled),
    log_reg.predict_proba(X_out_scaled)[:, 1],
)


In [29]:
# Score the logistic regression on the out-of-sample split.
out_accuracy_lr = accuracy_score(y_out, y_pred_lr)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
out_precision_lr, out_recall_lr, out_f1_lr = (
    metric(y_out, y_pred_lr, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
out_roc_auc_lr = roc_auc_score(y_out, y_pred_probs_lr)

# Report every metric to four decimals, one per line.
print(
    f"Logistic Regression out Accuracy: {out_accuracy_lr:.4f}",
    f"Logistic Regression out Precision: {out_precision_lr:.4f}",
    f"Logistic Regression out Recall: {out_recall_lr:.4f}",
    f"Logistic Regression out F1 Score: {out_f1_lr:.4f}",
    f"Logistic Regression out ROC AUC: {out_roc_auc_lr:.4f}",
    sep="\n",
)

Logistic Regression out Accuracy: 0.7958
Logistic Regression out Precision: 0.3482
Logistic Regression out Recall: 0.7075
Logistic Regression out F1 Score: 0.4668
Logistic Regression out ROC AUC: 0.8236


In [30]:
# Confusion matrix for the logistic-regression out-of-sample predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_out, y_pred_lr)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[1415  335]
 [  74  179]]


In [31]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a deliberately constrained configuration.
random_forest = RandomForestClassifier(
    random_state=42,   # fixed seed for repeatable forests
    n_jobs=-1,         # use all available cores
    max_depth=10,      # limit the depth of each tree
    n_estimators=50,   # reduced number of trees
)

# Fit on the SMOTE-balanced training data.
random_forest.fit(X=X_train_resampled, y=y_train_resampled)

RandomForestClassifier(max_depth=10, n_estimators=50, n_jobs=-1,
                       random_state=42)

In [32]:
# Random-forest predictions on the original (un-resampled) training split.
# Column 1 of predict_proba is used as the positive-class probability.
y_pred_rf, y_pred_probs_rf = (
    random_forest.predict(X_train_scaled),
    random_forest.predict_proba(X_train_scaled)[:, 1],
)

In [33]:
# Score the random forest against the original training labels.
train_accuracy_rf = accuracy_score(y_train, y_pred_rf)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
train_precision_rf, train_recall_rf, train_f1_rf = (
    metric(y_train, y_pred_rf, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
train_roc_auc_rf = roc_auc_score(y_train, y_pred_probs_rf)

# Report every metric to four decimals, one per line.
print(
    f"Random Forest Train Accuracy: {train_accuracy_rf:.4f}",
    f"Random Forest Train Precision: {train_precision_rf:.4f}",
    f"Random Forest Train Recall: {train_recall_rf:.4f}",
    f"Random Forest Train F1 Score: {train_f1_rf:.4f}",
    f"Random Forest Train ROC AUC: {train_roc_auc_rf:.4f}",
    sep="\n",
)

Random Forest Train Accuracy: 0.8822
Random Forest Train Precision: 0.6040
Random Forest Train Recall: 0.9541
Random Forest Train F1 Score: 0.7397
Random Forest Train ROC AUC: 0.9644


In [34]:
# Confusion matrix for the random-forest training predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_train, y_pred_rf)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[7721 1186]
 [  87 1809]]


In [35]:
# Random-forest predictions on the test split (overwrites the training
# predictions). Column 1 of predict_proba is the positive-class probability.
y_pred_rf, y_pred_probs_rf = (
    random_forest.predict(X_test_scaled),
    random_forest.predict_proba(X_test_scaled)[:, 1],
)

In [36]:
# Score the random forest on the test split.
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
test_precision_rf, test_recall_rf, test_f1_rf = (
    metric(y_test, y_pred_rf, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
test_roc_auc_rf = roc_auc_score(y_test, y_pred_probs_rf)

# Report every metric to four decimals, one per line.
print(
    f"Random Forest Test Accuracy: {test_accuracy_rf:.4f}",
    f"Random Forest Test Precision: {test_precision_rf:.4f}",
    f"Random Forest Test Recall: {test_recall_rf:.4f}",
    f"Random Forest Test F1 Score: {test_f1_rf:.4f}",
    f"Random Forest Test ROC AUC: {test_roc_auc_rf:.4f}",
    sep="\n",
)

Random Forest Test Accuracy: 0.8740
Random Forest Test Precision: 0.5763
Random Forest Test Recall: 0.9423
Random Forest Test F1 Score: 0.7152
Random Forest Test ROC AUC: 0.9566


In [37]:
# Confusion matrix for the random-forest test predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[9168 1489]
 [ 124 2025]]


In [38]:
# Random-forest predictions on the out-of-sample split (overwrites the test
# predictions). Column 1 of predict_proba is the positive-class probability.
y_pred_rf, y_pred_probs_rf = (
    random_forest.predict(X_out_scaled),
    random_forest.predict_proba(X_out_scaled)[:, 1],
)

In [39]:
# Score the random forest on the out-of-sample split.
out_accuracy_rf = accuracy_score(y_out, y_pred_rf)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
out_precision_rf, out_recall_rf, out_f1_rf = (
    metric(y_out, y_pred_rf, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
out_roc_auc_rf = roc_auc_score(y_out, y_pred_probs_rf)

# Report every metric to four decimals, one per line.
print(
    f"Random Forest out Accuracy: {out_accuracy_rf:.4f}",
    f"Random Forest out Precision: {out_precision_rf:.4f}",
    f"Random Forest out Recall: {out_recall_rf:.4f}",
    f"Random Forest out F1 Score: {out_f1_rf:.4f}",
    f"Random Forest out ROC AUC: {out_roc_auc_rf:.4f}",
    sep="\n",
)

Random Forest out Accuracy: 0.8303
Random Forest out Precision: 0.4162
Random Forest out Recall: 0.8538
Random Forest out F1 Score: 0.5596
Random Forest out ROC AUC: 0.9094


In [40]:
# Confusion matrix for the random-forest out-of-sample predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_out, y_pred_rf)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[1447  303]
 [  37  216]]


In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

# Single decision tree with library-default growth settings; only the seed is
# fixed so that repeated runs build the same tree.
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit on the SMOTE-balanced training data.
decision_tree.fit(X=X_train_resampled, y=y_train_resampled)


DecisionTreeClassifier(random_state=42)

In [47]:
# Decision-tree predictions on the test split.
# Column 1 of predict_proba is used as the positive-class probability.
y_pred_dt, y_pred_probs_dt = (
    decision_tree.predict(X_test_scaled),
    decision_tree.predict_proba(X_test_scaled)[:, 1],
)

In [48]:
# Score the decision tree on the test split.
# NOTE(review): the test split's date window (1970-2020) includes the training
# window (1970-2010), so these figures are partly measured on training rows;
# compare them with the out-of-sample metrics before drawing conclusions.
test_accuracy_dt = accuracy_score(y_test, y_pred_dt)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
test_precision_dt, test_recall_dt, test_f1_dt = (
    metric(y_test, y_pred_dt, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
test_roc_auc_dt = roc_auc_score(y_test, y_pred_probs_dt)

# Report every metric to four decimals, one per line.
print(
    f"Decision Tree Test Accuracy: {test_accuracy_dt:.4f}",
    f"Decision Tree Test Precision: {test_precision_dt:.4f}",
    f"Decision Tree Test Recall: {test_recall_dt:.4f}",
    f"Decision Tree Test F1 Score: {test_f1_dt:.4f}",
    f"Decision Tree Test ROC AUC: {test_roc_auc_dt:.4f}",
    sep="\n",
)


Decision Tree Test Accuracy: 0.9649
Decision Tree Test Precision: 0.8565
Decision Tree Test Recall: 0.9497
Decision Tree Test F1 Score: 0.9007
Decision Tree Test ROC AUC: 0.9594


In [49]:
# Decision-tree predictions on the out-of-sample split (overwrites the test
# predictions). Column 1 of predict_proba is the positive-class probability.
y_pred_dt, y_pred_probs_dt = (
    decision_tree.predict(X_out_scaled),
    decision_tree.predict_proba(X_out_scaled)[:, 1],
)

In [50]:
# Score the decision tree on the out-of-sample split.
out_accuracy_dt = accuracy_score(y_out, y_pred_dt)
# zero_division=0: when a metric is undefined (e.g. no positives predicted),
# report 0 rather than a misleading perfect score of 1. This also matches the
# effective fallback value used by the LDA metric cells.
out_precision_dt, out_recall_dt, out_f1_dt = (
    metric(y_out, y_pred_dt, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC AUC is computed from the positive-class probabilities, not hard labels.
out_roc_auc_dt = roc_auc_score(y_out, y_pred_probs_dt)

# Report every metric to four decimals, one per line.
print(
    f"Decision Tree Out-Sample Accuracy: {out_accuracy_dt:.4f}",
    f"Decision Tree Out-Sample Precision: {out_precision_dt:.4f}",
    f"Decision Tree Out-Sample Recall: {out_recall_dt:.4f}",
    f"Decision Tree Out-Sample F1 Score: {out_f1_dt:.4f}",
    f"Decision Tree Out-Sample ROC AUC: {out_roc_auc_dt:.4f}",
    sep="\n",
)


Decision Tree Out-Sample Accuracy: 0.7768
Decision Tree Out-Sample Precision: 0.3020
Decision Tree Out-Sample Recall: 0.5850
Decision Tree Out-Sample F1 Score: 0.3984
Decision Tree Out-Sample ROC AUC: 0.6948


In [51]:
# Confusion matrix for the decision-tree out-of-sample predictions
# (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_out, y_pred_dt)
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[1408  342]
 [ 105  148]]
