In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, roc_auc_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the sampled panel, parse the publication date, and order the rows
# by company (permno) and then chronologically within each company.
df = (
    pd.read_csv("sampled_10_percent.csv", delimiter=',')
    .assign(public_date=lambda frame: pd.to_datetime(frame['public_date']))
    .sort_values(by=['permno', 'public_date'])
)

In [None]:
# Keep each company's most recent *row* (df is already sorted by permno, public_date).
#
# GroupBy.last() is deliberately avoided here: it returns the last NON-NULL value of
# every column independently, so a company whose final observation has gaps would end
# up with a synthetic row stitched together from older observations (stale features,
# and potentially a stale public_date or label). tail(1) keeps the final observation
# intact; missing feature values stay NaN and are filled by the imputation step below.
#
# NOTE(review): if the target column can be NaN on a company's final row, those rows
# now surface as NaN labels instead of being silently back-filled - confirm upstream.
last_entry_df = df.groupby('permno').tail(1).reset_index(drop=True)

In [None]:
# Model inputs are the 71 columns named X1 ... X71 (in numeric order);
# the label column is 'Bankruptcy'.
features = [f'X{idx}' for idx in range(1, 72)]

target = 'Bankruptcy'

In [None]:
# Boolean row selectors keyed on each company's last observation date.
# Series.between is inclusive on both ends, i.e. equivalent to (>= start) & (<= end).
#   train      : 1970-01-01 .. 2010-12-31
#   test       : 1970-01-01 .. 2020-12-31
#   out-sample : 2011-01-01 .. 2020-12-31
# NOTE(review): the test window contains the whole training window (and the
# out-of-sample window), so every training company is also scored as "test".
# Confirm this overlap is intended before interpreting the test metrics.
train_mask = last_entry_df['public_date'].between('1970-01-01', '2010-12-31')
test_mask = last_entry_df['public_date'].between('1970-01-01', '2020-12-31')
out_sample_mask = last_entry_df['public_date'].between('2011-01-01', '2020-12-31')

In [None]:
# Materialize the three subsets by applying each boolean mask to the per-company frame.
train_df, test_df, out_sample_df = (
    last_entry_df.loc[train_mask],
    last_entry_df.loc[test_mask],
    last_entry_df.loc[out_sample_mask],
)

In [None]:
# Pull the feature matrix (X_*) and label vector (y_*) out of each subset as NumPy arrays.
X_train, y_train = train_df[features].values, train_df[target].values
X_test, y_test = test_df[features].values, test_df[target].values
X_out, y_out = out_sample_df[features].values, out_sample_df[target].values

In [None]:
# Fill missing feature values with per-column means.
# The means are learned from the training matrix only and then reused for the
# test and out-of-sample matrices; each X_* array is replaced by its imputed version.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test, X_out = (imputer.transform(matrix) for matrix in (X_test, X_out))

In [None]:
# Standardize the features: the scaler is fitted on the training matrix and the
# same fitted transform is then applied to the test and out-of-sample matrices.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled, X_out_scaled = (scaler.transform(matrix) for matrix in (X_test, X_out))

In [None]:
# Oversample with SMOTE (fixed seed) to balance the classes.
# Only the scaled training data is resampled; the evaluation sets are left untouched.
sm = SMOTE(random_state=42)
(X_train_resampled,
 y_train_resampled) = sm.fit_resample(X_train_scaled, y_train)

In [None]:
# Discriminant-analysis model (scikit-learn LinearDiscriminantAnalysis, SVD solver),
# fitted on the SMOTE-balanced training data.
lda = LDA(
    solver='svd',
    n_components=None,
    tol=0.0001,
)
lda.fit(X_train_resampled, y_train_resampled)

In [None]:
# LDA on the (non-resampled) scaled training set:
# hard class labels plus the probability column at index 1 (assumed positive class).
y_pred, y_pred_probs = (
    lda.predict(X_train_scaled),
    lda.predict_proba(X_train_scaled)[:, 1],
)

In [None]:
# Score the current y_pred / y_pred_probs against the training labels.
# Label-based metrics use the hard predictions; ROC AUC uses the probabilities.
train_accuracy, train_precision, train_recall, train_f1 = (
    metric(y_train, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
train_roc_auc = roc_auc_score(y_train, y_pred_probs)

# Report each score on its own line, four decimal places.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Precision: {train_precision:.4f}",
    f"Train Recall: {train_recall:.4f}",
    f"Train F1 Score: {train_f1:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current y_pred against the training labels.
cm = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# LDA on the scaled test set (overwrites y_pred / y_pred_probs):
# hard class labels plus the probability column at index 1 (assumed positive class).
y_pred, y_pred_probs = (
    lda.predict(X_test_scaled),
    lda.predict_proba(X_test_scaled)[:, 1],
)


In [None]:
# Score the current y_pred / y_pred_probs against the test labels.
# Label-based metrics use the hard predictions; ROC AUC uses the probabilities.
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
test_roc_auc = roc_auc_score(y_test, y_pred_probs)

# Report each score on its own line, four decimal places.
print(
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1 Score: {test_f1:.4f}",
    f"Test ROC AUC: {test_roc_auc:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current y_pred against the test labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# LDA on the scaled out-of-sample set (overwrites y_pred / y_pred_probs):
# hard class labels plus the probability column at index 1 (assumed positive class).
y_pred, y_pred_probs = (
    lda.predict(X_out_scaled),
    lda.predict_proba(X_out_scaled)[:, 1],
)

In [None]:
# Score the current y_pred / y_pred_probs against the out-of-sample labels.
# Label-based metrics use the hard predictions; ROC AUC uses the probabilities.
out_accuracy, out_precision, out_recall, out_f1 = (
    metric(y_out, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
out_roc_auc = roc_auc_score(y_out, y_pred_probs)

# Report each score on its own line, four decimal places.
print(
    f"Out Accuracy: {out_accuracy:.4f}",
    f"Out Precision: {out_precision:.4f}",
    f"Out Recall: {out_recall:.4f}",
    f"Out F1 Score: {out_f1:.4f}",
    f"Out ROC AUC: {out_roc_auc:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current y_pred against the out-of-sample labels.
cm = confusion_matrix(y_out, y_pred)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Logistic-regression model, fitted on the SMOTE-balanced, standardized training data.
# (LogisticRegression is imported in the notebook's import cell, so every dependency
# is declared in one place and a fresh "Restart & Run All" fails early if it is missing.)
#
# max_iter=10000 gives the solver a generous iteration budget; random_state pins the seed.
log_reg = LogisticRegression(max_iter=10000, random_state=42)

# Train the model (left as the last expression so the fitted estimator is displayed).
log_reg.fit(X_train_resampled, y_train_resampled)


In [None]:
# Logistic regression on the (non-resampled) scaled training set:
# hard class labels plus the probability column at index 1 (positive class).
y_pred_lr, y_pred_probs_lr = (
    log_reg.predict(X_train_scaled),
    log_reg.predict_proba(X_train_scaled)[:, 1],
)

In [None]:
# Score the current logistic-regression predictions against the training labels.
# zero_division=1: a score whose denominator is zero is reported as 1.0 (no warning).
train_accuracy_lr = accuracy_score(y_train, y_pred_lr)
train_precision_lr, train_recall_lr, train_f1_lr = (
    metric(y_train, y_pred_lr, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)
train_roc_auc_lr = roc_auc_score(y_train, y_pred_probs_lr)

# Report each score on its own line, four decimal places.
print(
    f"Logistic Regression Train Accuracy: {train_accuracy_lr:.4f}",
    f"Logistic Regression Train Precision: {train_precision_lr:.4f}",
    f"Logistic Regression Train Recall: {train_recall_lr:.4f}",
    f"Logistic Regression Train F1 Score: {train_f1_lr:.4f}",
    f"Logistic Regression Train ROC AUC: {train_roc_auc_lr:.4f}",
    sep="\n",
)


In [None]:
# Confusion matrix of the current logistic-regression predictions vs. training labels.
cm = confusion_matrix(y_train, y_pred_lr)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Logistic regression on the scaled test set (overwrites y_pred_lr / y_pred_probs_lr):
# hard class labels plus the probability column at index 1 (positive class).
y_pred_lr, y_pred_probs_lr = (
    log_reg.predict(X_test_scaled),
    log_reg.predict_proba(X_test_scaled)[:, 1],
)

In [None]:
# Score the current logistic-regression predictions against the test labels.
# zero_division=1: a score whose denominator is zero is reported as 1.0 (no warning).
test_accuracy_lr = accuracy_score(y_test, y_pred_lr)
test_precision_lr, test_recall_lr, test_f1_lr = (
    metric(y_test, y_pred_lr, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)
test_roc_auc_lr = roc_auc_score(y_test, y_pred_probs_lr)

# Report each score on its own line, four decimal places.
print(
    f"Logistic Regression Test Accuracy: {test_accuracy_lr:.4f}",
    f"Logistic Regression Test Precision: {test_precision_lr:.4f}",
    f"Logistic Regression Test Recall: {test_recall_lr:.4f}",
    f"Logistic Regression Test F1 Score: {test_f1_lr:.4f}",
    f"Logistic Regression Test ROC AUC: {test_roc_auc_lr:.4f}",
    sep="\n",
)


In [None]:
# Confusion matrix of the current logistic-regression predictions vs. test labels.
cm = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Logistic regression on the scaled out-of-sample set
# (overwrites y_pred_lr / y_pred_probs_lr):
# hard class labels plus the probability column at index 1 (positive class).
y_pred_lr, y_pred_probs_lr = (
    log_reg.predict(X_out_scaled),
    log_reg.predict_proba(X_out_scaled)[:, 1],
)


In [None]:
# Score the current logistic-regression predictions against the out-of-sample labels.
# zero_division=1: a score whose denominator is zero is reported as 1.0 (no warning).
out_accuracy_lr = accuracy_score(y_out, y_pred_lr)
out_precision_lr, out_recall_lr, out_f1_lr = (
    metric(y_out, y_pred_lr, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)
out_roc_auc_lr = roc_auc_score(y_out, y_pred_probs_lr)

# Report each score on its own line, four decimal places.
print(
    f"Logistic Regression out Accuracy: {out_accuracy_lr:.4f}",
    f"Logistic Regression out Precision: {out_precision_lr:.4f}",
    f"Logistic Regression out Recall: {out_recall_lr:.4f}",
    f"Logistic Regression out F1 Score: {out_f1_lr:.4f}",
    f"Logistic Regression out ROC AUC: {out_roc_auc_lr:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current logistic-regression predictions vs. out-of-sample labels.
cm = confusion_matrix(y_out, y_pred_lr)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Random-forest model, fitted on the SMOTE-balanced, standardized training data.
# (RandomForestClassifier is imported in the notebook's import cell, so every dependency
# is declared in one place and a fresh "Restart & Run All" fails early if it is missing.)
random_forest = RandomForestClassifier(
    n_estimators=50,  # Reduced number of trees
    max_depth=10,     # Limiting depth of each tree
    n_jobs=-1,        # Use all available cores
    random_state=42   # Fixed seed so the fitted forest is reproducible
)

# Train the model (left as the last expression so the fitted estimator is displayed).
random_forest.fit(X_train_resampled, y_train_resampled)

In [None]:
# Random forest on the (non-resampled) scaled training set:
# hard class labels plus the probability column at index 1 (positive class).
y_pred_rf, y_pred_probs_rf = (
    random_forest.predict(X_train_scaled),
    random_forest.predict_proba(X_train_scaled)[:, 1],
)

In [None]:
# Score the current random-forest predictions against the training labels.
# zero_division=1: a score whose denominator is zero is reported as 1.0 (no warning).
train_accuracy_rf = accuracy_score(y_train, y_pred_rf)
train_precision_rf, train_recall_rf, train_f1_rf = (
    metric(y_train, y_pred_rf, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)
train_roc_auc_rf = roc_auc_score(y_train, y_pred_probs_rf)

# Report each score on its own line, four decimal places.
print(
    f"Random Forest Train Accuracy: {train_accuracy_rf:.4f}",
    f"Random Forest Train Precision: {train_precision_rf:.4f}",
    f"Random Forest Train Recall: {train_recall_rf:.4f}",
    f"Random Forest Train F1 Score: {train_f1_rf:.4f}",
    f"Random Forest Train ROC AUC: {train_roc_auc_rf:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current random-forest predictions vs. training labels.
cm = confusion_matrix(y_train, y_pred_rf)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Random forest on the scaled test set (overwrites y_pred_rf / y_pred_probs_rf):
# hard class labels plus the probability column at index 1 (positive class).
y_pred_rf, y_pred_probs_rf = (
    random_forest.predict(X_test_scaled),
    random_forest.predict_proba(X_test_scaled)[:, 1],
)

In [None]:
# Score the current random-forest predictions against the test labels.
# zero_division=1: a score whose denominator is zero is reported as 1.0 (no warning).
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)
test_precision_rf, test_recall_rf, test_f1_rf = (
    metric(y_test, y_pred_rf, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)
test_roc_auc_rf = roc_auc_score(y_test, y_pred_probs_rf)

# Report each score on its own line, four decimal places.
print(
    f"Random Forest Test Accuracy: {test_accuracy_rf:.4f}",
    f"Random Forest Test Precision: {test_precision_rf:.4f}",
    f"Random Forest Test Recall: {test_recall_rf:.4f}",
    f"Random Forest Test F1 Score: {test_f1_rf:.4f}",
    f"Random Forest Test ROC AUC: {test_roc_auc_rf:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current random-forest predictions vs. test labels.
cm = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Random forest on the scaled out-of-sample set
# (overwrites y_pred_rf / y_pred_probs_rf):
# hard class labels plus the probability column at index 1 (positive class).
y_pred_rf, y_pred_probs_rf = (
    random_forest.predict(X_out_scaled),
    random_forest.predict_proba(X_out_scaled)[:, 1],
)

In [None]:
# Score the current random-forest predictions against the out-of-sample labels.
# zero_division=1: a score whose denominator is zero is reported as 1.0 (no warning).
out_accuracy_rf = accuracy_score(y_out, y_pred_rf)
out_precision_rf, out_recall_rf, out_f1_rf = (
    metric(y_out, y_pred_rf, zero_division=1)
    for metric in (precision_score, recall_score, f1_score)
)
out_roc_auc_rf = roc_auc_score(y_out, y_pred_probs_rf)

# Report each score on its own line, four decimal places.
print(
    f"Random Forest out Accuracy: {out_accuracy_rf:.4f}",
    f"Random Forest out Precision: {out_precision_rf:.4f}",
    f"Random Forest out Recall: {out_recall_rf:.4f}",
    f"Random Forest out F1 Score: {out_f1_rf:.4f}",
    f"Random Forest out ROC AUC: {out_roc_auc_rf:.4f}",
    sep="\n",
)

In [None]:
# Confusion matrix of the current random-forest predictions vs. out-of-sample labels.
cm = confusion_matrix(y_out, y_pred_rf)
print("Confusion Matrix:", cm, sep="\n")
