In [4]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset
file = "D:/download/taiwanese+bankruptcy+prediction/data.csv"
df = pd.read_csv(file)

# Remove extra spaces in column names
df.columns = df.columns.str.strip()

# List of expected important features
top_20_features = [
    "Operating Profit Rate", "Debt Ratio %", "Net Worth/Total Assets",
    "Interest-bearing debt interest rate", "Net Income to Total Assets",
    "Total expenses/Assets", "Retained Earnings to Total Assets",
    "ROA(C) before interest and depreciation before interest", "Total Asset Turnover",
    "Net Value Per Share (A)", "Cash Flow to Total Assets",
    "Persistent EPS in the Last Four Seasons", "Debt/Equity Ratio",
    "Operating Gross Margin", "Interest Expense Ratio", "Cash/Current Liability",
    "Quick Ratio", "ROA(A) before interest and %", "Total liabilities/Total Assets",
    "Net profit before tax/Paid-in capital"
]

# Find the exact matching column names
existing_features = list(set(df.columns).intersection(set(top_20_features)))

print(f"Found {len(existing_features)} matching features: {existing_features}")

# Ensure we have at least some features to proceed
if len(existing_features) < 10:
    raise ValueError("Not enough matching features found! Please check column names.")

# Define X and y with the matched features
X = df[existing_features]
y = df["Bankrupt?"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Design and Training
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_rf = grid_search.best_estimator_

# Model Evaluation
# Predictions
y_pred = best_rf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print("Random Forest Performance with Top Features:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Found 14 matching features: ['Net profit before tax/Paid-in capital', 'Persistent EPS in the Last Four Seasons', 'Operating Gross Margin', 'Cash Flow to Total Assets', 'Total Asset Turnover', 'Cash/Current Liability', 'Net Value Per Share (A)', 'Quick Ratio', 'Operating Profit Rate', 'Net Income to Total Assets', 'Retained Earnings to Total Assets', 'Interest Expense Ratio', 'ROA(C) before interest and depreciation before interest', 'Interest-bearing debt interest rate']
Random Forest Performance with Top Features:
Accuracy: 0.9699
Precision: 0.6000
Recall: 0.2045
F1 Score: 0.3051

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1320
           1       0.60      0.20      0.31        44

    accuracy                           0.97      1364
   macro avg       0.79      0.60      0.64      1364
weighted avg       0.96      0.97      0.96      1364



In [5]:
import pickle
# Save the trained Random Forest model
with open("bankruptcy_model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

# Save the scaler
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("✅ Model and scaler saved successfully as bankruptcy_model.pkl and scaler.pkl")


✅ Model and scaler saved successfully as bankruptcy_model.pkl and scaler.pkl
