# Ryan Camburn
# I will be analyzing cluster 3 for this project

In [64]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# 3.3 Part 1: Cluster ID Prediction

In [65]:
# Step 1 - read the transformed training set. Besides the feature columns it
# carries the 'Cluster' assignment and the 'Bankrupt?' target.
df = pd.read_csv('../data/train_data_transformed.csv')

# Step 2 - the cluster id is the label for this part; every column other than
# the two label columns is used as a feature.
y = df['Cluster']
X = df.drop(['Cluster', 'Bankrupt?'], axis=1)

# Step 3 - fit a class-weighted random forest on the whole training set.
# There is no hold-out split: the model is scored on the rows it was fit on,
# and overfitting is accepted for this part.
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X, y)

# Step 4 - training-set accuracy and per-class precision/recall.
y_pred = clf.predict(X)
train_acc = accuracy_score(y, y_pred)
print(f"Training accuracy (cluster prediction): {train_acc:.4f}")
print("Classification Report:", classification_report(y, y_pred), sep="\n")

# Step 5 - confusion matrix, with rows and columns labelled by the sorted
# cluster ids so the table is readable on its own.
labels = sorted(y.unique())
cm = confusion_matrix(y, y_pred, labels=labels)
print("\nConfusion Matrix:", pd.DataFrame(cm, index=labels, columns=labels), sep="\n")

# Step 6 - rank features by the forest's importances and show the ten largest.
importances = pd.Series(clf.feature_importances_, index=X.columns)
top10 = importances.sort_values(ascending=False).iloc[:10]
print("\nTop 10 features for cluster prediction:", top10, sep="\n")

Training accuracy (cluster prediction): 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2118
           1       1.00      1.00      1.00      1742
           2       1.00      1.00      1.00      1485
           3       1.00      1.00      1.00       462

    accuracy                           1.00      5807
   macro avg       1.00      1.00      1.00      5807
weighted avg       1.00      1.00      1.00      5807


Confusion Matrix:
      0     1     2    3
0  2118     0     0    0
1     0  1742     0    0
2     0     0  1485    0
3     0     0     0  462

Top 10 features for cluster prediction:
Operating Expense Rate            0.372180
Cash Turnover Rate                0.267034
Cash/Current Liability            0.080804
Total expense/Assets              0.073301
Cash/Total Assets                 0.061999
Revenue per person                0.038513
Cash Flow to Equity               0.015817
Quick Ass

# 3.3 Part 2: Stacking Model

In [66]:
# Reload the transformed training set so this cell does not depend on `df`
# from an earlier cell, then keep only the rows assigned to cluster 3.
# .copy() makes df3 an independent frame rather than a view of df.
df = pd.read_csv('../data/train_data_transformed.csv')
df3 = df.loc[df['Cluster'].eq(3)].copy()

# Subset size and the sum of its 'Bankrupt?' column
# (a count of bankrupt companies, assuming the column is coded 0/1).
print(f"Number of Companies in Cluster 3: {len(df3)}")
print(f"Number of Bankrupted Companies in Cluster 3: {df3['Bankrupt?'].sum()}")

# NOTE: X and y are rebound here. From this cell on they refer to the
# cluster-3 subset with the bankruptcy target, not to the full data set.
y = df3['Bankrupt?']
X = df3.drop(['Cluster', 'Bankrupt?'], axis=1)

# Last expression of the cell: rendered as the cell's output.
df3.head()

Number of Companies in Cluster 3: 462
Number of Bankrupted Companies in Cluster 3: 5


Unnamed: 0,Cash Flow to Equity,Retained Earnings to Total Assets,Revenue per person,Current Liability to Current Assets,Equity to Liability,Cash/Total Assets,Total expense/Assets,Liability-Assets Flag,Total debt/Total net worth,Operating profit per person,...,Current Ratio,Tax rate (A),Fixed Assets to Assets,Fixed Assets Turnover Frequency,Operating Expense Rate,Cash Turnover Rate,Cash/Current Liability,Net Income Flag,Cluster,Bankrupt?
6,0.213721,-0.041945,0.031354,0.023531,0.015105,0.034129,0.003701,0.0,-5.84731,0.220615,...,0.007002,0.182326,0.106966,0.000153,9.151684,0.000111,0.001338,1,3,0
13,0.215896,-0.033102,0.010282,0.006923,0.038966,0.128615,0.007271,0.0,-12.802953,0.22337,...,0.018798,0.068764,0.265905,0.000137,9.23378,0.000344,0.06232,1,3,0
16,0.216383,-0.040957,0.021283,0.017957,0.031904,0.099862,0.007569,0.0,-9.900615,0.219489,...,0.009608,0.119853,0.030487,0.001282,8.966717,0.000233,0.015564,1,3,0
18,0.219243,-0.036941,0.026478,0.026988,0.020897,0.098151,0.003087,0.0,-7.447987,0.237691,...,0.00574,0.022166,0.330082,4.124261,9.098308,0.000435,0.012389,1,3,0
23,0.219845,-0.041095,0.006166,0.015981,0.025978,0.12116,0.016103,0.0,-8.569932,0.222043,...,0.010774,0.0,0.1885,0.000417,9.312644,0.000208,0.015968,1,3,0


In [67]:
# Number of top-ranked features kept for the cluster-3 models. Defined once so
# the slice below and the printed header cannot drift apart.
N_FEATURES = 7

# Rank the features with a class-weighted random forest fit on the current
# X / y. In this notebook's cell order those are the cluster-3 rows and the
# 'Bankrupt?' target set by the preceding cell.
rf_imp = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
rf_imp.fit(X, y)

# Feature names ordered from most to least important.
importances = pd.Series(rf_imp.feature_importances_, index=X.columns)
sorted_features = importances.sort_values(ascending=False).index.tolist()

# Keep the N_FEATURES highest-ranked names; later cells index X with this list.
selected_features = sorted_features[:N_FEATURES]
print(f"Selected features (Nfeatures = {N_FEATURES}) for Cluster 3:")
for feat in selected_features:
    print(feat)

Selected features (Nfeatures = 7) for Cluster 3:
 Operating Expense Rate
 Cash/Current Liability
 Cash/Total Assets
 Total debt/Total net worth
 Equity to Liability
 Current Liability to Current Assets
 Current Ratio


In [68]:
# Base learners for the cluster-3 stack. The same list is handed to the
# StackingClassifier in a later cell.
base_estimators = [
    ('rf', RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)),
    ('et', ExtraTreesClassifier(class_weight='balanced', n_estimators=100, random_state=42)),
    ('knn', KNeighborsClassifier())
]

base_results = []
# Running totals of true positives (TT) and false negatives (TF) over all
# base models.
TT_sum = TF_sum = 0

# The feature slice is the same for every model, so take it once.
X_selected = X[selected_features]

for name, model in base_estimators:
    # Fit and score on the same rows: these are training-set metrics.
    model.fit(X_selected, y)
    y_pred = model.predict(X_selected)
    acc = accuracy_score(y, y_pred)
    # labels=[0,1] forces a 2x2 matrix even if a model predicts one class only.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0,1]).ravel()
    # Store the unrounded accuracy so the average below is not a mean of
    # already-rounded values. Rounding happens only at display time.
    base_results.append({'Model': name, 'Accuracy': acc, 'TT': tp, 'TF': fn})
    TT_sum += tp
    TF_sum += fn

# Accuracy alone is a weak signal when positives are rare. A model that finds
# no bankrupt company can still score close to 1.00, so read it with TT / TF.
print("\nCluster 3 Base Model Results (using accuracy_score):")
base_df = pd.DataFrame(base_results)
print(base_df.to_string(index=False, float_format="{:.2f}".format))
print(f"\nAverage base-model accuracy: {base_df['Accuracy'].mean():.2f}")


Cluster 3 Base Model Results (using accuracy_score):
Model  Accuracy  TT  TF
   rf      1.00   5   0
   et      1.00   5   0
  knn      0.99   0   5

Average base-model accuracy: 1.00


In [72]:
# Meta-learner: class-weighted logistic regression placed on top of the base
# estimators.
meta_learner = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    random_state=42,
    max_iter=1000,
)

# cv=5: the meta-learner is trained on 5-fold cross-validated predictions of
# the base estimators. passthrough=True additionally feeds it the original
# selected features.
stacking = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    cv=5,
    passthrough=True,
)
stacking.fit(X[selected_features], y)

# Score on the training rows themselves.
y_pred_meta = stacking.predict(X[selected_features])
acc_meta = accuracy_score(y, y_pred_meta)

# labels=[0,1] guarantees a 2x2 matrix laid out as [[tn, fp], [fn, tp]].
cm = confusion_matrix(y, y_pred_meta, labels=[0,1])
(tn, fp), (fn, tp) = cm

# Recall on the bankrupt class; NaN when the subset holds no positives.
if (tp + fn) > 0:
    project_acc = tp / (tp + fn)
else:
    project_acc = float('nan')

print("Stacking Model Cluster 3")
print("Confusion Matrix:")
print(cm)
print(f"True Positives (TT): {tp}")
print(f"False Negatives (TF): {fn}")
print(f"Recall (TT/(TT+TF)), used for grading: {project_acc:.4f}\n")

Stacking Model Cluster 3
Confusion Matrix:
[[367  90]
 [  0   5]]
True Positives (TT): 5
False Negatives (TF): 0
Recall (TT/(TT+TF)), used for grading: 1.0000

