# Ayan Mahmood
## I will be analyzing Cluster 1 for this project

In [1]:
import pandas as pd
import numpy as np
# Load the transformed training data and keep only the rows whose Cluster label is 1.
df = pd.read_csv('../data/train_data_transformed.csv').loc[lambda frame: frame['Cluster'] == 1]

# Report the row count and the sum of the 'Bankrupt?' column for this cluster.
n_companies = len(df)
n_bankrupt = df['Bankrupt?'].sum()
print(f"Number of Companies in Cluster 1: {n_companies}")
print(f"Number of Bankrupted Companies in Cluster 1: {n_bankrupt}")

Number of Companies in Cluster 1: 1742
Number of Bankrupted Companies in Cluster 1: 26


In [2]:
# Preview the first five rows of the Cluster 1 subset to sanity-check the filter.
df.head(n=5)

Unnamed: 0,Cash Flow to Equity,Retained Earnings to Total Assets,Revenue per person,Current Liability to Current Assets,Equity to Liability,Cash/Total Assets,Total expense/Assets,Liability-Assets Flag,Total debt/Total net worth,Operating profit per person,...,Current Ratio,Tax rate (A),Fixed Assets to Assets,Fixed Assets Turnover Frequency,Operating Expense Rate,Cash Turnover Rate,Cash/Current Liability,Net Income Flag,Cluster,Bankrupt?
2,0.219318,-0.035592,0.014429,0.014912,0.022242,0.087701,0.013941,0.0,-7.756428,0.224013,...,0.011474,0.079348,0.190924,0.000249,0.000106,0.000135,0.011598,1,1,0
9,0.216505,-0.035044,0.019035,0.007777,0.03504,0.128492,0.022969,0.0,-10.774427,0.223363,...,0.017826,0.0,0.017319,0.002938,0.000303,0.000345,0.036463,1,1,0
14,0.228137,-0.049663,0.001788,0.009978,0.031627,0.136495,0.019218,0.0,-9.832869,0.22072,...,0.015517,0.0,0.105442,0.000159,0.000762,0.001759,0.039603,1,1,0
17,0.218358,-0.042101,0.01068,0.011808,0.03597,0.106829,0.019244,0.0,-11.093872,0.220714,...,0.013844,0.058723,0.15686,0.000148,0.000384,0.000389,0.027017,1,1,0
24,0.219369,-0.030449,0.013829,0.009824,0.030578,0.120892,0.018977,0.0,-9.583275,0.223868,...,0.015669,0.049001,0.207863,0.000274,0.000149,0.000198,0.03276,1,1,0


### 3.3 Part 1 - Classification Model

In [3]:
#svm model for the classification of clusters
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# Reload the unfiltered transformed training set; this rebinds `df` to the full frame.
df = pd.read_csv('../data/train_data_transformed.csv')

# Target is the cluster ID; features are every column except the two label columns.
y = df['Cluster']
X = df.drop(columns=['Cluster', 'Bankrupt?'])

# Fit a seeded linear SVM on the full frame.
svm_clf = LinearSVC(random_state=42, max_iter=10000)
svm_clf.fit(X, y)

# Predictions and accuracy are computed on the same rows used for fitting,
# so every number printed below is a training-set figure.
y_pred = svm_clf.predict(X)
train_acc = svm_clf.score(X, y)

print(f"Training Accuracy: {train_acc:.4f}\n")
print("Classification Report:", classification_report(y, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y, y_pred), sep="\n")

# Rank features by absolute coefficient averaged over axis 0 of coef_
# (one row per class for a multiclass LinearSVC) and keep the largest `top_n`.
# `top_feature_names` is reused by the stacking-model cell further down.
top_n = 10
coef_mean = np.mean(np.abs(svm_clf.coef_), axis=0)
feature_importance = pd.Series(data=coef_mean, index=X.columns)
top_feats = feature_importance.sort_values(ascending=False).iloc[:top_n]
top_feature_names = list(top_feats.index)
print(f"\nTop {top_n} Features for Cluster-ID prediction:")
print(top_feats.to_frame(name='mean_abs_coefficient'))

Training Accuracy: 0.9998

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2118
           1       1.00      1.00      1.00      1742
           2       1.00      1.00      1.00      1485
           3       1.00      1.00      1.00       462

    accuracy                           1.00      5807
   macro avg       1.00      1.00      1.00      5807
weighted avg       1.00      1.00      1.00      5807

Confusion Matrix:
[[2118    0    0    0]
 [   0 1742    0    0]
 [   0    0 1485    0]
 [   0    0    1  461]]

Top 10 Features for Cluster-ID prediction:
                                   mean_abs_coefficient
Net Income Flag                                0.676183
Operating Expense Rate                         0.535133
Quick Assets/Current Liability                 0.152180
Cash Flow to Equity                            0.149433
Cash Turnover Rate                             0.139218
Operating profit per per

##### These results are consistent with the data analysis and preprocessing done earlier in the Group7_TrainingData.ipynb file. The Net Income Flag and Operating Expense Rate have the largest mean absolute coefficients, i.e. the highest importance in determining the cluster.

### Part 2 - Stacking Model

In [4]:
#part 2 tuned Stacking Model with Oversampling using Top-N Features
from sklearn.base import clone
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report


def _oversample_minority(frame, label_col='Bankrupt?', random_state=42):
    """Balance `frame` by resampling its positive rows up to the negative count.

    Rows with `label_col == 1` are drawn with replacement until there are as
    many of them as rows with `label_col == 0`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing `label_col` with 0/1 values.
    label_col : str
        Name of the binary label column.
    random_state : int
        Seed passed to `sklearn.utils.resample`.

    Returns
    -------
    pandas.DataFrame
        All negative rows followed by the resampled positive rows.
    """
    positives = frame[frame[label_col] == 1]
    negatives = frame[frame[label_col] == 0]
    positives_up = resample(
        positives, replace=True, n_samples=len(negatives), random_state=random_state
    )
    return pd.concat([negatives, positives_up])


#prep cluster 1 data and get top 10
df1 = df[df['Cluster'] == 1]
X1 = df1[top_feature_names]
y1 = df1['Bankrupt?']

#oversample minority class to balance
df_bal = _oversample_minority(df1)
X_bal = df_bal[top_feature_names]
y_bal = df_bal['Bankrupt?']
print(f"Balanced training set size: {len(y_bal)} (pos={y_bal.sum()}, neg={len(y_bal)-y_bal.sum()})")

#define three base estimators with hyperparameter grids
base_estimators = {
    'rf': (RandomForestClassifier(random_state=42), {'n_estimators': [50, 100], 'max_depth': [None, 10]}),
    'dt': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 5, 10]}),
    'svc': (SVC(probability=True, random_state=42), {'C': [0.1, 1, 10]})
}

#hyperparameter tuning for base models
#NOTE: X_bal was oversampled BEFORE cross-validation, so copies of the same
#minority row can sit in both the training and validation folds. The CV recall
#used to pick hyperparameters is therefore optimistic; treat it as a ranking
#signal only, not as a performance estimate.
best_estimators = []
for name, (est, params) in base_estimators.items():
    grid = GridSearchCV(est, params, cv=5, scoring='recall', n_jobs=-1)
    grid.fit(X_bal, y_bal)
    print(f"Best {name} params: {grid.best_params_}")
    best_estimators.append((name, grid.best_estimator_))

#build stacking classifier with balanced-trained base models and RF meta-model
stack = StackingClassifier(
    estimators=best_estimators,
    final_estimator=RandomForestClassifier(random_state=42),
    cv=5,
    passthrough=True
)

#tune meta-model parameters on balanced data (same optimism caveat as above)
stack_param_grid = {'final_estimator__n_estimators': [50, 100], 'final_estimator__max_depth': [None, 10]}
stack_cv = GridSearchCV(stack, stack_param_grid, cv=5, scoring='recall', n_jobs=-1)
stack_cv.fit(X_bal, y_bal)
print(f"Best meta-model params: {stack_cv.best_params_}")

#evaluate on original Cluster 1 data
#These are the same rows the model was fitted on (every row of X1 is in X_bal),
#so this is a training-set score. labels=[0, 1] keeps the matrix 2x2 even if
#only one class shows up, so the four-way unpack below cannot fail.
y_pred = stack_cv.predict(X1)
cm = confusion_matrix(y1, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
recall_acc = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
print("\n=== Final Stacking Model Performance (Cluster 1) ===")
print("Confusion Matrix (rows=actual):")
print(cm)
print(f"True Positives (TT): {tp}")
print(f"False Negatives (TF): {fn}")
print(f"Recall (TT/(TT+TF)), used for grading: {recall_acc:.4f}\n")
print("Classification Report:")
print(classification_report(y1, y_pred))

#out-of-fold estimate on Cluster 1
#Each fold refits a fresh copy of the selected stacking configuration on an
#oversampled TRAINING split only, then predicts the untouched held-out split,
#so no held-out row (or a duplicate of it) is seen during fitting.
#Hyperparameters were still chosen on the full balanced set, so a small
#optimistic bias remains.
oof_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_oof = np.empty(len(y1), dtype=int)
for train_idx, test_idx in oof_splitter.split(X1, y1):
    fold_bal = _oversample_minority(df1.iloc[train_idx])
    fold_model = clone(stack_cv.best_estimator_)
    fold_model.fit(fold_bal[top_feature_names], fold_bal['Bankrupt?'])
    y_oof[test_idx] = fold_model.predict(X1.iloc[test_idx])

cm_oof = confusion_matrix(y1, y_oof, labels=[0, 1])
tn_oof, fp_oof, fn_oof, tp_oof = cm_oof.ravel()
recall_oof = tp_oof / (tp_oof + fn_oof) if (tp_oof + fn_oof) > 0 else float('nan')
print("\n=== Out-of-fold Stacking Model Performance (Cluster 1, 5-fold) ===")
print("Confusion Matrix (rows=actual):")
print(cm_oof)
print(f"True Positives (TT): {tp_oof}")
print(f"False Negatives (TF): {fn_oof}")
print(f"Out-of-fold Recall (TT/(TT+TF)): {recall_oof:.4f}\n")
print("Classification Report:")
print(classification_report(y1, y_oof))

Balanced training set size: 3432 (pos=1716, neg=1716)
Best rf params: {'max_depth': None, 'n_estimators': 50}
Best dt params: {'max_depth': None}
Best svc params: {'C': 1}
Best meta-model params: {'final_estimator__max_depth': None, 'final_estimator__n_estimators': 50}

=== Final Stacking Model Performance (Cluster 1) ===
Confusion Matrix (rows=actual):
[[1716    0]
 [   0   26]]
True Positives (TT): 26
False Negatives (TF): 0
Recall (TT/(TT+TF)), used for grading: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1716
           1       1.00      1.00      1.00        26

    accuracy                           1.00      1742
   macro avg       1.00      1.00      1.00      1742
weighted avg       1.00      1.00      1.00      1742

