# Ayan Mahmood
## I will be analyzing Cluster 1 for this project

In [90]:
import pandas as pd
import numpy as np
# Load the transformed training set and keep only the rows assigned to Cluster 1.
df = pd.read_csv('../data/train_data_transformed.csv').loc[
    lambda frame: frame['Cluster'] == 1
]

# Report the cluster size and how many of its companies are flagged bankrupt.
print(f"Number of Companies in Cluster 1: {len(df)}")
print(f"Number of Bankrupted Companies in Cluster 1: {df['Bankrupt?'].sum()}")

Number of Companies in Cluster 1: 1742
Number of Bankrupted Companies in Cluster 1: 26


In [91]:
# Preview the first five Cluster 1 rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Cash Flow to Equity,Retained Earnings to Total Assets,Revenue per person,Current Liability to Current Assets,Equity to Liability,Cash/Total Assets,Total expense/Assets,Liability-Assets Flag,Total debt/Total net worth,Operating profit per person,...,Current Ratio,Tax rate (A),Fixed Assets to Assets,Fixed Assets Turnover Frequency,Operating Expense Rate,Cash Turnover Rate,Cash/Current Liability,Net Income Flag,Cluster,Bankrupt?
2,0.219318,-0.035592,0.014429,0.014912,0.022242,0.087701,0.013941,0.0,-7.756428,0.224013,...,0.011474,0.079348,0.190924,0.000249,0.000106,0.000135,0.011598,1,1,0
9,0.216505,-0.035044,0.019035,0.007777,0.03504,0.128492,0.022969,0.0,-10.774427,0.223363,...,0.017826,0.0,0.017319,0.002938,0.000303,0.000345,0.036463,1,1,0
14,0.228137,-0.049663,0.001788,0.009978,0.031627,0.136495,0.019218,0.0,-9.832869,0.22072,...,0.015517,0.0,0.105442,0.000159,0.000762,0.001759,0.039603,1,1,0
17,0.218358,-0.042101,0.01068,0.011808,0.03597,0.106829,0.019244,0.0,-11.093872,0.220714,...,0.013844,0.058723,0.15686,0.000148,0.000384,0.000389,0.027017,1,1,0
24,0.219369,-0.030449,0.013829,0.009824,0.030578,0.120892,0.018977,0.0,-9.583275,0.223868,...,0.015669,0.049001,0.207863,0.000274,0.000149,0.000198,0.03276,1,1,0


### 3.3 Part 1 - Classification Model

In [92]:
#svm model for the classification of clusters
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# Reload the full transformed training set (all clusters) for the cluster-ID classifier.
df = pd.read_csv('../data/train_data_transformed.csv')

# Target is the cluster label; features are every column except the cluster
# label and the bankruptcy target.
y = df['Cluster']
X = df.drop(['Cluster', 'Bankrupt?'], axis=1)

# Build and train the linear SVM in one step (fit() returns the estimator itself).
svm_clf = LinearSVC(random_state=42, max_iter=10000).fit(X, y)

# Predictions and accuracy are computed on the same rows used for fitting,
# so everything reported below is a training-set metric.
y_pred = svm_clf.predict(X)
train_acc = svm_clf.score(X, y)

print(f"Training Accuracy: {train_acc:.4f}\n")
print("Classification Report:", classification_report(y, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y, y_pred), sep="\n")

Training Accuracy: 0.9998

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2118
           1       1.00      1.00      1.00      1742
           2       1.00      1.00      1.00      1485
           3       1.00      1.00      1.00       462

    accuracy                           1.00      5807
   macro avg       1.00      1.00      1.00      5807
weighted avg       1.00      1.00      1.00      5807

Confusion Matrix:
[[2118    0    0    0]
 [   0 1742    0    0]
 [   0    0 1485    0]
 [   0    0    1  461]]




In [93]:
#rank features by mean absolute LinearSVC coefficient and keep the `top_n` largest
#(the previous comment said "top 5" while the code has always selected 10)
top_n = 10
#coef_ has one row of weights per cluster class; averaging |coef_| over axis 0
#collapses those rows into a single score per feature column of X
coef_mean = np.abs(svm_clf.coef_).mean(axis=0)
feature_importance = pd.Series(coef_mean, index=X.columns)
#NOTE(review): coefficient magnitudes are only comparable across features if the
#features share a common scale - confirm against the preprocessing notebook
top_feats = feature_importance.nlargest(top_n)
#this list is reused later to select the model inputs for the cluster 1 stack
top_feature_names = top_feats.index.tolist()
print(f"\nTop {top_n} Features for Cluster-ID prediction:")
print(top_feats.to_frame(name='mean_abs_coefficient'))


Top 10 Features for Cluster-ID prediction:
                                    mean_abs_coefficient
 Net Income Flag                                0.722359
 Operating Expense Rate                         0.536848
 Cash Flow to Equity                            0.158755
 Cash Turnover Rate                             0.145997
 Operating profit per person                    0.122190
 Quick Assets/Current Liability                 0.116919
 Cash/Total Assets                              0.102089
 Cash/Current Liability                         0.053969
 Fixed Assets to Assets                         0.035386
 Retained Earnings to Total Assets              0.033303


##### These results are consistent with the data analysis and preprocessing done earlier in the Group7_TrainingData.ipynb file: the Net Income Flag and the Operating Expense Rate have the largest mean absolute coefficients, so they are the most important features for determining the cluster.

### Part 2 - Stacking Model

In [94]:
#part 2 tuned Stacking Model to predict Bankruptcy
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#prep cluster 1 data, restricted to the top features selected by the cluster-ID SVM
df1 = df[df['Cluster'] == 1]
X1 = df1[top_feature_names]
y1 = df1['Bankrupt?']

#define three base estimators with hyperparameter grids
base_models = {
    'rf': (RandomForestClassifier(random_state=42), {'n_estimators': [50, 100], 'max_depth': [None, 10]}),
    'svc': (SVC(probability=True, random_state=42), {'C': [0.1, 1, 10]}),
    'dt': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 5, 10]})
}

#hyperparameter tuning for base models (5-fold CV, best params chosen by recall)
tuned_base_models = []
for name, (est, params) in base_models.items():
    grid = GridSearchCV(est, params, cv=5, scoring='recall', n_jobs=-1)
    grid.fit(X1, y1)
    print(f"Best {name} params: {grid.best_params_}")
    #GridSearchCV refits by default, so best_estimator_ is already trained on all of X1, y1
    tuned_base_models.append((name, grid.best_estimator_))
print(tuned_base_models)

#score every tuned base model on the cluster 1 training rows
tt_sum = 0
tf_sum = 0
base_result = []
for name, model in tuned_base_models:
    #no second fit: refitting the already-refit estimator with the same data and
    #random_state only repeated the work without changing the predictions
    y_pred = model.predict(X1)
    acc = accuracy_score(y1, y_pred)
    #labels=[0, 1] keeps the matrix 2x2 even when a class never appears, so the
    #four-way unpack below cannot fail (same convention as the stacking cell)
    cm = confusion_matrix(y1, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    base_result.append({'Model': name, 'Accuracy': round(acc, 2), 'TT': tp, 'TF': fn})
    tt_sum += tp
    tf_sum += fn

#per-model averages of TT and TF; divide by the actual number of base models
#instead of a hard-coded 3 (the old tt_sum/out * out/3 was just tt_sum/3)
n_models = len(tuned_base_models)
out = tt_sum + tf_sum
average_TT = int(round(tt_sum / n_models))
average_TF = int(round(tf_sum / n_models))
#pooled TT/(TT+TF) over all base models; nan rather than ZeroDivisionError when
#the cluster contains no bankrupt companies
average_accuracy = tt_sum / out if out > 0 else float('nan')
print("\nCluster 1 Base Model Results (using accuracy_score):")
base_df = pd.DataFrame(base_result)
print(base_df.to_string(index=False))
print(f"\nAverage base-model accuracy: {base_df['Accuracy'].mean():.2f}")
#table format: ratio[average TT(average TF)]
acc_model_format = f"{average_accuracy:.2f}[{average_TT}({average_TF})]"
print(f"Average accuracy base-model format: {acc_model_format}")

Best rf params: {'max_depth': None, 'n_estimators': 50}
Best svc params: {'C': 0.1}
Best dt params: {'max_depth': None}
[('rf', RandomForestClassifier(n_estimators=50, random_state=42)), ('svc', SVC(C=0.1, probability=True, random_state=42)), ('dt', DecisionTreeClassifier(random_state=42))]

Cluster 1 Base Model Results (using accuracy_score):
Model  Accuracy  TT  TF
   rf      1.00  25   1
  svc      0.99   0  26
   dt      1.00  26   0

Average base-model accuracy: 1.00
Average accuracy base-model format: 0.65[17(9)]


In [95]:
#stack the tuned base models under a class-balanced logistic-regression meta-learner
final_estimator = LogisticRegression(class_weight='balanced', random_state=42)
stack = StackingClassifier(
    estimators=tuned_base_models,
    final_estimator=final_estimator,
    cv=5,
).fit(X1, y1)

#predict back on the same cluster 1 rows the stack was fitted on
y_pred_meta = stack.predict(X1)
acc_meta = accuracy_score(y1, y_pred_meta)

#rows are actual classes, columns are predicted classes, both in the order [0, 1]
cm = confusion_matrix(y1, y_pred_meta, labels=[0,1])
(tn, fp), (fn, tp) = cm

#TT/(TT+TF): share of the actually-bankrupt rows that were predicted bankrupt;
#nan when there are no bankrupt rows at all
actual_positives = tp + fn
if actual_positives > 0:
    final_acc = tp / actual_positives
else:
    final_acc = float('nan')

print("Final Stacking Model Cluster 1", "Confusion Matrix:", cm, sep="\n")
print(f"True Positives (TT): {tp}")
print(f"False Negatives (TF): {fn}")
print(f"Final Accuracy (TT/(TT+TF)): {final_acc:.4f}\n")

Final Stacking Model Cluster 1
Confusion Matrix:
[[1685   31]
 [   0   26]]
True Positives (TT): 26
False Negatives (TF): 0
Final Accuracy (TT/(TT+TF)): 1.0000



In [96]:
import joblib
#persist the fitted stacking model together with the feature list it was trained on
joblib.dump(value=stack, filename="../models/model_cluster1.pkl")
joblib.dump(value=top_feature_names, filename='../models/features_cluster1.pkl')

['../models/features_cluster1.pkl']

In [97]:
#update the cluster 1 row of the shared results table: drop any existing row, then append
df = pd.read_csv('../data/table_3.csv')
id_col = df.columns[0]
#read_csv infers an all-numeric id column as a number dtype, in which case comparing
#against the string '1' never matches and every re-run appends a duplicate row.
#Compare numerically instead; ids that are not numbers coerce to NaN and are kept.
is_cluster_1 = pd.to_numeric(df[id_col], errors='coerce') == 1
df = df[~is_cluster_1]
#table format: ratio[TT(TF)] for the final stacking model on the training data
accuracy_train_format = f"{final_acc:.2f}[{tp}({fn})]"
new_row = pd.DataFrame([{
    df.columns[0]: '1',
    df.columns[1]: 'Ayan Mahmood',
    df.columns[2]: acc_model_format,
    df.columns[3]: accuracy_train_format,
    df.columns[4]: len(top_feature_names),
}])
df = pd.concat([df, new_row], ignore_index=True)
df.to_csv('../data/table_3.csv', index=False)