3.3 Cluster2 Analysis

3.3.0 Gaussian Naive Bayes Cluster Classifier

In [133]:
import numpy as np
np.random.seed(1)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Load the transformed training set and pull out both label columns before scaling.
train_data = pd.read_csv('../data/train_data_transformed.csv')
target_clusters = train_data['Cluster']
target_bankruptcy = train_data['Bankrupt?']
train_data = train_data.drop(columns=['Cluster'])

# Standardise (subtract mean, divide by std) every column except the last two.
# NOTE(review): the slice is positional, so it depends on the CSV column order --
# confirm the two trailing columns really are the non-feature ones.
# NOTE(review): mean/std are computed on the full frame, i.e. before the
# train/test split below -- confirm that is acceptable for this analysis.
raw_features = train_data[train_data.columns[:-2]]
train_data_scaled = (raw_features - raw_features.mean()) / raw_features.std()

# Re-attach the labels so the last two columns are 'Cluster' and 'Bankrupt?'.
train_data = pd.concat([train_data_scaled, target_clusters, target_bankruptcy], axis=1)
feature_columns = train_data.columns[:-2]

# Hold out 20% of the rows; the target here is the cluster label, not bankruptcy.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_columns],
    target_clusters,
    test_size=0.2,
    random_state=42,
)

# Gaussian Naive Bayes model that predicts the cluster label from the scaled features.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_train_pred, y_test_pred = nb_model.predict(X_train), nb_model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.9995694294940797
Test Accuracy: 0.9982788296041308


In [134]:
# Perform clustering analysis
from sklearn.cluster import KMeans

# n_init is set explicitly: leaving it unset triggers scikit-learn's FutureWarning
# about the default changing, and would make the labels depend on the installed
# version. 10 is the value the warning reports as the current default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# The K-Means labels temporarily overwrite the 'Cluster' column; the stored labels
# are restored at the end of this cell.
# NOTE(review): the numeric selection still contains the existing 'Cluster' and
# 'Bankrupt?' columns if they are numeric -- confirm clustering on them is intended.
train_data['Cluster'] = kmeans.fit_predict(train_data.select_dtypes(include='number'))

# Extract Cluster 2 (rows labelled 2 by the K-Means refit above)
cluster2 = train_data[train_data['Cluster'] == 2]

# Columns used later as the reduced input of the stacking model.
# The leading space in each name is part of the column label.
features = [
    ' Current Liability to Current Assets',
    ' Fixed Assets Turnover Frequency',
    ' Total debt/Total net worth',
    ' Total expense/Assets',
    ' Fixed Assets to Assets'
]

# Put back the labels that were read from the CSV.
train_data['Cluster'] = target_clusters
train_data['Bankrupt?'] = target_bankruptcy

  super()._check_params_vs_input(X, default_n_init=10)


In [135]:
# Keep only the rows labelled cluster 2: X is every column except the last two,
# y is the 'Bankrupt?' column.
is_cluster2 = train_data['Cluster'] == 2
X = train_data.loc[is_cluster2, train_data.columns[:-2]]
y = train_data.loc[is_cluster2, 'Bankrupt?']

3.3.1 Define Base Models

In [136]:
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    StackingClassifier
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Random forest; class 1 is weighted 15x relative to class 0.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    class_weight={0: 1, 1: 15},
    random_state=42,
)

# Gradient boosting (no class weighting configured).
gb_clf = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=5,
    random_state=42,
)

# RBF-kernel SVM with balanced class weights and probability estimates enabled.
svc_clf = SVC(
    kernel='rbf',
    C=1.0,
    probability=True,
    class_weight='balanced',
    random_state=42,
)

# (name, estimator) pairs in the form StackingClassifier expects.
base_models = [('rf', rf_clf), ('gb', gb_clf), ('svc', svc_clf)]

3.3.2 Define Stacking Model

In [137]:
# Meta-learner that combines the base models' outputs.
meta_learner = LogisticRegression(class_weight='balanced', max_iter=1000)

# Stacked ensemble: base models feed their predict_proba outputs, produced with
# 5-fold cross-validation, into the logistic-regression meta-learner.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
)

4 Generalization

4.1 Models Accuracy Results

In [138]:
# Fresh, unfitted base learners for the evaluation below.
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators=200,
        max_depth=7,
        class_weight={0:1, 1:15},
        random_state=42)),
    ('gb', GradientBoostingClassifier(
        n_estimators=150,
        max_depth=5,
        random_state=42)),
    ('svc', SVC(
        kernel='rbf',
        C=1.0,
        probability=True,
        class_weight='balanced',
        random_state=42))
]

# Per-model summary rows plus running totals over all models.
# TT = class-1 rows predicted as class 1 (true positives),
# TF = class-1 rows predicted as class 0 (false negatives).
base_results = []
TT_sum = TF_sum = 0

for name, model in base_models:
    # Each model is fitted and scored on the same X, y, so these are
    # training-set figures, not held-out performance.
    model.fit(X, y)
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    # Compute the matrix once, with fixed labels so it is always 2x2 and the
    # printed matrix matches the unpacked counts.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    base_results.append({'Model': name, 'Accuracy': round(acc, 2), 'TT': tp, 'TF': fn})
    TT_sum += tp
    TF_sum += fn

# Total number of class-1 outcomes summed over all models.
outcomes = TT_sum + TF_sum
# Mean TT / TF per model (the former "/ outcomes * outcomes" factor cancelled out).
average_TT = round(TT_sum / len(base_models))
average_TF = round(TF_sum / len(base_models))
# Share of class-1 rows that were predicted as class 1, pooled over the models.
# Guarded the same way as the stacking cell: NaN when there are no class-1 rows.
average_accuracy = TT_sum / outcomes if outcomes > 0 else float('nan')

print("Base Models Cluster 2")
base_df = pd.DataFrame(base_results)
print(base_df.to_string(index=False))
print(f"\nAverage base model accuracy: {base_df['Accuracy'].mean():.2f}")
accuracy_model_score = f"{average_accuracy:.2f}[{average_TT}({average_TF})]"
print(f"Average accuracy model format: {accuracy_model_score}")

[[1413   16]
 [   0   56]]
[[1429    0]
 [   0   56]]
[[1257  172]
 [   3   53]]
Base Models Cluster 2
Model  Accuracy  TT  TF
   rf      0.99  56   0
   gb      1.00  56   0
  svc      0.88  53   3

Average base model accuracy: 0.96
Average accuracy model format: 0.98[55(1)]


In [139]:
# Meta-learner for the final ensemble (liblinear solver, balanced class weights).
meta_estimator = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    random_state=42,
    max_iter=1000,
)

# passthrough=True: the meta-learner receives the original input columns in
# addition to the base models' predictions.
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_estimator,
    cv=5,
    passthrough=True,
)

# NOTE(review): this rebinds X to the selected feature subset. Any cell re-run
# after this one (e.g. the base-model evaluation) will see the narrowed frame.
X = X[features]
stacking.fit(X, y)

# Scored on the same rows the ensemble was fitted on.
y_pred_meta = stacking.predict(X)
acc_meta = accuracy_score(y, y_pred_meta)
cm = confusion_matrix(y, y_pred_meta, labels=[0,1])
tn, fp, fn, tp = cm.ravel()

# Share of class-1 rows predicted as class 1; NaN when there are no class-1 rows.
positives = tp + fn
if positives > 0:
    project_acc = tp / positives
else:
    project_acc = float('nan')

print("Stacking Model Cluster 2")
print(f"TT: {tp}")
print(f"TF: {fn}")
stacking_model_score = f"{project_acc:.4f}[{tp}({fn})]"
print(f"Accuracy score Meta model: {stacking_model_score}\n")

Stacking Model Cluster 2
TT: 46
TF: 10
Accuracy score Meta model: 0.8214[46(10)]



In [140]:
import joblib

# Persist the fitted stacking ensemble. The dump call stays as the cell's last
# expression so the written path is shown as the cell output.
model_path = "../models/model_cluster2.pkl"
joblib.dump(stacking, model_path)

['../models/model_cluster2.pkl']

In [141]:
# Column names the stacking model was fitted on, saved next to the model file.
# The leading space in each name is part of the column label.
features = [
    ' Current Liability to Current Assets',
    ' Fixed Assets Turnover Frequency',
    ' Total debt/Total net worth',
    ' Total expense/Assets',
    ' Fixed Assets to Assets',
]
features_path = '../models/features_cluster2.pkl'
joblib.dump(features, features_path)

['../models/features_cluster2.pkl']

4.2 Bankrupt Results

In [142]:
import pandas as pd
import numpy as np

# Placeholder pipeline: both the inputs and the predictions below are random
# draws from NumPy's global RNG; the saved model is not used here.
N_TEST_ROWS = 1012
SIM_COLUMNS = [f'F{i}' for i in range(1, 6)]

# Generate simulated test data (1012 rows)
test_data = pd.DataFrame(
    np.random.rand(N_TEST_ROWS, len(SIM_COLUMNS)),
    columns=SIM_COLUMNS,
)


def predict_cluster2(data):
    """Return one randomly drawn 0/1 label per row of ``data`` (simulation stand-in)."""
    n_rows = len(data)
    return np.random.choice([0, 1], size=n_rows)


test_pred = predict_cluster2(test_data)

# Assemble the submission table; 'Index' is the frame's index shifted to start at 1.
submission_columns = {
    'Index': test_data.index + 1,
    'Bankrupt?': test_pred,
}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

4.3 Updating the Group Results Table (table_3.csv)

In [143]:
# Replace subgroup 2's row in the results table with the scores from this run.
table_path = '../data/table_3.csv'
df = pd.read_csv(table_path)
df = df.loc[df['Subgroup ID'] != 2]

# Rebuild both score strings from the values computed in the evaluation cells.
accuracy_model_score = f"{average_accuracy:.2f}[{average_TT}({average_TF})]"
stacking_model_score = f"{project_acc:.4f}[{tp}({fn})]"

# The first five columns are filled by position.
# NOTE(review): the filter above addresses 'Subgroup ID' by name while the new
# row writes the id into column 0 -- confirm 'Subgroup ID' is the first column.
cols = df.columns
row_values = {
    cols[0]: 2,
    cols[1]: 'Lijing Li',
    cols[2]: accuracy_model_score,
    cols[3]: stacking_model_score,
    cols[4]: len(features),
}
new_row = pd.DataFrame([row_values])

# Append the new row and overwrite the same CSV file.
df = pd.concat([df, new_row], ignore_index=True)
df.to_csv(table_path, index=False)