In [37]:
import pandas as pd
import numpy as np


### Calculating total averages for table_3

In [38]:
def calc_accuracy_with_str(df,col_name):
    """Summarise a column of ``"<accuracy> [<tt>(<tf>)]"`` strings.

    Every cell of ``df[col_name]`` is converted with ``str`` and taken apart
    with plain string splits:

    * the text before the first ``'['`` is read as a float (the accuracy),
    * the text between ``'['`` and ``'('`` is read as an int (first count),
    * the text between ``'('`` and ``')'`` is read as an int (second count).

    Parameters
    ----------
    df : mapping-like (typically a pandas DataFrame)
        Object indexable by ``col_name`` that yields the cells to parse.
    col_name : hashable
        Key of the column to summarise.

    Returns
    -------
    str
        ``"<mean accuracy rounded to 2 decimals> [<sum of first counts>(<sum of second counts>)]"``.

    Raises
    ------
    ValueError
        If a numeric fragment cannot be converted by ``float`` / ``int``.
    IndexError
        If a cell lacks the ``'['`` or ``'('`` delimiter.
    """
    accuracies = []
    tt_running = 0
    tf_running = 0

    for cell in df[col_name]:
        pieces = str(cell).split('[')
        # Leading fragment is the accuracy; float() tolerates the trailing space.
        accuracy = float(pieces[0])
        # The remainder looks like "<tt>(<tf>)]".
        counts = pieces[1].split('(')
        tt_running += int(counts[0])
        tf_running += int(counts[1].split(')')[0])
        accuracies.append(accuracy)

    mean_accuracy = round(np.mean(accuracies), 2)
    return f"{mean_accuracy} [{tt_running}({tf_running})]"


In [39]:
# Reload table 3, discard any 'Team' summary row left behind by an earlier run
# (this keeps the cell safe to re-execute), and order the subgroups by ID.
df = (
    pd.read_csv('../data/table_3.csv')
    .loc[lambda table: table['Subgroup ID'] != 'Team']
    .sort_values(by=['Subgroup ID'])
)

# Columns are addressed by position: columns 2 and 3 hold
# "accuracy [count(count)]" strings parsed by calc_accuracy_with_str
# (base / meta going by the variable names — confirm against the CSV header),
# column 4 is averaged numerically.
average_accuracy_base = calc_accuracy_with_str(df, df.columns[2])
average_accuracy_meta = calc_accuracy_with_str(df, df.columns[3])
average_number_of_features = df[df.columns[4]].mean()

# One-row frame carrying the team-level summary; any column not listed here
# ends up as NaN after the concat below.
new_row = pd.DataFrame(
    {
        'Subgroup ID': ['Team'],
        df.columns[2]: [average_accuracy_base],
        df.columns[3]: [average_accuracy_meta],
        df.columns[4]: [average_number_of_features],
    }
)

# Append the summary row and overwrite the source file in place.
df = pd.concat([df, new_row], ignore_index=True)
df.to_csv('../data/table_3.csv', index=False)

### Generalization of Model

In [40]:
import pandas as pd
import joblib

# 1) Load serialized preprocessing & models
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# artifacts that were produced by this project.
scaler = joblib.load('../models/scaler.pkl')
cluster_classifier = joblib.load('../models/cluster_clf.pkl')
model0 = joblib.load('../models/model_cluster0.pkl')
model1 = joblib.load('../models/model_cluster1.pkl')
model2 = joblib.load('../models/model_cluster2.pkl')
model3 = joblib.load('../models/model_cluster3.pkl')
features0 = joblib.load('../models/features_cluster0.pkl')
features1 = joblib.load('../models/features_cluster1.pkl')
features2 = joblib.load('../models/features_cluster2.pkl')
features3 = joblib.load('../models/features_cluster3.pkl')

# Dispatch table: cluster ID -> (model for that cluster, feature columns it uses).
cluster_models = {
    0: (model0, features0),
    1: (model1, features1),
    2: (model2, features2),
    3: (model3, features3),
}

# 2) Read test set
test_df = pd.read_csv('../data/test_data.csv', index_col=0)

# The clustering features are every column of the transformed training set
# except the cluster label and the target.
train_transformed = pd.read_csv('../data/train_data_transformed.csv')
cluster_feature_cols = train_transformed.drop(columns=['Cluster','Bankrupt?']).columns.tolist()

# 3) Preprocess & predict cluster-ID
X_test_full = test_df[cluster_feature_cols]
X_test_full.info()  # info() prints its own report and returns None, so no print()
X_scaled    = scaler.transform(X_test_full)
cluster_ids = cluster_classifier.predict(X_scaled)

# Fail loudly instead of silently routing an unexpected cluster to model3.
unexpected_ids = set(pd.unique(cluster_ids).tolist()) - set(cluster_models)
if unexpected_ids:
    raise ValueError(
        f"cluster classifier returned IDs with no matching model: {unexpected_ids}"
    )

# 4) For each cluster, select the right features & model, predict Bankruptcy?
# Rows are predicted one cluster at a time (one predict call per model) and
# written back by position, so the output keeps the test-set row order.
# The per-cluster models receive the unscaled feature values as a plain array.
cluster_id_series = pd.Series(cluster_ids)
pred_by_row = pd.Series(0, index=X_test_full.index, dtype='int64')
for cid, (model, feature_names) in cluster_models.items():
    in_cluster = cluster_id_series.eq(cid).to_numpy()
    if not in_cluster.any():
        continue  # nothing assigned to this cluster; skip the empty predict call
    X_feat = X_test_full.loc[in_cluster, feature_names].values
    pred_by_row.iloc[in_cluster] = [int(p) for p in model.predict(X_feat)]

preds = list(zip(pred_by_row.index, pred_by_row.tolist()))

# 5) Write submission file
submission = pd.DataFrame(preds, columns=['Index', 'Bankrupt?'])
submission.to_csv('../data/Group7_Generalization.csv', index=False)

print(submission.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1012 entries, 0 to 1011
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0    Cash Flow to Equity                  1012 non-null   float64
 1    Retained Earnings to Total Assets    1012 non-null   float64
 2    Revenue per person                   1012 non-null   float64
 3    Current Liability to Current Assets  1012 non-null   float64
 4    Equity to Liability                  1012 non-null   float64
 5    Cash/Total Assets                    1012 non-null   float64
 6    Total expense/Assets                 1012 non-null   float64
 7    Liability-Assets Flag                1012 non-null   int64  
 8    Total debt/Total net worth           1012 non-null   float64
 9    Operating profit per person          1012 non-null   float64
 10   Quick Assets/Current Liability       1012 non-null   float64
 11   Current Ratio        



   Index  Bankrupt?
0      0          0
1      1          1
2      2          1
3      3          0
4      4          0


