In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

In [5]:
file = "/data/classes/2024/spring/cs425/kokalsg4814/GProject/data.csv"
df = pd.read_csv(file)
df

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.405750,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.827890,0.290202,0.026601,0.564050,1,0.016469
1,1,0.464291,0.538214,0.516730,0.610235,0.610235,0.998946,0.797380,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.601450,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.774670,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.998700,0.796967,0.808966,0.303350,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0,0.493687,0.539468,0.543230,0.604455,0.604462,0.998992,0.797409,0.809331,0.303510,...,0.799927,0.000466,0.623620,0.604455,0.840359,0.279606,0.027064,0.566193,1,0.029890
6815,0,0.475162,0.538269,0.524172,0.598308,0.598308,0.998992,0.797414,0.809327,0.303520,...,0.799748,0.001959,0.623931,0.598306,0.840306,0.278132,0.027009,0.566018,1,0.038284
6816,0,0.472725,0.533744,0.520638,0.610444,0.610213,0.998984,0.797401,0.809317,0.303512,...,0.797778,0.002840,0.624156,0.610441,0.840138,0.275789,0.026791,0.565158,1,0.097649
6817,0,0.506264,0.559911,0.554045,0.607850,0.607850,0.999074,0.797500,0.809399,0.303498,...,0.811808,0.002837,0.623957,0.607846,0.841084,0.277547,0.026822,0.565302,1,0.044009


In [6]:
# seperate features and labels

X = df.drop('Bankrupt?', axis=1)
y = df['Bankrupt?']

In [8]:


# Split the data into training and test sets, in a 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)



In [9]:
# Standardize the features (important for SVD)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Initialize SVD and fit on the training data
svd = TruncatedSVD(n_components=X_train.shape[1] - 1)  # Use one less component than the feature count
X_train_svd = svd.fit_transform(X_train)

In [11]:
# Calculate explained variance ratio for each component
explained_variance_ratio = svd.explained_variance_ratio_

# Calculate the cumulative explained variance
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Find the number of components that explain at least 75% of the variance
n_components = np.argmax(cumulative_explained_variance >= 0.75) + 1

In [12]:
# Transform both the training and test data to the selected number of components
X_train_svd_selected = svd.transform(X_train)[:, :n_components]
X_test_svd_selected = svd.transform(X_test)[:, :n_components]

# Print the number of components selected
print(f"Number of components selected: {n_components}")

Number of components selected: 27


In [13]:
b_y_data = df[df['Bankrupt?'] == 1].reset_index(drop=True)
b_n_data = df[df['Bankrupt?'] != 0].reset_index(drop=True)

yes_train_data, yes_test_data = train_test_split(b_y_data, test_size=0.3)

no_train_data, no_test_data = train_test_split(b_n_data, test_size=0.3)

num_positive_samples = len(yes_train_data)
num_negative_samples = len(no_train_data)

# Calculate the class weights based on the training data
class_weights = {0: num_negative_samples / (num_positive_samples + num_negative_samples),
                1: num_positive_samples / (num_positive_samples + num_negative_samples)}  


In [14]:
model = DecisionTreeClassifier(class_weight=class_weights, criterion='gini', max_depth = 5)


In [16]:
model.fit(X_train_svd_selected, y_train)
test_predictions = model.predict(X_test_svd_selected)

In [18]:
accuracy = accuracy_score(y_test, test_predictions)

f1 = f1_score(y_test, test_predictions)
tn, fp, fn, tp = confusion_matrix(y_test, test_predictions).ravel()
fpr = fp / (fp + tn)
fnr = fn / (fn + tp)

# find EER
Fpr, tpr, threshold = roc_curve(y_test, test_predictions, pos_label=1)
Fnr = 1 - tpr
EER = Fpr[np.nanargmin(np.absolute((Fnr - Fpr)))]

precision = precision_score(y_test, test_predictions)
recall = recall_score(y_test, test_predictions)

print("Accuracy: ", accuracy)
print("F1 Score:", f1)
print("Precision: ", precision)
print("Recall: ", recall)
print("True Positives: ", tp)
print("False Positives: ", fp)
print("True Negatives: ", tn)
print("False Negatives: ", fn)
print("False Positive Rate: ", fpr)
print("False Negative Rate: ", fnr)
print("Equal Error Rate: ", EER)
print("")

Accuracy:  0.9604105571847508
F1 Score: 0.2568807339449541
Precision:  0.3684210526315789
Recall:  0.19718309859154928
True Positives:  14
False Positives:  24
True Negatives:  1951
False Negatives:  57
False Positive Rate:  0.012151898734177215
False Negative Rate:  0.8028169014084507
Equal Error Rate:  0.012151898734177215



In [None]:
#whoop