In [15]:
import kagglehub
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report 
import pickle


In [16]:
# Use kagglehub to download the bankruptcy prediction dataset from Kaggle
path = kagglehub.dataset_download("fedesoriano/company-bankruptcy-prediction")

# Read the CSV file into a Pandas DataFrame
data = pd.read_csv(path + "/data.csv")

# Display the first 5 rows of the dataset
data.head()


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [17]:
# Initialize the VarianceThreshold selector with a threshold of 0.0 (removes constant features)
selector = VarianceThreshold(threshold=0.0)

# Fit the selector to the data and transform it, resulting in a NumPy array with non-constant features
filtered_array = selector.fit_transform(data)

# Get the column names of the selected features using the support indices
selected_columns = data.columns[selector.get_support(indices=True)]

# Create a new DataFrame with the filtered data and selected column names
filtered_data = pd.DataFrame(filtered_array, columns=selected_columns)

In [18]:
# Split the filtered data into features (X) and target (y)
# 'Bankrupt?' is the target column, so we drop it from the features
# 20% of the data will be used for testing, and 80% for training
# random_state=1 ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(
    filtered_data.drop(columns=["Bankrupt?"]),  # Features (X)
    filtered_data["Bankrupt?"],                # Target (y)
    test_size=0.2,                             # 20% test, 80% train
    random_state=0                             # Seed for reproducibility
)


In [19]:
# Initialize SelectKBest to evaluate all features (k="all") using the f_classif score function
k_filter = SelectKBest(score_func=f_classif, k="all")

# Fit the selector to the training data to compute the F-scores for each feature
k_filter.fit(X_train, y_train)

# Create a Pandas Series with the computed F-scores, indexed by the feature names
data_scores = pd.Series(k_filter.scores_, index=X_train.columns)

# Calculate the mean F-score across all features
mean_data_scores = data_scores.mean()


In [20]:
# Select only the features from filtered_data that have a score above the mean F-score
# This filters columns whose importance (ANOVA F-score) is greater than the average
new_data = filtered_data[data_scores[data_scores > mean_data_scores].index.tolist()]

# Join the target column 'Bankrupt?' back to the reduced feature set
# Note: .join() returns a new DataFrame, so we need to assign it
new_data = new_data.join(filtered_data["Bankrupt?"])

# Display the first 5 rows of the final DataFrame
new_data.head()


Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Net Value Per Share (B),Net Value Per Share (A),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Debt ratio %,...,Retained Earnings to Total Assets,Total expense/Assets,Current Liability to Equity,Equity to Long-term Liability,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Net Income to Stockholder's Equity,Liability to Equity,Bankrupt?
0,0.370594,0.424389,0.40575,0.14795,0.14795,0.14795,0.169141,0.095921,0.138736,0.207576,...,0.903225,0.064856,0.339077,0.126549,0.11825,0.0,0.716845,0.82789,0.290202,1.0
1,0.464291,0.538214,0.51673,0.182251,0.182251,0.182251,0.208944,0.093722,0.169918,0.171176,...,0.931065,0.025516,0.32974,0.120916,0.047775,0.0,0.795297,0.839969,0.283846,1.0
2,0.426071,0.499019,0.472295,0.177911,0.177911,0.193713,0.180581,0.092338,0.142803,0.207516,...,0.909903,0.021387,0.334777,0.117922,0.025346,0.0,0.77467,0.836774,0.290189,1.0
3,0.399844,0.451265,0.457733,0.154187,0.154187,0.154187,0.193722,0.077762,0.148603,0.151465,...,0.906902,0.024161,0.331509,0.12076,0.06725,0.0,0.739555,0.834697,0.281721,1.0
4,0.465022,0.538432,0.522298,0.167502,0.167502,0.167502,0.212537,0.096898,0.168412,0.106509,...,0.91385,0.026385,0.330726,0.110933,0.047725,0.0,0.795016,0.839973,0.278514,1.0


In [21]:
print(new_data.columns.tolist())

[' ROA(C) before interest and depreciation before interest', ' ROA(A) before interest and % after tax', ' ROA(B) before interest and depreciation after tax', ' Net Value Per Share (B)', ' Net Value Per Share (A)', ' Net Value Per Share (C)', ' Persistent EPS in the Last Four Seasons', ' Operating Profit Per Share (Yuan ¥)', ' Per Share Net profit before tax (Yuan ¥)', ' Debt ratio %', ' Net worth/Assets', ' Borrowing dependency', ' Operating profit/Paid-in capital', ' Net profit before tax/Paid-in capital', ' Inventory and accounts receivable/Net value', ' Working Capital to Total Assets', ' Current Liability to Assets', ' Working Capital/Equity', ' Current Liabilities/Equity', ' Retained Earnings to Total Assets', ' Total expense/Assets', ' Current Liability to Equity', ' Equity to Long-term Liability', ' Current Liability to Current Assets', ' Liability-Assets Flag', ' Net Income to Total Assets', " Net Income to Stockholder's Equity", ' Liability to Equity', 'Bankrupt?']


In [22]:
# Split the new_data DataFrame into training and testing sets
# 'Bankrupt?' is the target column and is removed from the features (X)

new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    new_data.drop(columns="Bankrupt?"),        # Features (X)
    new_data["Bankrupt?"],                     # Target (y)
    test_size=0.2,                             # 20% of data reserved for testing
    stratify=new_data["Bankrupt?"],            # Ensures the same class distribution in both sets
    random_state=0                             # Seed for reproducibility
)


In [23]:
svm = SVC(
    C=1,                        # Regularization parameter
    gamma='scale',             # Kernel coefficient
    kernel='rbf',              # Kernel type
    probability=True,          # Needed to get predict_proba
    class_weight='balanced',   # Handle class imbalance
    random_state=0
)

In [24]:
# Train the model on your training set
svm.fit(new_X_train, new_y_train)

In [25]:
# Predict labels for the test set
y_pred = svm.predict(new_X_test)
# Predict class probabilities for the test set (only positive class probabilities)
y_proba = svm.predict_proba(new_X_test)[:, 1]

In [26]:
# Print detailed classification metrics
print(classification_report(new_y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.85      0.92      1320
         1.0       0.16      0.86      0.27        44

    accuracy                           0.85      1364
   macro avg       0.58      0.86      0.59      1364
weighted avg       0.97      0.85      0.89      1364



In [27]:
# --- Save the trained model using pickle ---
with open('model.pkl', 'wb') as file:
    pickle.dump(svm, file)