In [18]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn import set_config
set_config(display='diagram')

In [19]:
# Load the dataset
df = pd.read_excel(r"E:\eXCELR PROJECt\bankruptcy-prevention.xlsx")  # Replace with your dataset file path

In [20]:
df

Unnamed: 0,industrial_risk,management_risk,financial_flexibility,credibility,competitiveness,operating_risk,class
0,0.5,1.0,0.0,0.0,0.0,0.5,bankruptcy
1,0.0,1.0,0.0,0.0,0.0,1.0,bankruptcy
2,1.0,0.0,0.0,0.0,0.0,1.0,bankruptcy
3,0.5,0.0,0.0,0.5,0.0,1.0,bankruptcy
4,1.0,1.0,0.0,0.0,0.0,1.0,bankruptcy
...,...,...,...,...,...,...,...
245,0.0,1.0,1.0,1.0,1.0,1.0,non-bankruptcy
246,1.0,1.0,0.5,1.0,1.0,0.0,non-bankruptcy
247,0.0,1.0,1.0,0.5,0.5,0.0,non-bankruptcy
248,1.0,0.0,0.5,1.0,0.5,0.0,non-bankruptcy


In [21]:
# Separate features and target variable
X = df.drop(columns=['class'])
y = df['class']

In [22]:
 ## Encode the target column
 encoder = LabelEncoder()
 y = encoder.fit_transform(y)

In [23]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Define the pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handling missing values
    ('scaler', StandardScaler()),  # Feature scaling
    ('classifier', RandomForestClassifier(random_state=42))  # Classifier
])

In [24]:
# Define the pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),  # Feature scaling
    ('classifier', RandomForestClassifier(random_state=42))  # Classifier
])

In [25]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
}

model = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
model.fit(X_train, y_train)

In [26]:
# Print best parameters
print("Best Parameters: ", model.best_params_)

Best Parameters:  {'classifier__max_depth': None, 'classifier__n_estimators': 100}


In [27]:
# Predict on test set
y_pred = model.predict(X_test)

In [28]:
# Evaluate the model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy:  1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        25
           1       1.00      1.00      1.00        50

    accuracy                           1.00        75
   macro avg       1.00      1.00      1.00        75
weighted avg       1.00      1.00      1.00        75



In [29]:
# Cross-validation score
cv_scores = cross_val_score(model.best_estimator_, X, y, cv=5)
print("Cross-validation scores: ", cv_scores)
print("Mean CV Score: ", cv_scores.mean())

Cross-validation scores:  [1. 1. 1. 1. 1.]
Mean CV Score:  1.0


In [30]:
pipeline

In [31]:
y_pred = model.predict(X_test)

In [32]:
y_pred

array([1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1])

In [33]:
model

In [34]:
import pickle

pickle.dump (model, open('bankurpcy_model.pkl', 'wb'))
