In [2]:
# Step 1 : Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Step 2 : Import Data
default = pd.read_csv('/content/Credit Default.csv')

In [None]:
default.head()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
0,66155.9251,59.017015,8106.532131,0.122537,0
1,34415.15397,48.117153,6564.745018,0.190752,0
2,57317.17006,63.108049,8020.953296,0.13994,0
3,42709.5342,45.751972,6103.64226,0.142911,0
4,66952.68885,18.584336,8770.099235,0.13099,1


In [None]:
default.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Income          2000 non-null   float64
 1   Age             2000 non-null   float64
 2   Loan            2000 non-null   float64
 3   Loan to Income  2000 non-null   float64
 4   Default         2000 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 78.2 KB


In [None]:
default.describe()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,45331.600018,40.927143,4444.369695,0.098403,0.1415
std,14326.327119,13.26245,3045.410024,0.05762,0.348624
min,20014.48947,18.055189,1.37763,4.9e-05,0.0
25%,32796.45972,29.062492,1939.708847,0.047903,0.0
50%,45789.11731,41.382673,3974.719418,0.099437,0.0
75%,57791.28167,52.596993,6432.410625,0.147585,0.0
max,69995.68558,63.971796,13766.05124,0.199938,1.0


In [None]:
# Count of each category
default['Default'].value_counts()

Unnamed: 0_level_0,count
Default,Unnamed: 1_level_1
0,1717
1,283


In [None]:
# Step 3 : Define target (y) and features (X)
y = default['Default']
X = default.drop(['Default'], axis=1)

In [None]:
# Step 4 : Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2529)

In [None]:
# Check shape of train and test sample
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1400, 4), (600, 4), (1400,), (600,))

In [None]:
# Step 5 : Select Logistic Regression Model
model = LogisticRegression()

In [None]:
# Step 6 : Train or Fit Logistic Regression Model
model.fit(X_train, y_train)
model.intercept_, model.coef_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(array([9.72655935]),
 array([[-2.38835452e-04, -3.48116982e-01,  1.72714014e-03,
          4.12484924e-01]]))

In [None]:
!pip install scikit-learn




In [None]:
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved as model.pkl")

In [None]:
from google.colab import files

files.download('model.pkl')


In [None]:
# Step 7 : Predict with Logistic Regression Model
y_pred = model.predict(X_test)

In [None]:
# Step 8 : Evaluate Logistic Regression Model
confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       519
           1       0.83      0.79      0.81        81

    accuracy                           0.95       600
   macro avg       0.90      0.88      0.89       600
weighted avg       0.95      0.95      0.95       600



In [None]:
# Ensemble Model Implementation
# Step 9: Ensembling with Voting Classifier
log_reg = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
svc_clf = SVC(probability=True, random_state=42)

In [None]:
# Combine models into a Voting Classifier
ensemble_model = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf_clf), ('svc', svc_clf)],
    voting='soft'
)

In [None]:
# Train Ensemble Model
ensemble_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluate Ensemble Model
ensemble_preds = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}")

Ensemble Model Accuracy: 0.98


In [None]:
# Confusion Matrix for Ensemble Model
confusion_matrix(y_test, ensemble_preds)
print(classification_report(y_test, ensemble_preds))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       519
           1       1.00      0.85      0.92        81

    accuracy                           0.98       600
   macro avg       0.99      0.93      0.95       600
weighted avg       0.98      0.98      0.98       600

