In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
# 1. Load the data
df = pd.read_csv('D:/Advance Compt Science/ML/ML/MID_ML/Lec_06_Logistic_regression/Credit_Scoring/dataset/cs-training.csv')

# Check the shape of the dataset
print(f"Dataset shape: {df.shape}")

# Check for non-null counts in each column
df.info()

# 2. Handle missing values (e.g., by replacing 'NA' with NaN and then imputing)
df.replace('NA', np.nan, inplace=True)

# Option 1: Impute missing values with the median
df.fillna(df.median(), inplace=True)

# 3. Split the dataset into features (X) and target (y)
X = df.drop('SeriousDlqin2yrs', axis=1)  # All columns except the target
y = df['SeriousDlqin2yrs']  # The target column

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Create a pipeline with scaling and logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(max_iter=1000))
])

# 6. Fit the model using the pipeline
pipeline.fit(X_train, y_train)

# 7. Make predictions
y_pred_logistic = pipeline.predict(X_test)
y_pred_prob_logistic = pipeline.predict_proba(X_test)[:, 1]

# 8. Evaluate the model
accuracy = accuracy_score(y_test, y_pred_logistic)
roc_auc = roc_auc_score(y_test, y_pred_prob_logistic)

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_logistic))


Dataset shape: (150000, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  15

In [12]:
# 9. Feature Importance (Optional)
# Logistic regression coefficients can be used to understand the importance of features
feature_importance = pd.Series(pipeline.named_steps['logistic_regression'].coef_[0], index=X.columns)
print("\nFeature Importance:")
print(feature_importance.sort_values(ascending=False))


Feature Importance:
NumberOfTime30-59DaysPastDueNotWorse    2.109796
NumberOfTimes90DaysLate                 2.078339
NumberOfDependents                      0.093019
NumberRealEstateLoansOrLines            0.069673
Unnamed: 0                              0.005044
RevolvingUtilizationOfUnsecuredLines   -0.017755
NumberOfOpenCreditLinesAndLoans        -0.029035
DebtRatio                              -0.039291
age                                    -0.431364
MonthlyIncome                          -0.455484
NumberOfTime60-89DaysPastDueNotWorse   -4.024526
dtype: float64


In [13]:
# 5. Train Decision Tree model

decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
y_pred_tree = decision_tree_model.predict(X_test)
y_pred_prob_tree = decision_tree_model.predict_proba(X_test)[:, 1]

In [14]:


# 6. Train Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100)
random_forest_model.fit(X_train, y_train)
y_pred_forest = random_forest_model.predict(X_test)
y_pred_prob_forest = random_forest_model.predict_proba(X_test)[:, 1]

# 7. Train Gradient Boosting model
gbm_model = GradientBoostingClassifier(n_estimators=100)
gbm_model.fit(X_train, y_train)
y_pred_gbm = gbm_model.predict(X_test)
y_pred_prob_gbm = gbm_model.predict_proba(X_test)[:, 1]

# 8. Train Support Vector Machine model
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
y_pred_prob_svm = svm_model.predict_proba(X_test)[:, 1]



In [None]:
# 9. Evaluate Models
def evaluate_model(y_test, y_pred, y_pred_prob):
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

print("\nLogistic Regression:")
evaluate_model(y_test, y_pred_logistic, y_pred_prob_logistic)

print("\nDecision Tree:")
evaluate_model(y_test, y_pred_tree, y_pred_prob_tree)

print("\nRandom Forest:")
evaluate_model(y_test, y_pred_forest, y_pred_prob_forest)

print("\nGradient Boosting Machine:")
evaluate_model(y_test, y_pred_gbm, y_pred_prob_gbm)

print("\nSupport Vector Machine:")
evaluate_model(y_test, y_pred_svm, y_pred_prob_svm)

# 10. Cross-Validation (Optional but recommended)
models = {
    'Logistic Regression': logistic_model,
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model,
    'Gradient Boosting': gbm_model,
    'SVM': svm_model
}

for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print(f"{model_name} Average ROC AUC: {scores.mean():.4f}")