# **CREDIT SCORING MODEL**

In [None]:
#@title Imports & Data Loading – Bring in Python libraries and load your dataset.

In [None]:
# 1. Imports & Data Loading
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import joblib

In [None]:
# Location of the raw CSV (replace with your own path/filename).
DATA_PATH = r'/content/credit_scoring.csv'

# Read the file into a DataFrame; the bare trailing expression renders it as the cell output.
credit_data = pd.read_csv(DATA_PATH)
credit_data

In [None]:
#@title  Exploratory Data Analysis (EDA) – quick checks
# Preview the first five rows of the loaded frame.
credit_data.head(n=5)

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
credit_data.isna().sum()

In [None]:
# Plain-text overview of the frame: first rows, schema, and missing-value counts.
print(credit_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
credit_data.info()
print(credit_data.isnull().sum())

**Reasoning**:
Examine the columns of the `credit_data` DataFrame to identify a column that indicates creditworthiness or a credit score. Since there is no explicit target variable, I will create one based on 'Credit Utilization Ratio' and 'Payment History'. A lower credit utilization ratio and higher payment history indicate better creditworthiness. I will define creditworthy as having a Credit Utilization Ratio less than 0.5 and Payment History greater than the median.



In [None]:
# List the available columns before the engineered target is added.
print(credit_data.columns)

# Threshold for "good" payment history: the median of that column.
median_payment_history = credit_data['Payment History'].median()

# A row is labelled creditworthy (1) only when BOTH conditions hold, otherwise 0.
low_utilization = credit_data['Credit Utilization Ratio'] < 0.5
strong_payment_history = credit_data['Payment History'] > median_payment_history
credit_data['Creditworthy'] = (low_utilization & strong_payment_history).astype(int)

# Show the two source columns next to the new label as a sanity check.
print(credit_data[['Credit Utilization Ratio', 'Payment History', 'Creditworthy']].head())

## **Define features & target**



**Reasoning**:
An earlier attempt failed because the target column was not correctly specified. The correct target is the `'Creditworthy'` column engineered in the previous step. This code block sets it as the target column and splits the data into features (X) and target (y).



In [None]:
# Name of the engineered binary label.
target_column = 'Creditworthy'

# 'Creditworthy' is computed directly from these two columns (a fixed threshold on
# each). Leaving them in X would let any model reconstruct the labelling rule
# (target leakage) and report inflated, meaningless scores, so they are excluded
# from the features together with the label itself.
target_source_columns = ['Credit Utilization Ratio', 'Payment History']

# drop() returns a new frame, so credit_data itself is left untouched.
X = credit_data.drop(columns=[target_column, *target_source_columns])
y = credit_data[target_column]

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

## **Data preprocessing**




**Reasoning**:
Identify categorical and numerical columns, apply one-hot encoding to categorical columns, scale numerical columns, and combine them.



In [None]:
# Partition the feature columns by dtype: object columns are one-hot encoded,
# int64/float64 columns are standardized.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include='object').columns

# One-hot encode the categorical columns; drop_first=True omits the first level
# of each one.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# NOTE(review): the scaler is fit on all rows, before the train/test split that
# happens later in the notebook, so test-set statistics influence the scaling.
# Consider fitting on the training split only.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(X_encoded[numerical_cols])
X_encoded[numerical_cols] = scaled_numeric

display(X_encoded.head())

## **Split data**



**Reasoning**:
Split the preprocessed data into training and testing sets.



In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every piece of the split.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split.shape)

## **Model selection & training**



**Reasoning**:
Train a Logistic Regression model using the training data.



In [None]:
# Construct and fit the logistic-regression classifier in one step (fit returns
# the estimator), then echo it so the cell still renders the fitted model.
log_reg_model = LogisticRegression(random_state=42).fit(X_train, y_train)
log_reg_model

**Reasoning**:
Train a Decision Tree Classifier model using the training data.



In [None]:
# Construct and fit the decision tree in one step (fit returns the estimator),
# then echo it so the cell still renders the fitted model.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_model

**Reasoning**:
Train a Random Forest Classifier model using the training data.



In [None]:
# Construct and fit the random forest in one step (fit returns the estimator),
# then echo it so the cell still renders the fitted model.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_model

## **Model evaluation**


Evaluate the trained model using appropriate metrics (e.g., precision, recall, F1-score, ROC-AUC).


**Reasoning**:
Make predictions and calculate evaluation metrics for each trained model.



In [None]:
# Hard class predictions and positive-class probabilities (column 1 of
# predict_proba) for each fitted model on the held-out test set.
y_pred_lr, y_proba_lr = log_reg_model.predict(X_test), log_reg_model.predict_proba(X_test)[:, 1]
y_pred_dt, y_proba_dt = dt_model.predict(X_test), dt_model.predict_proba(X_test)[:, 1]
y_pred_rf, y_proba_rf = rf_model.predict(X_test), rf_model.predict_proba(X_test)[:, 1]


def _score_against_test(y_pred, y_proba):
    """Return precision, recall, F1 and ROC-AUC for one model against y_test.

    y_pred holds the predicted class labels (used for precision/recall/F1);
    y_proba holds the positive-class probabilities (used for ROC-AUC).
    """
    return {
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba),
    }


# One entry per model, keyed by display name; reused by the bar-plot cell.
metrics = {
    'Logistic Regression': _score_against_test(y_pred_lr, y_proba_lr),
    'Decision Tree': _score_against_test(y_pred_dt, y_proba_dt),
    'Random Forest': _score_against_test(y_pred_rf, y_proba_rf),
}

# Text report: one section per model, each metric to four decimals.
for model, model_metrics in metrics.items():
    print(f"{model} Metrics:")
    for metric, value in model_metrics.items():
        print(f"  {metric}: {value:.4f}")
    print("-" * 20)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Relies on y_test and the positive-class probabilities y_proba_lr, y_proba_dt,
# y_proba_rf produced by the evaluation cell.

# ROC points per model; the thresholds returned by roc_curve are not needed.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_proba_dt)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)

# Area under each curve, shown in the legend.
auc_lr, auc_dt, auc_rf = (
    roc_auc_score(y_test, proba)
    for proba in (y_proba_lr, y_proba_dt, y_proba_rf)
)

# Draw all three curves plus the chance diagonal on a single set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.3f})')
roc_ax.plot(fpr_dt, tpr_dt, label=f'Decision Tree (AUC = {auc_dt:.3f})')
roc_ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate (FPR)')
roc_ax.set_ylabel('True Positive Rate (TPR)')
roc_ax.set_title('ROC Curve Comparison')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

# The first call creates the figure; keep its Axes so the other curves share it.
disp_lr = RocCurveDisplay.from_estimator(log_reg_model, X_test, y_test, name='Logistic Regression')
ax = disp_lr.ax_

# Overlay the remaining models on the same Axes.
for estimator, label in ((dt_model, 'Decision Tree'), (rf_model, 'Random Forest')):
    RocCurveDisplay.from_estimator(estimator, X_test, y_test, name=label, ax=ax)

ax.set_title('ROC Curve Comparison')
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import pandas as pd

# --- 6. Bar Plot for Metrics ---
# Rows = models, columns = metric names (same layout as transposing DataFrame(metrics)).
df_results = pd.DataFrame.from_dict(metrics, orient='index')

# Grouped bars: one group per model, one bar per metric.
metrics_ax = df_results.plot(kind='bar', figsize=(10, 6))
metrics_ax.set_title('Model Evaluation Metrics')
metrics_ax.set_ylabel('Score')
metrics_ax.set_ylim(0, 1)  # all four metrics lie in [0, 1]
plt.xticks(rotation=45)
metrics_ax.grid(axis='y', linestyle='--', alpha=0.7)
metrics_ax.legend(loc='lower right')
plt.tight_layout()
plt.show()

## Summary:

### Data Analysis Key Findings

*   A new binary target variable 'Creditworthy' was engineered based on 'Credit Utilization Ratio' and 'Payment History' because the original dataset lacked a target column for creditworthiness.
*   Categorical features were successfully one-hot encoded, and numerical features were scaled using `StandardScaler`.
*   The preprocessed data was split into training (80%) and testing (20%) sets.
*   Three classification models (Logistic Regression, Decision Tree, and Random Forest) were trained on the training data.
*   Model evaluation on the test set showed that the Decision Tree and Random Forest models achieved perfect scores (Precision, Recall, F1-score, ROC-AUC = 1.0000), while the Logistic Regression model also performed well but not perfectly.

### Insights or Next Steps

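*   The perfect scores achieved by the Decision Tree and Random Forest models should be read as a symptom of target leakage rather than of genuine predictive power (or of overfitting alone): 'Creditworthy' is a deterministic function of 'Credit Utilization Ratio' and 'Payment History', so any model that is given those two columns as features can simply recover the thresholding rule. The scores are only meaningful when those columns are excluded from the features, or when an independently observed target (e.g., actual repayment outcomes) is used. Further investigation into the data and the definition of 'Creditworthy' is needed.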
*   Consider using cross-validation during training to get a more robust estimate of model performance and explore more complex feature engineering or model architectures if needed.


**Y-DATA PROFILING**

In [None]:
!pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated and fails on newer IPython releases.
from IPython.display import HTML, display

# Build the profiling report for the frame (which by now also carries the
# engineered 'Creditworthy' column) and save it as a standalone HTML file.
prof = ProfileReport(credit_data)
prof.to_file(output_file='EDA.html')

# Also render the same report inline in the notebook.
display(HTML(prof.to_html()))