# 1. Load the Preprocessed Data

In [11]:
import os
import sys

# Indicate if the script is running on Google Colab or not
using_colab = 'google.colab' in sys.modules

if using_colab:
    # Connect Google Drive to Colab
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    
    # Install necessary packages
    !pip install numpy
    !pip install pandas
    !pip install sklearn
    !pip install matplotlib
    !pip install seaborn
    !pip install imbalanced-learn
    
    # Path for Google Colab
    project_root = '/content/gdrive/MyDrive/oc_projet_4/' 
else:
    # Get the current working directory as base directory for the notebook
    base_dir = os.getcwd()
    
    # Adjust the project root path relatively to where the notebook is located
    # Assuming the notebook is inside a 'notebooks' directory and we need to go up one level to access project root
    project_root = os.path.join(base_dir, '..')

# Clean output of cell
from IPython.display import clear_output
clear_output()

import numpy as np

# Set the correct path for the NPZ file
data_path = os.path.join(project_root, 'dataapp_domain_train_test_sets.npz')

# Load the dataset from the NPZ file
data = np.load(data_path, allow_pickle=True)

# Extract the training and testing sets
X_train = data['X_train']  
X_test = data['X_test'] 
y_train = data['y_train']
y_test = data['y_test']

print('Data loaded successfully.')
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

Data loaded successfully.
X_train shape: (215257, 244)
X_test shape: (92254, 244)
y_train shape: (215257,)
y_test shape: (92254,)


# 2.Hyperparameter Tuning via GridSearchCV and Imbalanced-learn Pipeline

The objective of this code is to:
1. **Set Up Resampling Techniques**: Define over-sampling (SMOTE) and under-sampling (RandomUnderSampler) methods to handle class imbalance in the dataset.
2. **Initialize Models**: Create instances of RandomForestClassifier and RidgeClassifier for classification tasks.
3. **Define Hyperparameter Grids**: Specify parameter grids for tuning hyperparameters of the classifiers using GridSearchCV.
4. **Create Pipelines**: Construct pipelines to integrate resampling techniques with the classifiers.
5. **Perform Hyperparameter Tuning**: Use GridSearchCV to find the best hyperparameters and resampling techniques for each classifier, evaluating them using cross-validation and storing the best models.

In [12]:
from sklearn.ensemble import RandomForestClassifier # Random Forest is a good choice for classification tasks
from sklearn.linear_model import RidgeClassifier # Ridge Classifier is a linear model suitable for classification tasks
from sklearn.model_selection import GridSearchCV # GridSearchCV for hyperparameter tuning and cross-validation
from imblearn.pipeline import Pipeline # Pipeline for chaining resampling techniques with the classifier
from imblearn.over_sampling import SMOTE # SMOTE for over-sampling
from imblearn.under_sampling import RandomUnderSampler # RandomUnderSampler for under-sampling

# Define the resampling techniques
over_sampler = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)

# Define the models
rf = RandomForestClassifier(random_state=42)
ridge = RidgeClassifier()

# Define the parameter grids for GridSearchCV
param_grid_rf = {
    'classifier__n_estimators': [100, 200], # Number of trees in the forest
    'classifier__max_depth': [10, 20], # Maximum depth of the tree
    'classifier__class_weight': [None, 'balanced'] # Weights associated with classes in the form {class_label: weight}
}

param_grid_ridge = {
    'classifier__alpha': [1.0, 0.1, 0.01], # Regularization strength
    'classifier__class_weight': [None, 'balanced'] # Weights associated with classes in the form {class_label: weight}
}

# Define pipelines
pipeline_rf_over = Pipeline([
    ('oversample', over_sampler), 
    ('classifier', rf)
])

pipeline_rf_under = Pipeline([
    ('undersample', under_sampler),
    ('classifier', rf)
])

pipeline_rf_weight = Pipeline([
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

pipeline_ridge_over = Pipeline([
    ('oversample', over_sampler),
    ('classifier', ridge)
])

pipeline_ridge_under = Pipeline([
    ('undersample', under_sampler),
    ('classifier', ridge)
])

pipeline_ridge_weight = Pipeline([
    ('classifier', RidgeClassifier(class_weight='balanced'))
])

# Perform GridSearchCV for each pipeline
# pipeline is a sequence of fit-transform steps ending with a fit
# Store the best estimators in a dictionary

grids = [
    (pipeline_rf_over, param_grid_rf, 'Random Forest with Over-sampling'), # 
    (pipeline_rf_under, param_grid_rf, 'Random Forest with Under-sampling'),
    (pipeline_rf_weight, param_grid_rf, 'Random Forest with Class Weight'),
    (pipeline_ridge_over, param_grid_ridge, 'Ridge with Over-sampling'),
    (pipeline_ridge_under, param_grid_ridge, 'Ridge with Under-sampling'),
    (pipeline_ridge_weight, param_grid_ridge, 'Ridge with Class Weight')
]

best_estimators = {} # Dictionary to store the best estimators

for pipeline, param_grid, name in grids:
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best ROC AUC for {name}: {grid_search.best_score_}")



Best parameters for Random Forest with Over-sampling: {'classifier__class_weight': None, 'classifier__max_depth': 20, 'classifier__n_estimators': 200}
Best ROC AUC for Random Forest with Over-sampling: 0.706770540515098
Best parameters for Random Forest with Under-sampling: {'classifier__class_weight': None, 'classifier__max_depth': 20, 'classifier__n_estimators': 200}
Best ROC AUC for Random Forest with Under-sampling: 0.739534466894729
Best parameters for Random Forest with Class Weight: {'classifier__class_weight': None, 'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Best ROC AUC for Random Forest with Class Weight: 0.7340727428197145


Ill-conditioned matrix (rcond=7.51842e-18): result may not be accurate.


Best parameters for Ridge with Over-sampling: {'classifier__alpha': 1.0, 'classifier__class_weight': None}
Best ROC AUC for Ridge with Over-sampling: 0.745725332792906


Ill-conditioned matrix (rcond=6.55936e-17): result may not be accurate.


Best parameters for Ridge with Under-sampling: {'classifier__alpha': 1.0, 'classifier__class_weight': None}
Best ROC AUC for Ridge with Under-sampling: 0.7446270382358854
Best parameters for Ridge with Class Weight: {'classifier__alpha': 1.0, 'classifier__class_weight': 'balanced'}
Best ROC AUC for Ridge with Class Weight: 0.746318807175601


Ill-conditioned matrix (rcond=1.34498e-17): result may not be accurate.


### Interpretation of the Results

#### Summary

1. **Random Forest with Over-sampling**:
   - **Best Parameters**: `{'classifier__class_weight': None, 'classifier__max_depth': 20, 'classifier__n_estimators': 200}`
   - **Best ROC AUC**: `0.706770540515098`

2. **Random Forest with Under-sampling**:
   - **Best Parameters**: `{'classifier__class_weight': None, 'classifier__max_depth': 20, 'classifier__n_estimators': 200}`
   - **Best ROC AUC**: `0.739534466894729`

3. **Random Forest with Class Weight**:
   - **Best Parameters**: `{'classifier__class_weight': None, 'classifier__max_depth': 10, 'classifier__n_estimators': 200}`
   - **Best ROC AUC**: `0.7340727428197145`

4. **Ridge with Over-sampling**:
   - **Best Parameters**: `{'classifier__alpha': 1.0, 'classifier__class_weight': None}`
   - **Best ROC AUC**: `0.745725332792906`

5. **Ridge with Under-sampling**:
   - **Best Parameters**: `{'classifier__alpha': 1.0, 'classifier__class_weight': None}`
   - **Best ROC AUC**: `0.7446270382358854`

6. **Ridge with Class Weight**:
   - **Best Parameters**: `{'classifier__alpha': 1.0, 'classifier__class_weight': 'balanced'}`
   - **Best ROC AUC**: `0.746318807175601`

#### Observations

1. **Random Forest**:
   - The best parameters for both over-sampling and under-sampling are the same, suggesting that the model configuration is robust across different resampling strategies.
   - The ROC AUC for under-sampling (0.7395) is higher than for over-sampling (0.7068), indicating that under-sampling might be a better strategy for the Random Forest model in this case.
   - Using class weight adjustment in Random Forest shows an improvement in ROC AUC compared to over-sampling but still underperforms compared to under-sampling.

2. **Ridge Classifier**:
   - The best parameters for both over-sampling and under-sampling are the same.
   - The ROC AUC values are very close for over-sampling (0.7457) and under-sampling (0.7446), indicating that both resampling strategies perform similarly well for the Ridge Classifier.
   - Using class weight adjustment in Ridge Classifier gives a slightly better ROC AUC (0.7463), making it the best-performing method overall.

#### Warnings

The warnings you received:
```
C:\Users\pat\.conda\envs\P4\lib\site-packages\sklearn\linear_model\_ridge.py:212: LinAlgWarning: Ill-conditioned matrix (rcond=7.51842e-18): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T

C:\Users\pat\.conda\envs\P4\lib\site-packages\sklearn\linear_model\_ridge.py:212: LinAlgWarning: Ill-conditioned matrix (rcond=6.55936e-17): result may not be accurate.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
```
These warnings indicate that the Ridge regression encountered ill-conditioned matrices, which means that the matrix used in the ridge regression solver has very small singular values, causing potential numerical instability. This can happen when there are highly correlated features in the dataset.

#### Recommendations

1. **Addressing Ill-Conditioned Matrix Warnings**:
   - **Feature Engineering**: Investigate and possibly remove or combine highly correlated features to reduce multicollinearity.
   - **Regularization**: Consider using stronger regularization (increase the `alpha` parameter) to help stabilize the solution.
   - **Scaling**: Ensure that the features are properly scaled, as Ridge regression can be sensitive to the scale of the features.

2. **Model Selection**:
   - **Ridge Classifier**: Given the higher ROC AUC scores, the Ridge Classifier seems to perform better than the Random Forest in this case.
   - **Resampling Strategy**: Both over-sampling and under-sampling show similar performance for the Ridge Classifier, but under-sampling performs better for Random Forest. You might choose the strategy based on other factors such as computational efficiency or interpretability.

# 3.!!!!!!!!
1. **Model Evaluation**:
   - Evaluate the best models on the test set using confusion matrix, ROC AUC, and other metrics.

2. **Feature Importance**:
   - Use SHAP or similar methods to interpret the models and understand the importance of different features.

3. **Documentation and Reporting**:
   - Document the findings, including the best parameters, ROC AUC scores, and any observations regarding the resampling strategies and model performance.

# 3.Evaluate the Best Model

The objective of this code is to:
1. **Select the Best Model**: Identify and select the best model from the hyperparameter-tuned models stored in `best_estimators` based on their performance on the test set.
2. **Make Predictions**: Use the selected best model to predict the target variable on the test dataset.
3. **Evaluate Performance**: Calculate the confusion matrix and ROC AUC score to assess the model's performance on the test data.
4. **Check for Overfitting**: Ensure that the ROC AUC score is below 0.82 to avoid overfitting, issuing a warning if the score is higher.
5. **Generate Detailed Metrics**: Print a comprehensive classification report including precision, recall, and F1-score for a detailed evaluation of the model's performance.

In [13]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

# Select the best model (for example purposes, choosing the best model based on GridSearchCV results)
best_model_name = max(best_estimators, key=lambda name: best_estimators[name].score(X_test, y_test))
best_model = best_estimators[best_model_name]

# Predict on the test set
y_pred = best_model.predict(X_test)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Compute ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC: {roc_auc}")

# Ensure ROC AUC < 0.82
if roc_auc >= 0.82:
    print("Warning: ROC AUC score is greater than or equal to 0.82. Model might be overfitting.")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[84806     0]
 [ 7447     1]]
ROC AUC: 0.5000671321160043
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84806
           1       1.00      0.00      0.00      7448

    accuracy                           0.92     92254
   macro avg       0.96      0.50      0.48     92254
weighted avg       0.93      0.92      0.88     92254



### Interpretation of the Results

#### Confusion Matrix
```
Confusion Matrix:
[[84806     0]
 [ 7447     1]]
```
The confusion matrix shows:
- True Negatives (TN): 84,806 (clients correctly identified as not defaulting)
- False Positives (FP): 0 (clients incorrectly identified as defaulting)
- False Negatives (FN): 7,447 (clients incorrectly identified as not defaulting)
- True Positives (TP): 1 (clients correctly identified as defaulting)

#### ROC AUC Score
```
ROC AUC: 0.5000671321160043
```
The ROC AUC score is approximately 0.50, which is equivalent to random guessing. This indicates that the model has no discriminative power in distinguishing between classes (default vs. non-default).

#### Classification Report
```
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84806
           1       1.00      0.00      0.00      7448

    accuracy                           0.92     92254
   macro avg       0.96      0.50      0.48     92254
weighted avg       0.93      0.92      0.88     92254
```
- **Class 0 (Non-defaulting clients)**:
  - Precision: 0.92 (The proportion of clients predicted not to default that actually did not default)
  - Recall: 1.00 (The proportion of actual non-defaulting clients correctly predicted)
  - F1-Score: 0.96 (Harmonic mean of precision and recall)
  - Support: 84,806 (Number of actual non-defaulting clients)

- **Class 1 (Defaulting clients)**:
  - Precision: 1.00 (The proportion of clients predicted to default that actually defaulted)
  - Recall: 0.00 (The proportion of actual defaulting clients correctly predicted)
  - F1-Score: 0.00 (Harmonic mean of precision and recall, which is 0 due to recall being 0)
  - Support: 7,448 (Number of actual defaulting clients)

- **Overall Metrics**:
  - Accuracy: 0.92 (The proportion of total correct predictions)
  - Macro Average: Average precision, recall, and F1-score for both classes (treating all classes equally)
  - Weighted Average: Average precision, recall, and F1-score for both classes (considering the support of each class)

### Observations and Insights
1. **Model Performance**:
   - The model performs exceptionally well in predicting non-defaulting clients (Class 0) but fails to predict defaulting clients (Class 1).
   - The recall for defaulting clients is 0, indicating that the model did not identify any defaulting clients correctly.

2. **ROC AUC Score**:
   - The ROC AUC score of 0.50 indicates that the model has no discriminative ability, performing no better than random guessing.

3. **Imbalance Issue**:
   - The classification report and confusion matrix highlight a significant class imbalance issue, where the model is biased towards the majority class (non-defaulting clients).

4. **Need for Improvement**:
   - The current model is inadequate for practical use in credit scoring due to its failure to identify defaulting clients.
   - Resampling techniques (like SMOTE, under-sampling, and class weighting) or different models need to be explored further to address the imbalance and improve the model’s performance.

### Recommendations
1. **Reassess the Model**:
   - Consider using more sophisticated techniques for handling class imbalance, such as SMOTE with Tomek links or ensemble methods like balanced random forests.
   - Experiment with other classifiers that might handle imbalance better, such as XGBoost or LightGBM.

2. **Feature Engineering**:
   - Review and improve feature engineering to ensure relevant features are being used, which might help the model differentiate better between classes.

3. **Evaluate Data**:
   - Ensure data quality and consider additional preprocessing steps to handle any underlying issues that might be affecting model performance.

By focusing on improving the handling of class imbalance and re-evaluating the features used, the model's ability to predict both defaulting and non-defaulting clients can be significantly enhanced.

# 4.Feature Importance

## 4.1 Global Feature Importance Using SHAP

The objective of this code is to:
1. **Initialize SHAP Explainer**: Create a SHAP explainer object for the best model's classifier using the training data.
2. **Compute SHAP Values**: Generate SHAP values for the test dataset to explain the model's predictions.
3. **Global Feature Importance**: Visualize the global feature importance using SHAP summary plot, which provides insights into how each feature contributes to the model's predictions.

In [14]:
import shap
import pandas as pd

# Convert the data to a DataFrame if it's not already one
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

# Check and convert categorical variables to numeric
X_train_df = pd.get_dummies(X_train_df)
X_test_df = pd.get_dummies(X_test_df)

# Align the training and testing data to have the same columns
X_train_df, X_test_df = X_train_df.align(X_test_df, join='inner', axis=1)

# Fit the explainer on the training data
explainer = shap.Explainer(best_model['classifier'], X_train_df)
shap_values = explainer(X_test_df)

# Global feature importance
shap.summary_plot(shap_values, X_test_df)



TypeError: Cannot cast array data from dtype('O') to dtype('float64') according to the rule 'safe'

## 4.2 Local Feature Importance Using SHAP

The objective of this code is to:
1. **Initialize SHAP Visualization**: Set up SHAP's JavaScript visualization framework to enable interactive plots.
2. **Generate Local Explanation**: Create a SHAP force plot for a single instance from the test dataset, showing how each feature contributes to the model's prediction for that specific instance.
3. **Visualize Prediction Breakdown**: Provide a detailed breakdown of the prediction for the selected instance, illustrating the impact of each feature on the predicted value.

In [None]:
import shap
import pandas as pd

# Convert the data to a DataFrame if it's not already one
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

# Check and convert categorical variables to numeric
X_train_df = pd.get_dummies(X_train_df)
X_test_df = pd.get_dummies(X_test_df)

# Align the training and testing data to have the same columns
X_train_df, X_test_df = X_train_df.align(X_test_df, join='inner', axis=1)

# Fit the explainer on the training data
explainer = shap.Explainer(best_model['classifier'], X_train_df)
shap_values = explainer(X_test_df)

# Global feature importance
shap.summary_plot(shap_values, X_test_df)

# Local explanation for a single instance
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test_df.iloc[0,:])

## Glossary

### Cross-Validation
A technique used to assess the performance of a model by splitting the dataset into multiple training and testing sets. This helps in understanding how the model will generalize to an independent dataset.

### GridSearchCV
A tool from `scikit-learn` that performs hyperparameter tuning by exhaustively searching through a specified parameter grid to find the best combination of hyperparameters for a given model, using cross-validation.

### Hyperparameter Tuning
The process of finding the optimal values for hyperparameters of a model, which are parameters that are not learned from data but set before the learning process begins. Examples include the number of trees in a random forest or the regularization strength in Ridge regression.

### ROC AUC Score
A performance measurement for classification problems at various threshold settings. ROC AUC represents the area under the Receiver Operating Characteristic curve, which plots the true positive rate against the false positive rate. A score closer to 1 indicates better performance.

### Confusion Matrix
A table used to describe the performance of a classification model by showing the actual vs. predicted classifications. It includes True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN).

### Classification Report
A detailed report showing the precision, recall, F1-score, and support for each class in a classification problem. This helps in understanding the performance of the model across different classes.

### Resampling Techniques
Methods used to adjust the class distribution of a dataset, commonly used to handle class imbalance. Examples include:
- **SMOTE (Synthetic Minority Over-sampling Technique)**: Generates synthetic samples for the minority class.
- **RandomUnderSampler**: Reduces the number of samples in the majority class.

### Pipeline
A tool from `scikit-learn` that allows for chaining multiple processing steps (e.g., data transformation and model fitting) into a single object. This ensures that all steps are applied consistently during both training and testing.

### SHAP (SHapley Additive exPlanations)
A method to explain individual predictions of machine learning models by assigning each feature an importance value. It helps in understanding how the model arrives at its predictions.

### SHAP Explainer
An object in the SHAP library that is used to calculate SHAP values for a given model. It helps in interpreting the contributions of each feature to the predictions.

### SHAP Values
Values calculated by the SHAP explainer that quantify the contribution of each feature to the model's prediction for a given instance. Higher absolute values indicate greater impact on the prediction.

### SHAP Summary Plot
A plot that visualizes the global importance of features by showing the distribution of SHAP values for each feature across all instances in the dataset. It helps in understanding which features are most influential for the model.

### SHAP Force Plot
A plot that provides a detailed breakdown of the contributions of each feature to a single instance's prediction, illustrating how different features push the prediction towards or away from the base value.

### Random Forest
An ensemble learning method that constructs multiple decision trees during training and outputs the mode of the classes (classification) or mean prediction (regression) of the individual trees.

### Ridge Classifier
A linear model for classification that includes L2 regularization (Ridge regression) to prevent overfitting by penalizing large coefficients.

### Class Weight
A parameter used to handle class imbalance by assigning different weights to different classes, typically to give more importance to the minority class. This can be set to 'balanced' to automatically adjust weights inversely proportional to class frequencies.

### RandomUnderSampler
A resampling technique that reduces the number of instances in the majority class by randomly sampling without replacement, used to balance the class distribution.

### SMOTE
A resampling technique that generates synthetic samples for the minority class by interpolating between existing minority class instances. It is used to balance the class distribution in the dataset.
