# Loan Default Prediction System - Model Training

This notebook runs the training pipeline with full hyperparameter tuning (XGBoost, CatBoost, etc.) on a Google Colab GPU runtime. Select a GPU runtime before running the cells below.

In [None]:
# 0. Check for GPU
# Shells out to `nvidia-smi` and shows its output in the cell.
# NOTE(review): presumably this fails on a runtime without a GPU — confirm the
# Colab runtime type is set to GPU before continuing with the later cells.
!nvidia-smi

In [None]:
# 1. Clone the Repository (Safely)
import os
# Check if we are already in the repo to avoid nesting
if os.path.basename(os.getcwd()) == 'loan_defult_prediction_system':
    print("Already in the repository directory.")
else:
    if not os.path.exists('loan_defult_prediction_system'):
        !git clone https://github.com/rkpcode/loan_defult_prediction_system.git
    %cd loan_defult_prediction_system
    
!git pull  # Ensure we have latest code

In [None]:
# 2. Install Dependencies
!pip install -r requirements.txt
!pip install -e .

In [None]:
# 3. Data Setup (Kaggle Credentials)
import os
from google.colab import files

# Create artifacts directory if it doesn't exist
os.makedirs('artifacts', exist_ok=True)

# Setup Kaggle API
# The pipeline in src/utils.py will automatically download the data if it detects kaggle.json.
# We just need to put kaggle.json in the right place (~/.kaggle/kaggle.json).
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_key_path = os.path.join(kaggle_dir, 'kaggle.json')

print("Please upload your kaggle.json file here:")
# files.upload() returns a dict keyed by uploaded file name, with the file
# content (bytes) as values; it is empty when the upload is cancelled.
uploaded = files.upload()

# Prefer an exact 'kaggle.json'. On a re-run Colab appears to store a repeated
# upload under a de-duplicated name such as 'kaggle (1).json', so fall back to
# any uploaded 'kaggle*.json' instead of reporting the key as missing.
if 'kaggle.json' in uploaded:
    key_name = 'kaggle.json'
else:
    key_name = next(
        (name for name in uploaded
         if name.startswith('kaggle') and name.endswith('.json')),
        None,
    )

if key_name is not None:
    # Install the key straight from the uploaded bytes so this step does not
    # depend on where (or under which name) the upload was saved on disk.
    os.makedirs(kaggle_dir, exist_ok=True)
    with open(kaggle_key_path, 'wb') as key_file:
        key_file.write(uploaded[key_name])
    # Owner read/write only, same permissions as the previous `chmod 600`.
    os.chmod(kaggle_key_path, 0o600)
    print("Kaggle API key configured. The pipeline will download the dataset automatically.")
elif os.path.exists(kaggle_key_path):
    # Nothing usable was uploaded, but a key from an earlier run is in place.
    print(f"No kaggle.json uploaded; keeping the existing key at {kaggle_key_path}.")
else:
    print("kaggle.json not uploaded. The pipeline requires it to download the dataset automatically.")

In [None]:
# 4. Run Training Pipeline (Directly in Python)
# We run this directly in the notebook to ensure sys.path is correct and 'src' can be imported.

import sys
import os

# Add current directory to sys.path — only once, so re-running this cell does
# not keep appending duplicate entries.
repo_root = os.getcwd()
if repo_root not in sys.path:
    sys.path.append(repo_root)

try:
    # Project imports live inside the try block so that an import failure is
    # reported through the same handler as a pipeline failure.
    from src.loan_defult_prediction_system.components.data_ingestion import DataIngestion
    from src.loan_defult_prediction_system.components.data_transformation import DataTransformation
    from src.loan_defult_prediction_system.components.model_trainer import ModelTrainer
    from src.loan_defult_prediction_system.components.model_monitering import ModelMonitoring
    from src.loan_defult_prediction_system.logger import logging

    logging.info(">>>>> Training Pipeline Started in Colab <<<<<")

    # 1. Data Ingestion — yields the train and test data paths used below.
    print("Starting Step 1: Data Ingestion")
    obj = DataIngestion()
    train_data_path, test_data_path = obj.initiate_data_ingestion()
    print(f"Data Ingestion Completed. Train: {train_data_path}, Test: {test_data_path}")

    # 2. Data Transformation — the third returned value is not needed here and is discarded.
    print("Starting Step 2: Data Transformation")
    data_transformation = DataTransformation()
    train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data_path, test_data_path)
    print("Data Transformation Completed.")

    # 3. Model Training
    # NOTE(review): the result is stored as `accuracy` but reported as a ROC AUC
    # score — confirm which metric initiate_model_trainer actually returns.
    print("Starting Step 3: Model Training (This may take time due to Hyperparameter Tuning)")
    model_trainer = ModelTrainer()
    accuracy = model_trainer.initiate_model_trainer(train_arr, test_arr)
    print(f"Model Training Completed. ROC AUC Score: {accuracy:.4f}")

    # 4. Model Monitoring — returns the location of the generated report.
    print("Starting Step 4: Model Monitoring")
    model_monitoring = ModelMonitoring()
    report_path = model_monitoring.initiate_model_monitoring()
    print(f"Model Monitoring Completed. Report saved at: {report_path}")

except Exception as e:
    # Surface a short summary, then re-raise the active exception unchanged so
    # the notebook still stops here and shows the full traceback.
    print(f"An error occurred: {e}")
    raise

In [None]:
# 5. Detailed Evaluation & Model Metadata
import json
import os  # imported here too, so this cell does not rely on an earlier cell's import

metadata_path = 'artifacts/model_metadata.json'

if os.path.exists(metadata_path):
    with open(metadata_path, 'r', encoding='utf-8') as f:
        model_metadata = json.load(f)
    # Pretty-print as indented JSON so nested metadata is readable in the output.
    print(json.dumps(model_metadata, indent=2, ensure_ascii=False))
else:
    print("Model metadata not found. Training might have failed.")

In [None]:
# 6. Download the Best Model
import os  # imported here too, so this cell does not rely on an earlier cell's import
from google.colab import files

# Offer each artifact as a browser download; report the ones that are missing
# instead of silently doing nothing.
for artifact_path in ('artifacts/model.pkl', 'artifacts/model_metadata.json'):
    if os.path.exists(artifact_path):
        files.download(artifact_path)
    else:
        print(f"{artifact_path} not found; skipping download.")