<a href="https://colab.research.google.com/github/ramya2110f/Ramz/blob/master/5foldcv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd
import xgboost as xgb
import scipy.special
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the raw dataset from the notebook's upload location.
df = pd.read_csv('/content/Dataset-Mental-Disorders.csv')

# Encode the target column in place as integer class ids. The fitted encoder
# is kept at module level so ids can be mapped back to the original labels.
label_encoder = LabelEncoder()
df["Expert Diagnose"] = label_encoder.fit_transform(df["Expert Diagnose"])

# Separate the feature table from the (already encoded) target.
X = df.drop(columns=["Expert Diagnose"])
y = df["Expert Diagnose"]


def _encode_if_not_numeric(col):
    """Return *col* unchanged if numeric, otherwise its integer label codes.

    The check is "not numeric" rather than ``dtype == 'object'`` so that text
    columns are still encoded when pandas stores them with a dedicated string
    dtype instead of ``object``; numeric (and boolean) columns pass through.
    Each non-numeric column gets its own throwaway ``LabelEncoder``.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col
    return LabelEncoder().fit_transform(col)


X = X.apply(_encode_if_not_numeric)

# Standardize every feature column. The scaler is fit on all rows of X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Custom focal loss function (Fixed)
def custom_focal_loss(preds, dtrain, gamma=2.0, alpha=0.5, eps=1e-6):
    """Focal-weighted softmax objective for multi-class XGBoost training.

    The gradient is the softmax cross-entropy gradient ``p - onehot(label)``
    multiplied per sample by ``alpha * (1 - p_t) ** gamma``, where ``p_t`` is
    the predicted probability of the true class. The focal weight is treated
    as a constant sample weight (it is not differentiated through). The
    Hessian is the diagonal softmax term ``p * (1 - p)``, left unscaled by the
    focal weight and clamped below by ``eps``.

    Args:
        preds: Raw scores (logits) as a 2-D array of shape
            ``(n_samples, n_classes)``.
        dtrain: Object exposing ``get_label()`` that returns one class id per
            row of ``preds`` (an ``xgb.DMatrix`` in training).
        gamma: Focusing exponent; larger values down-weight samples whose
            true class is already predicted with high probability.
        alpha: Constant multiplier applied to every sample's focal weight.
        eps: Lower bound for Hessian entries. Without it a saturated softmax
            (a probability of exactly 0 or 1) yields an exactly zero Hessian.

    Returns:
        Tuple ``(grad, hess)``, each with the same 2-D shape as ``preds``.
        The arrays are intentionally not flattened.
    """
    labels = dtrain.get_label().astype(int)  # labels may arrive as floats
    probs = scipy.special.softmax(preds, axis=1)  # logits -> probabilities
    rows = np.arange(len(labels))

    # Probability assigned to each sample's true class, and its focal weight.
    p_t = probs[rows, labels]
    focal_weight = alpha * (1 - p_t) ** gamma

    # Cross-entropy gradient (p - onehot), scaled per sample by the weight.
    grad = probs.copy()
    grad[rows, labels] -= 1
    grad *= focal_weight[:, np.newaxis]

    # Second-order term; the floor keeps every entry strictly positive.
    hess = np.maximum(probs * (1 - probs), eps)

    return grad, hess

# Booster parameters. No `evals` list is passed to xgb.train below, so
# "eval_metric" produces no per-round output in this script.
params = {
    "objective": "multi:softprob",
    "num_class": len(np.unique(y)),  # Number of unique classes
    "eval_metric": "mlogloss",  # Multi-class log loss
    "learning_rate": 0.1,
    "max_depth": 6,
    "lambda": 1.0,  # L2 regularization
}

# Stratified 5-fold cross-validation with a fixed seed for repeatable folds.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []

# kf.split yields row *positions*. Index the target through a plain array so
# those positions are never interpreted as pandas index labels.
y_values = np.asarray(y)

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y_values), 1):
    # Fit the scaler on the training rows only and reuse its statistics for
    # the held-out rows, so no test-fold information reaches preprocessing.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_idx])
    X_test = fold_scaler.transform(X.iloc[test_idx])
    y_train, y_test = y_values[train_idx], y_values[test_idx]

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train model using custom focal loss
    model = xgb.train(params, dtrain, num_boost_round=100, obj=custom_focal_loss)

    # Predict and evaluate. argmax picks the same class whether the scores
    # are probabilities or raw margins, since softmax preserves ordering.
    y_pred_probs = model.predict(dtest)
    y_pred = np.argmax(y_pred_probs, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

# Print all fold accuracies
print("\nCross-Validation Accuracy Scores:")
for i, acc in enumerate(fold_accuracies, 1):
    print(f"Fold {i}: {acc:.4f}")

# Print average accuracy
print(f"\nAverage Accuracy: {np.mean(fold_accuracies):.4f}")




Fold 1 Accuracy: 0.9167
Fold 2 Accuracy: 0.7500
Fold 3 Accuracy: 0.8750
Fold 4 Accuracy: 0.8333
Fold 5 Accuracy: 0.8750

Cross-Validation Accuracy Scores:
Fold 1: 0.9167
Fold 2: 0.7500
Fold 3: 0.8750
Fold 4: 0.8333
Fold 5: 0.8750

Average Accuracy: 0.8500
