In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold

In [2]:
#import
df = pd.read_csv('/Users/sun/Downloads/ca1-dataset.csv')

#Assign group
group_dict = {}
groups = np.array([])
for index, row in df.iterrows():
    name = row['namea']
    if name not in group_dict:
        group_dict[name] = index
    groups = np.append(groups, group_dict[name])

#find no variance data
constant_columns = [col for col in df.columns if df[col].nunique() == 1]


#drop data&assign x,y
df['OffTask'] = df['OffTask'].replace({'Y': 1, 'N': 0})
X = df.drop(columns=['OffTask', 'Unique-id', 'namea'] + constant_columns, axis=1)
y = df['OffTask']

In [3]:
# 10 fold
gkf = GroupKFold(n_splits=10)

kappa_scores = []
accuracy_scores = []
roc_scores = []

for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Create and fit an XGBoost classifier
    xgb_classifier = XGBClassifier(learning_rate=0.1, n_estimators=600, random_state=5)
    xgb_classifier.fit(X_train, y_train)
  
    # Make predictions
    y_pred = xgb_classifier.predict(X_test)

    # Calculate Cohen's Kappa for this fold
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_scores.append(kappa)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    roc = roc_auc_score(y_test, y_pred)
    roc_scores.append(roc)

#Calculate the mean Cohen's Kappa score across all folds
mean_kappa = sum(kappa_scores) / len(kappa_scores)
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_roc = sum(roc_scores) / len(roc_scores)

print(f"Mean Cohen's Kappa: {mean_kappa:.3f}")
print(f"Mean Accuracy: {mean_accuracy:.3f}")
print(f"Mean ROC: {mean_roc:.3f}")

Mean Cohen's Kappa: 0.353
Mean Accuracy: 0.964
Mean ROC: 0.638
