In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, cohen_kappa_score, accuracy_score
from sklearn.model_selection import GroupKFold, train_test_split
from xgboost import XGBClassifier

In [2]:
df1 = pd.read_csv('/Users/sun/Desktop/LST/EDUC 6191/CA2/ca1-dataset.csv')
df2 = pd.read_csv('/Users/sun/Desktop/LST/EDUC 6191/CA2/ca2-dataset.csv')

In [3]:
df2 = df2.groupby('Unique-id').last().reset_index()

In [4]:
# Create new column
# 1. 
df2['timelast5'] =  ((df2['timelast5SDnormed']-df2['timeSDnormed']>0)).astype(int)
# 2. 
df2['Not_Thinking'] =((df2['right'] == 0) & (df2['help'] == 0) & (df2['bug'] == 0) & (df2['time'] <= 20)).astype(int)
# 3. 
df2['timelast3'] = ((df2['timelast3SDnormed']-df2['timeSDnormed'] <0)).astype(int)
# 4. 
df2['helprecent'] = df2['recent8help'] / 8 + df2[' recent5wrong'] / 5 
# 5.
df2['help_count_by_prod'] = df2[df2['help'] == 1].groupby('prod')['help'].transform('sum')
# 6.
df2['timeSD'] = (((df2['timeSDnormed']*df2['timelast3SDnormed']*df2['timelast5SDnormed'])<0)).astype(int)
# 7.
df2['PrevC'] = (df2['Prev3Count-up']/3<df2['Prev5Count-up']/5).astype(int)
# 8.
df2['manyPrev'] = df2['manywrong-up'] - df2['helppct-up']  
# 9.
df2['react'] = ((df2['notright'] == 1)& (df2['time'] <= 15)).astype(int)
# 10. 
df2['recent'] = df2[' recent5wrong'] / 5 

In [5]:
new_features = df2[['Unique-id','timelast5','Not_Thinking','timelast3','helprecent','help_count_by_prod',
                   'timeSD','PrevC','react','manyPrev','recent']]

In [6]:
df = df1.merge(new_features, on='Unique-id', how='left')

In [7]:
group_dict = {}
groups = np.array([])
for index, row in df.iterrows():
    name = row['namea']
    if name not in group_dict:
        group_dict[name] = index
    groups = np.append(groups, group_dict[name])

#find no variance data
constant_columns = [col for col in df.columns if df[col].nunique() == 1]
df['OffTask'] = df['OffTask'].replace({'Y': 1, 'N': 0})
X = df.drop(columns=['OffTask', 'Unique-id', 'namea','AvgPrev3Count-up','AvgPrev5Count-up'] + constant_columns, axis=1)
y = df['OffTask']

In [8]:
gkf = GroupKFold(n_splits=10)

kappa_scores = []
accuracy_scores = []
roc_scores = []

for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and fit an XGBoost classifier with early stopping
    xgb_classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=600, 
        random_state=5,
    )
    
    xgb_classifier.fit(
        X_train, y_train,
    )

    # Make predictions
    y_pred = xgb_classifier.predict(X_test)

    # Calculate Cohen's Kappa for this fold
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_scores.append(kappa)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    roc = roc_auc_score(y_test, y_pred)
    roc_scores.append(roc)

# Calculate the mean Cohen's Kappa score across all folds
mean_kappa = sum(kappa_scores) / len(kappa_scores)
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_roc = sum(roc_scores) / len(roc_scores)

print(f"Mean Cohen's Kappa: {mean_kappa:.3f}")
print(f"Mean Accuracy: {mean_accuracy:.3f}")
print(f"Mean ROC: {mean_roc:.3f}")

Mean Cohen's Kappa: 0.364
Mean Accuracy: 0.962
Mean ROC: 0.653


In [9]:
for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and fit an XGBoost classifier with early stopping
    xgb_classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,  # A large number to ensure early stopping works
        random_state=5,
        eval_metric="logloss",  # Use log loss for early stopping
        early_stopping_rounds=10,  # Stop if the validation metric doesn't improve for 10 rounds
    )
    
    xgb_classifier.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],  # Validation set
        verbose=False  # Set to True if you want to see the training progress
    )

    # Make predictions
    y_pred = xgb_classifier.predict(X_test)

    # Calculate Cohen's Kappa for this fold
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_scores.append(kappa)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    roc = roc_auc_score(y_test, y_pred)
    roc_scores.append(roc)

# Calculate the mean Cohen's Kappa score across all folds
mean_kappa = sum(kappa_scores) / len(kappa_scores)
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_roc = sum(roc_scores) / len(roc_scores)

print(f"Mean Cohen's Kappa with early stop: {mean_kappa:.3f}")
print(f"Mean Accuracy with early stop: {mean_accuracy:.3f}")
print(f"Mean ROC with early stop: {mean_roc:.3f}")

Mean Cohen's Kappa with early stop: 0.334
Mean Accuracy with early stop: 0.963
Mean ROC with early stop: 0.637


In [10]:
group_dict = {}
groups = np.array([])
for index, row in df1.iterrows():
    name = row['namea']
    if name not in group_dict:
        group_dict[name] = index
    groups = np.append(groups, group_dict[name])

#find no variance data
constant_columns = [col for col in df1.columns if df1[col].nunique() == 1]
df1['OffTask'] = df1['OffTask'].replace({'Y': 1, 'N': 0})
X = df1.drop(columns=['OffTask', 'Unique-id', 'namea'] + constant_columns, axis=1)
y = df1['OffTask']
gkf = GroupKFold(n_splits=10)

kappa_scores = []
accuracy_scores = []
roc_scores = []

for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and fit an XGBoost classifier with early stopping
    xgb_classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=600,
        random_state=5,
    )
    
    xgb_classifier.fit(
        X_train, y_train,
    )


    # Make predictions
    y_pred = xgb_classifier.predict(X_test)

    # Calculate Cohen's Kappa for this fold
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_scores.append(kappa)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    roc = roc_auc_score(y_test, y_pred)
    roc_scores.append(roc)

# Calculate the mean Cohen's Kappa score across all folds
mean_kappa = sum(kappa_scores) / len(kappa_scores)
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_roc = sum(roc_scores) / len(roc_scores)

print(f"Old Mean Cohen's Kappa: {mean_kappa:.3f}")
print(f"Old Mean Accuracy: {mean_accuracy:.3f}")
print(f"Old Mean ROC: {mean_roc:.3f}")

Old Mean Cohen's Kappa: 0.353
Old Mean Accuracy: 0.964
Old Mean ROC: 0.638


In [11]:
for train_index, test_index in gkf.split(X, y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create and fit an XGBoost classifier with early stopping
    xgb_classifier = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,  # A large number to ensure early stopping works
        random_state=5,
        eval_metric="logloss",  # Use log loss for early stopping
        early_stopping_rounds=10,  # Stop if the validation metric doesn't improve for 10 rounds
    )
    
    xgb_classifier.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],  # Validation set
        verbose=False  # Set to True if you want to see the training progress
    )


    # Make predictions
    y_pred = xgb_classifier.predict(X_test)

    # Calculate Cohen's Kappa for this fold
    kappa = cohen_kappa_score(y_test, y_pred)
    kappa_scores.append(kappa)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    roc = roc_auc_score(y_test, y_pred)
    roc_scores.append(roc)

# Calculate the mean Cohen's Kappa score across all folds
mean_kappa = sum(kappa_scores) / len(kappa_scores)
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_roc = sum(roc_scores) / len(roc_scores)

print(f"Old Mean Cohen's Kappa with early stop: {mean_kappa:.3f}")
print(f"Old Mean Accuracy with early stop: {mean_accuracy:.3f}")
print(f"Old Mean ROC with early stop: {mean_roc:.3f}")

Old Mean Cohen's Kappa with early stop: 0.316
Old Mean Accuracy with early stop: 0.963
Old Mean ROC with early stop: 0.625
