In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from hyperopt import hp
import random
from sklearn.svm import SVC
from scipy.stats import mode
import shap
from sklearn.inspection import permutation_importance

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
tabular= pd.read_csv('/Users/finnschonknecht/Desktop/XGB_train_folder/tabular.csv')
targets = pd.read_csv('/Users/finnschonknecht/Desktop/XGB_train_folder/binary_personalised_targets.csv')
personality_df = pd.read_csv('/Users/finnschonknecht/Desktop/XGB_train_folder/personality_df.csv')

In [3]:
p_code_counts = tabular.groupby('Pcode').size().reset_index(name='counts')
p_code_counts = pd.DataFrame(p_code_counts)

In [4]:
p_code_counts.counts.max()

109

In [5]:
tabular

Unnamed: 0,ResponseTime,Pcode,Stress_binary,Valence_binary,Arousal_binary,Age,Gender,Openness,Conscientiousness,Neuroticism,...,6hr_WEATHER_number_of_events,previous_nights_sleep_proxy,hour_of_day,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,2019-05-08 09:09:48+09:00,P19,0.0,1.0,1.0,18,M,12,13,3,...,,0.761660,9,0,0,1,0,0,0,0
1,2019-05-08 10:13:10+09:00,P19,1.0,0.0,1.0,18,M,12,13,3,...,,0.761660,10,0,0,1,0,0,0,0
2,2019-05-08 10:41:14+09:00,P19,1.0,1.0,1.0,18,M,12,13,3,...,,0.761660,10,0,0,1,0,0,0,0
3,2019-05-08 11:24:43+09:00,P19,1.0,1.0,0.0,18,M,12,13,3,...,,0.761660,11,0,0,1,0,0,0,0
4,2019-05-08 12:10:15+09:00,P19,0.0,1.0,1.0,18,M,12,13,3,...,,0.761660,12,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5433,2019-05-05 19:37:56+09:00,P62,0.0,1.0,0.0,20,M,15,15,9,...,,11.066579,19,0,0,0,0,0,0,1
5434,2019-05-05 20:45:53+09:00,P62,1.0,1.0,1.0,20,M,15,15,9,...,,11.066579,20,0,0,0,0,0,0,1
5435,2019-05-05 21:43:23+09:00,P62,1.0,0.0,0.0,20,M,15,15,9,...,,11.066579,21,0,0,0,0,0,0,1
5436,2019-05-05 23:10:51+09:00,P62,0.0,1.0,0.0,20,M,15,15,9,...,,11.066579,23,0,0,0,0,0,0,1


In [18]:
personality_columns = ['Openness', 'Agreeableness', 'Conscientiousness', 'Extraversion', 'Neuroticism']

In [19]:
tabular.fillna(0, inplace=True)

In [20]:
unique_ids = tabular['Pcode'].unique()
random.seed(150)   
test_ids = np.random.choice(unique_ids, 15, replace=False)

In [42]:
max_count_row = p_code_counts.loc[p_code_counts['counts'].idxmax()]

In [43]:
max_count_row

Pcode     P75
counts    109
Name: 69, dtype: object

In [21]:
test_data= tabular[tabular['Pcode'].isin(test_ids)]
train_data= tabular[~tabular['Pcode'].isin(test_ids)]

In [22]:
def encode_dataframe(df):
    # Encode the Gender column
    if 'Gender' in df.columns:
        le = LabelEncoder()
        df['Gender'] = le.fit_transform(df['Gender'])

    # Convert object columns to category if needed
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype('category')
    
    return df

def scale_numeric_columns(df_train, df_test, exclude_columns):
    # Identify numeric columns
    numeric_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

    # Exclude specified columns from numeric columns
    numeric_cols = [col for col in numeric_cols if col not in exclude_columns]

    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform the training data
    df_train_scaled = df_train.copy()
    df_train_scaled[numeric_cols] = scaler.fit_transform(df_train[numeric_cols])

    # Transform the test data using the same scaler
    df_test_scaled = df_test.copy()
    df_test_scaled[numeric_cols] = scaler.transform(df_test[numeric_cols])

    return df_train_scaled, df_test_scaled

def create_personality_flags(df, personality_columns):
    for col in personality_columns:
        df[f'{col}_High'] = (df[col] >= 9).astype(int)
    return df

In [23]:
train_data, test_data = scale_numeric_columns(
    train_data, test_data, ['Stress_binary', 'Valence_binary', 'Arousal_binary', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6'])

In [25]:
def SVMGroupKFoldCV(data, train, test, unique_ids, test_ids, idcolumn, outcomevar, dropcols=[], n_splits=5, n_jobs=-1):
   
    train_data = encode_dataframe(train.copy())
    test_data = encode_dataframe(test.copy())
    
    best_hyperparams = []
    F1 = []

    group_kfold = GroupKFold(n_splits=n_splits)

    param_dist = {
        'C': [0.01, 0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf']
    }

    # Step 2: Perform GroupKFold on the remaining data
    best_f1_score = -1  # Initialize to a very low value
    best_params_overall = None

    for train_index, val_index in group_kfold.split(train_data, groups=train_data[idcolumn]):
        data_train = train_data.iloc[train_index]
        data_val = train_data.iloc[val_index]

        svm = SVC(probability = True, random_state=0)
        random_search = RandomizedSearchCV(svm, param_dist, cv=3, scoring='f1_macro', n_jobs=n_jobs, random_state= 0, n_iter= 10)
        random_search.fit(data_train.drop(columns=dropcols + [outcomevar]), data_train[outcomevar])

        best_params = random_search.best_params_
        best_hyperparams.append(best_params)

        svm_best = SVC(**best_params, probability = True , random_state=0)
        svm_best.fit(data_train.drop(columns=dropcols + [outcomevar]), data_train[outcomevar])

        predictions = svm_best.predict(data_val.drop(columns=dropcols + [outcomevar]))
        f1 = f1_score(data_val[outcomevar], predictions, average='macro')
        F1.append(f1)

        if f1 > best_f1_score:
            best_f1_score = f1
            best_params_overall = best_params

        print('...Fold processing complete.')

    mean_F1_micro = np.mean(F1)
    std_F1_micro = np.std(F1)

    # Use the best hyperparameters to train the final model
    final_model = SVC(**best_params_overall, probability= True , random_state=0)
    final_model.fit(train_data.drop(columns=dropcols + [outcomevar]), train_data[outcomevar])

    # Predict on the separate test set
    test_predictions = final_model.predict(test_data.drop(columns=dropcols + [outcomevar]))
    test_F1_score = f1_score(test_data[outcomevar], test_predictions, average='macro')
    test_confusion_matrix = confusion_matrix(test_data[outcomevar], test_predictions)

    # Print metrics
    print(f'Mean F1 (Overall): {mean_F1_micro}')
    print(f'Std F1 (Overall): {std_F1_micro}')
    print(f'Test F1 Score (Overall): {test_F1_score}')

    # Print confusion matrices
    print('Overall Confusion Matrix:')
    print(test_confusion_matrix)

    # Compute permutation feature importance
    # result = permutation_importance(final_model, test_data.drop(columns=dropcols + [outcomevar]), test_data[outcomevar], n_repeats=10, random_state=42, n_jobs=n_jobs)
    # for i in result.importances_mean.argsort()[::-1]:
    #     print(f"{test_data.columns[i]}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

    # Optionally, compute SHAP values (can be commented out if causing kernel restarts)
    # try:
    #     explainer = shap.KernelExplainer(final_model.predict_proba, train_data.drop(columns=dropcols + [outcomevar])[:100])
    #     shap_values = explainer.shap_values(test_data.drop(columns=dropcols + [outcomevar])[:50])
    #     shap.summary_plot(shap_values, test_data.drop(columns=dropcols + [outcomevar])[:50], feature_names=test_data.drop(columns=dropcols + [outcomevar]).columns)
    # except Exception as e:
    #     print(f"SHAP computation failed: {e}")
    
    print(best_params_overall)

    # Return results
    return mean_F1_micro, std_F1_micro, test_F1_score, test_confusion_matrix, best_params_overall, final_model

In [26]:
mean_f1, std_f1, test_f1, test_cm, best_params, final_model = SVMGroupKFoldCV(
    data = tabular, 
    train = train_data, 
    test = test_data, 
    unique_ids = unique_ids, 
    test_ids = test_ids, 
    idcolumn = 'Pcode',
    outcomevar = 'Stress_binary',
    dropcols=['Pcode', 'ResponseTime', 'Valence_binary', 'Arousal_binary'], n_splits=5, n_jobs=-1)

...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
Mean F1 (Overall): 0.5174945695474259
Std F1 (Overall): 0.010472406427535799
Test F1 Score (Overall): 0.5010227775320655
Overall Confusion Matrix:
[[382 171]
 [357 184]]
{'kernel': 'rbf', 'gamma': 'auto', 'C': 1}


In [27]:
mean_f1_valence, std_f1_valence, test_f1_valence, test_cm_valence, best_params_valence, final_model_valence = SVMGroupKFoldCV(
    data = tabular, 
    train = train_data, 
    test = test_data, 
    unique_ids = unique_ids, 
    test_ids = test_ids, 
    idcolumn = 'Pcode',
    outcomevar = 'Valence_binary',
    dropcols=['Pcode', 'ResponseTime', 'Stress_binary', 'Arousal_binary'], n_splits=5, n_jobs=-1)

...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
Mean F1 (Overall): 0.5395309664576053
Std F1 (Overall): 0.02629201132687472
Test F1 Score (Overall): 0.5173523733557334
Overall Confusion Matrix:
[[171 298]
 [202 423]]
{'kernel': 'rbf', 'gamma': 'scale', 'C': 1}


In [28]:
mean_f1_arousal, std_f1_arousal, test_f1_arousal, test_cm_arousal, best_params_arousal, final_model_arousal = SVMGroupKFoldCV(
    data = tabular, 
    train = train_data, 
    test = test_data, 
    unique_ids = unique_ids, 
    test_ids = test_ids, 
    idcolumn = 'Pcode',
    outcomevar = 'Arousal_binary',
    dropcols=['Pcode', 'ResponseTime', 'Stress_binary', 'Valence_binary'], n_splits=5, n_jobs=-1)

...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
...Fold processing complete.
Mean F1 (Overall): 0.5607464868416526
Std F1 (Overall): 0.027269662052613133
Test F1 Score (Overall): 0.574404026940993
Overall Confusion Matrix:
[[377 233]
 [227 257]]
{'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
