In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import GroupKFold

In [2]:
tabular= pd.read_csv('/Users/finnschonknecht/Desktop/XGB_train_folder/tabular.csv')
targets = pd.read_csv('/Users/finnschonknecht/Desktop/XGB_train_folder/binary_personalised_targets.csv')
personality_df = pd.read_csv('/Users/finnschonknecht/Desktop/XGB_train_folder/personality_df.csv')

In [3]:
tabular.fillna(0, inplace=True)

In [4]:
unique_ids = tabular['Pcode'].unique()
random.seed(150)   
test_ids = np.random.choice(unique_ids, 15, replace=False)

In [5]:
test_data= tabular[tabular['Pcode'].isin(test_ids)]
train_data= tabular[~tabular['Pcode'].isin(test_ids)]

In [6]:
def encode_dataframe(df):
    # Encode the Gender column
    if 'Gender' in df.columns:
        le = LabelEncoder()
        df['Gender'] = le.fit_transform(df['Gender'])

    # Convert object columns to category if needed
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype('category')
    
    return df

def scale_numeric_columns(df_train, df_test, exclude_columns):
    # Identify numeric columns
    numeric_cols = df_train.select_dtypes(include=['float64', 'int64']).columns

    # Exclude specified columns from numeric columns
    numeric_cols = [col for col in numeric_cols if col not in exclude_columns]

    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform the training data
    df_train_scaled = df_train.copy()
    df_train_scaled[numeric_cols] = scaler.fit_transform(df_train[numeric_cols])

    # Transform the test data using the same scaler
    df_test_scaled = df_test.copy()
    df_test_scaled[numeric_cols] = scaler.transform(df_test[numeric_cols])

    return df_train_scaled, df_test_scaled

In [7]:
train_data = encode_dataframe(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gender'] = le.fit_transform(df['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')


In [8]:
test_data = encode_dataframe(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gender'] = le.fit_transform(df['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')


In [9]:
train_data, test_data = scale_numeric_columns(
    train_data, test_data, ['Stress_binary', 'Valence_binary', 'Arousal_binary', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6'])

In [10]:
train_stress = train_data['Stress_binary']
test_stress = test_data['Stress_binary']

train_valence = train_data['Valence_binary']
test_valence = test_data['Valence_binary']

train_arousal = train_data['Arousal_binary']
test_arousal = test_data['Arousal_binary']

In [11]:
train_data = train_data.drop(columns=['Stress_binary', 'Arousal_binary', 'Valence_binary', 'ResponseTime'])

In [12]:
test_data = test_data.drop(columns=['Stress_binary', 'Arousal_binary', 'Valence_binary','ResponseTime'])

In [13]:
test_data = test_data.drop(columns=['Pcode'])

## Majority Classifier

In [14]:
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
f1_scores = []

for train_index, test_index in group_kfold.split(train_data, groups=train_data['Pcode']):
    X_train, X_test = train_data.iloc[train_index].drop(columns='Pcode'), train_data.iloc[test_index].drop(columns='Pcode')
    y_train, y_test = train_stress.iloc[train_index], train_stress.iloc[test_index]

    # Fit Majority Classifier
    majority_clf = DummyClassifier(strategy='most_frequent')
    majority_clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = majority_clf.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(macro_f1)

print(f'Mean Macro F1 Score: {np.mean(f1_scores)}')
print(f'STD of F1 scores during training: {np.std(f1_scores)}')

Mean Macro F1 Score: 0.34148037662086156
STD of F1 scores during training: 0.026093847665018734


In [15]:
y_pred_test = majority_clf.predict(test_data)
test_macro_f1 = f1_score(test_stress, y_pred_test, average='macro')
print("Test Macro-F1 Score:", test_macro_f1)

Test Macro-F1 Score: 0.3689676237018937


In [16]:
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
f1_scores = []

for train_index, test_index in group_kfold.split(train_data, groups=train_data['Pcode']):
    X_train, X_test = train_data.iloc[train_index].drop(columns='Pcode'), train_data.iloc[test_index].drop(columns='Pcode')
    y_train, y_test = train_valence.iloc[train_index], train_valence.iloc[test_index]

    # Fit Majority Classifier
    majority_clf = DummyClassifier(strategy='most_frequent')
    majority_clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = majority_clf.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(macro_f1)

print(f'Mean Macro F1 Score: {np.mean(f1_scores)}')
print(f'STD of F1 scores during training: {np.std(f1_scores)}')

Mean Macro F1 Score: 0.3588057237020674
STD of F1 scores during training: 0.01864346920460392


In [17]:
y_pred_test = majority_clf.predict(test_data)
test_macro_f1 = f1_score(test_valence, y_pred_test, average='macro')
print("Test Macro-F1 Score:", test_macro_f1)

Test Macro-F1 Score: 0.353566958698373


In [18]:
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
f1_scores = []

for train_index, test_index in group_kfold.split(train_data, groups=train_data['Pcode']):
    X_train, X_test = train_data.iloc[train_index].drop(columns='Pcode'), train_data.iloc[test_index].drop(columns='Pcode')
    y_train, y_test = train_arousal.iloc[train_index], train_arousal.iloc[test_index]

    # Fit Majority Classifier
    majority_clf = DummyClassifier(strategy='most_frequent')
    majority_clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = majority_clf.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(macro_f1)

print(f'Mean Macro F1 Score: {np.mean(f1_scores)}')
print(f'STD of F1 scores during training: {np.std(f1_scores)}')

Mean Macro F1 Score: 0.3417823685320419
STD of F1 scores during training: 0.01566923729665864


In [19]:
y_pred_test = majority_clf.predict(test_data)
test_macro_f1 = f1_score(test_arousal, y_pred_test, average='macro')
print("Test Macro-F1 Score:", test_macro_f1)

Test Macro-F1 Score: 0.3373957665169981


## Naive Bayes

In [20]:
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
f1_scores = []

for train_index, test_index in group_kfold.split(train_data, groups=train_data['Pcode']):
    X_train, X_test = train_data.iloc[train_index].drop(columns='Pcode'), train_data.iloc[test_index].drop(columns='Pcode')
    y_train, y_test = train_stress.iloc[train_index], train_stress.iloc[test_index]

    # Fit Naive Bayes
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = naive_bayes.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(macro_f1)

print(f'Mean Macro F1 Score: {np.mean(f1_scores)}')
print(f'STD of F1 scores during training: {np.std(f1_scores)}')

Mean Macro F1 Score: 0.47567272823973583
STD of F1 scores during training: 0.032898100798772756


In [21]:
y_pred_test = naive_bayes.predict(test_data)
test_macro_f1 = f1_score(test_stress, y_pred_test, average='macro')
print("Test Macro-F1 Score:", test_macro_f1)

Test Macro-F1 Score: 0.5128064368206706


In [22]:
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
f1_scores = []

for train_index, test_index in group_kfold.split(train_data, groups=train_data['Pcode']):
    X_train, X_test = train_data.iloc[train_index].drop(columns='Pcode'), train_data.iloc[test_index].drop(columns='Pcode')
    y_train, y_test = train_valence.iloc[train_index], train_valence.iloc[test_index]

    # Fit Naive Bayes
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = naive_bayes.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(macro_f1)

print(f'Mean Macro F1 Score: {np.mean(f1_scores)}')
print(f'STD of F1 scores during training: {np.std(f1_scores)}')

Mean Macro F1 Score: 0.51677637251573
STD of F1 scores during training: 0.021427159632991966


In [23]:
y_pred_test = naive_bayes.predict(test_data)
test_macro_f1 = f1_score(test_valence, y_pred_test, average='macro')
print("Test Macro-F1 Score:", test_macro_f1)

Test Macro-F1 Score: 0.5472013390477651


In [24]:
n_splits = 5
group_kfold = GroupKFold(n_splits=n_splits)
f1_scores = []

for train_index, test_index in group_kfold.split(train_data, groups=train_data['Pcode']):
    X_train, X_test = train_data.iloc[train_index].drop(columns='Pcode'), train_data.iloc[test_index].drop(columns='Pcode')
    y_train, y_test = train_arousal.iloc[train_index], train_arousal.iloc[test_index]

    # Fit Naive Bayes
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = naive_bayes.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(macro_f1)

print(f'Mean Macro F1 Score: {np.mean(f1_scores)}')
print(f'STD of F1 scores during training: {np.std(f1_scores)}')

Mean Macro F1 Score: 0.4735434580165103
STD of F1 scores during training: 0.04801902665911414


In [25]:
y_pred_test = naive_bayes.predict(test_data)
test_macro_f1 = f1_score(test_arousal, y_pred_test, average='macro')
print("Test Macro-F1 Score:", test_macro_f1)

Test Macro-F1 Score: 0.5341861235942065
