In [131]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedKFold
from sklearn.utils import resample

In [132]:
# Load the processed experiment table; the first CSV column becomes the row index.
exp = pd.read_csv('../data/processed/exp.csv', index_col=0)

# Columns holding targets or the subject identifier rather than model inputs.
label_cols = ['la_p','ha_p','ha_n','la_n','la','ha','p','n','max_p','max_n','valence_p','valence_n','valence_reg','valence', 'subject']
# Every remaining column is a feature. Filtering the columns in their original
# order (instead of taking a set difference) keeps the feature order -- and so
# the column order of every exported CSV -- identical from one kernel to the next;
# iteration order of a set of strings changes with Python's hash randomisation.
label_col_set = set(label_cols)
feature_cols = [col for col in exp.columns if col not in label_col_set]

## 1.Split data by group

In [133]:
# Randomize Subjects with weighted probability:
# each subject is drawn with probability equal to its share of the rows in exp.
sub_id_prob = exp['subject'].value_counts()/exp.shape[0]
sub_id = sub_id_prob.index.values
# Draw WITHOUT replacement: with the default replace=True the same subject can be
# picked more than once, silently holding out fewer than 15 distinct subjects.
# A dedicated, seeded generator makes the hold-out identical on every
# Restart & Run All without touching numpy's global random state.
val_rng = np.random.RandomState(42)
val_subjects = val_rng.choice(sub_id, 15, replace=False, p=sub_id_prob.values)

# Split Validation Set: every row belonging to one of the drawn subjects
val_data = exp[exp['subject'].isin(val_subjects)]
X_val = val_data[feature_cols]
y_val = val_data[['max_p','max_n','valence_p','valence_n','valence_reg','valence', 'subject']]

In [134]:
# Rows of subjects that were not drawn for validation form the train/test pool
is_val_row = exp['subject'].isin(val_subjects)
exp_train_test = exp[~is_val_row]

# Features, labels and the per-row subject id used as the grouping key
X = exp_train_test[feature_cols]
y = exp_train_test[['max_p','max_n','valence_p','valence_n','valence_reg','valence', 'subject']]
group = exp_train_test['subject'].values

# Group-wise 5-fold partition; only the first fold is used as the train/test split
gkf_modeling = list(GroupKFold(n_splits=5).split(X,y,group))
train_rows, test_rows = gkf_modeling[0]
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [135]:
# Export the group-wise splits to CSV (the row index is not written)
group_exports = [
    (X_val, '../data/processed/new_user/X_val_group.csv'),
    (y_val, '../data/processed/new_user/y_val_group.csv'),
    (X_test, '../data/processed/new_user/X_test_group.csv'),
    (y_test, '../data/processed/new_user/y_test_group.csv'),
    (X_train, '../data/processed/new_user/X_train_group.csv'),
    (y_train, '../data/processed/new_user/y_train_group.csv'),
]
for split_frame, csv_path in group_exports:
    split_frame.to_csv(csv_path, index=False)

In [136]:
# (rows, columns) of the validation, test and training feature matrices, in that order
tuple(split.shape for split in (X_val, X_test, X_train))

((341, 170), (309, 170), (1223, 170))

## 2.Stratified Split

In [137]:
# Split features and labels
X_stratified = exp[feature_cols]
y_stratified = exp[['max_p','max_n','valence_p','valence_n','valence_reg','valence','subject']]
# ID for stratification: each subject keeps the same share of rows in every split
group_stratified = exp['subject']
# Split validation set (20% of all rows); random_state makes the split repeatable
X_stratified_train_test, X_stratified_val, y_stratified_train_test, y_stratified_val = train_test_split(
    X_stratified, y_stratified, test_size=0.2, stratify=group_stratified, random_state=42)
# Split training/test set
# Read the subject ids from the label frame that train_test_split just returned:
# it is aligned row-for-row with X_stratified_train_test. Looking the rows up with
# exp.iloc[<index labels>] treats labels as positions, which is only correct when
# the index of exp happens to be exactly 0..n-1.
group_stratified_2 = y_stratified_train_test['subject']
# 25% of the remaining 80% -> test is 20% of all rows, training 60%
X_stratified_train, X_stratified_test, y_stratified_train, y_stratified_test = train_test_split(
    X_stratified_train_test, y_stratified_train_test, test_size=0.25, stratify=group_stratified_2, random_state=42)

In [138]:
# Export the stratified splits to CSV (the row index is not written)
stratified_exports = [
    (X_stratified_val, '../data/processed/cur_user/X_val_stratify.csv'),
    (y_stratified_val, '../data/processed/cur_user/y_val_stratify.csv'),
    (X_stratified_test, '../data/processed/cur_user/X_test_stratify.csv'),
    (y_stratified_test, '../data/processed/cur_user/y_test_stratify.csv'),
    (X_stratified_train, '../data/processed/cur_user/X_train_stratify.csv'),
    (y_stratified_train, '../data/processed/cur_user/y_train_stratify.csv'),
]
for split_frame, csv_path in stratified_exports:
    split_frame.to_csv(csv_path, index=False)

## 3.Upsample Minority Class

In [139]:
def upsample_data(X, y):
    """Oversample the ``valence == 1`` rows to match the ``valence == 0`` count.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.DataFrame
        Label columns; must contain ``max_p``, ``max_n``, ``valence_p``,
        ``valence_n``, ``valence_reg``, ``valence`` and ``subject``.

    Returns
    -------
    (X_upsample, y_upsample) : tuple of pandas.DataFrame
        All ``valence == 0`` rows followed by ``valence == 1`` rows drawn with
        replacement (fixed seed 27) until both groups have the same size.
        Rows whose ``valence`` is neither 0 nor 1 are not included.
    """
    target_cols = ['max_p','max_n','valence_p','valence_n','valence_reg','valence', 'subject']

    # Join features and labels side by side so both are resampled together
    combined = pd.concat([X, y], axis=1)
    valence = combined['valence']
    majority_rows = combined[valence == 0]   # "happy"
    minority_rows = combined[valence == 1]   # "unhappy"

    # Draw minority rows with replacement until they match the majority count
    minority_resampled = resample(
        minority_rows,
        replace=True,
        n_samples=len(majority_rows),
        random_state=27,
    )

    # Majority rows first, then the resampled minority rows
    balanced = pd.concat([majority_rows, minority_resampled])

    # Separate features from labels again
    return balanced.drop(target_cols, axis=1), balanced[target_cols]

In [140]:
# Upsample only the training portion of the group-wise and the stratified split
X_up_train, y_up_train = upsample_data(X=X_train, y=y_train)
X_stratified_up_train, y_stratified_up_train = upsample_data(X=X_stratified_train, y=y_stratified_train)

In [141]:
# Export the upsampled training sets to CSV (the row index is not written)
upsampled_exports = [
    (X_up_train, '../data/processed/new_user/X_train_group_up.csv'),
    (y_up_train, '../data/processed/new_user/y_train_group_up.csv'),
    (X_stratified_up_train, '../data/processed/cur_user/X_train_stratify_up.csv'),
    (y_stratified_up_train, '../data/processed/cur_user/y_train_stratify_up.csv'),
]
for split_frame, csv_path in upsampled_exports:
    split_frame.to_csv(csv_path, index=False)