In [16]:
import numpy as np
import pandas
from sklearn.model_selection import train_test_split

eeg_data = pandas.read_csv('./results.csv')
print(eeg_data.columns)
print(eeg_data.dtypes)

Index(['subject_id', 'in_cohort1', 'in_cohort2', 'cohort', 'sampling_seq',
       'sleep_stage', 'eeg_0', 'eeg_1', 'eeg_2', 'eeg_3',
       ...
       'eeg_116', 'eeg_117', 'eeg_118', 'eeg_119', 'eeg_120', 'eeg_121',
       'eeg_122', 'eeg_123', 'eeg_124', 'eeg_signal'],
      dtype='object', length=132)
subject_id        int64
in_cohort1         bool
in_cohort2         bool
cohort            int64
sampling_seq      int64
sleep_stage       int64
eeg_0           float64
eeg_1           float64
eeg_2           float64
eeg_3           float64
eeg_4           float64
eeg_5           float64
eeg_6           float64
eeg_7           float64
eeg_8           float64
eeg_9           float64
eeg_10          float64
eeg_11          float64
eeg_12          float64
eeg_13          float64
eeg_14          float64
eeg_15          float64
eeg_16          float64
eeg_17          float64
eeg_18          float64
eeg_19          float64
eeg_20          float64
eeg_21          float64
eeg_22          float6

In [17]:
person_1_df = eeg_data[
    (eeg_data['subject_id'] == eeg_data.iloc[0]['subject_id']) & 
    eeg_data['eeg_signal'] == 1
].sort_values(['sampling_seq'])
feature_p1_df = person_1_df.drop(
    ['subject_id', 'in_cohort1', 'in_cohort2', 'cohort', 'eeg_signal'], 
    axis=1
).drop(
    person_1_df.head(900).index
).drop(
    person_1_df.tail(900).index
)
feature_p1_df.columns

Index(['sampling_seq', 'sleep_stage', 'eeg_0', 'eeg_1', 'eeg_2', 'eeg_3',
       'eeg_4', 'eeg_5', 'eeg_6', 'eeg_7',
       ...
       'eeg_115', 'eeg_116', 'eeg_117', 'eeg_118', 'eeg_119', 'eeg_120',
       'eeg_121', 'eeg_122', 'eeg_123', 'eeg_124'],
      dtype='object', length=127)

In [18]:
print(feature_p1_df.shape)
print(feature_p1_df.head(1)['sampling_seq'])
print(feature_p1_df.tail(1)['sampling_seq'])

(38720, 127)
1800    1800
Name: sampling_seq, dtype: int64
79238    79238
Name: sampling_seq, dtype: int64


In [19]:
y = feature_p1_df['sleep_stage']
x = feature_p1_df[
    ["eeg_{0}".format(i) for i in range(125)]
]

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

a_train, a_test, b_train, b_test = train_test_split(x, y, test_size=0.33)
model = RandomForestClassifier(64)
model.fit(a_train, b_train)
    
print(accuracy_score(b_test, model.predict(a_test)))

0.646110502426


In [21]:
grouped = eeg_data.groupby(by=["subject_id", "cohort"]).apply(
    lambda df: df.drop(
        df.head(900).index
    ).drop(
        df.tail(900).index
    )
)

In [22]:
y_grouped = grouped['sleep_stage']
x_grouped = grouped[
    ["eeg_{0}".format(i) for i in range(125)]
]
a_train, a_test, b_train, b_test = train_test_split(x_grouped, y_grouped, test_size=0.33)
print(a_train.shape)
print(a_test.shape)

(972612, 125)
(479048, 125)


In [24]:
model = RandomForestClassifier(16)
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupShuffleSplit

for train_index, test_index in GroupShuffleSplit(
    n_splits=10, test_size=0.4
).split(x_grouped, y_grouped, grouped["subject_id"]):
    model.fit(
        x_grouped.iloc[train_index],
        y_grouped.iloc[train_index]
    )
    print(accuracy_score(
        y_grouped.iloc[test_index], 
        model.predict(x_grouped.iloc[test_index])
    ))

0.552121844206
0.608365960201


KeyboardInterrupt: 

In [54]:
groups = [
    (x[["eeg_{0}".format(i) for i in range(125)]], 
     x['sleep_stage'])
    for _, x in grouped.groupby(['subject_id'], as_index=False)
]


accuracy_scores = {}
for i, (x_train, y_train) in enumerate(groups):
    model.fit(x_train, y_train)
    print('trained on subject {0}'.format(i))
    for j, (x_test, y_test) in enumerate(groups):
        i_accuracy_score = accuracy_scores.get(i, {})
        i_accuracy_score[j] = accuracy_score(
            y_test, model.predict(x_test)
        )
        i_accuracy_score[j] = np.mean(
            1 - (np.absolute(y_test - model.predict(x_test)) * 0.2)
        )
        accuracy_scores[i] = i_accuracy_score

Defaulting to column, but this will raise an ambiguity error in a future version


trained on subject 0
trained on subject 1
trained on subject 2
trained on subject 3
trained on subject 4
trained on subject 5
trained on subject 6
trained on subject 7
trained on subject 8
trained on subject 9
trained on subject 10
trained on subject 11
trained on subject 12
trained on subject 13
trained on subject 14
trained on subject 15
trained on subject 16
trained on subject 17
trained on subject 18
trained on subject 19


In [55]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

accuracy_scores = np.array([
    [
        accuracy_scores[group_num][sub_group_num]
        for sub_group_num in range(20)
    ]
    for group_num in range(20)
])


fig, ax = plt.subplots()
fig.set_figheight(20)
fig.set_figwidth(20)
im = ax.imshow(accuracy_scores)

ax.set_xticks(np.arange(20))
ax.set_yticks(np.arange(20))

subject_ids = ["subject_{0}".format(i) for i in range(20)]
ax.set_xticklabels(subject_ids)
ax.set_yticklabels(subject_ids)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Loop over data dimensions and create text annotations.
for i in range(20):
    for j in range(20):
        text = ax.text(j, i, round(accuracy_scores[i, j], 3),
                       ha="center", va="center", 
                       color='black' if accuracy_scores[i, j] > 0.8 else 'white')

ax.set_title("Accuracy scores")
fig.tight_layout()
plt.savefig(
    "accuracy_scores.png",
    bbox_inches='tight',
    pad_inches=0.25
)