# EEG Eye State

In [16]:
import os
import random
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from ucimlrepo import fetch_ucirepo 


## Paths and variables

In [17]:
# Column names used throughout the notebook: the series identifier, the
# time index, and the label column read from the dataset's targets.
id_col, time_col, target_col = 'subject_id', 'time', 'eyeDetection'

In [18]:
# Human-readable dataset title and the slug used for directory/file names.
dataset_handle = 'EEG Eye State'
dataset_name = 'eeg_eye_state'

# Everything written by this notebook goes to <processed_dir><dataset_name>/.
processed_dir = './../../processed/'
output_dir = f'{processed_dir}{dataset_name}/'
os.makedirs(output_dir, exist_ok=True)

# One CSV per artefact: full data, test labels (key), train split, test split.
full_outp_fname, test_key_outp_fname, train_outp_fname, test_outp_fname = (
    os.path.join(output_dir, f'{dataset_name}{suffix}.csv')
    for suffix in ('', '_test_key', '_train', '_test')
)

## Read the data

In [19]:
# Fetch the dataset with id 264 through ucimlrepo.
eeg_eye_state = fetch_ucirepo(id=264)

# Features and targets come back as separate pandas dataframes.
X = eeg_eye_state.data.features
y = eeg_eye_state.data.targets

# Combine into one frame: a copy of the features with the label column
# appended as the last column (X itself is left untouched).
data = X.assign(**{target_col: y[target_col]})
data.head()

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,eyeDetection
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0


In [20]:
# NOTE: this whole cell is disabled (every line below is commented out).
# It sketches an optional down-sampling step: within each `id_col` group,
# consecutive chunks of `steps` rows are collapsed into a single row (mean for
# numeric columns, `mode` for the others) and `time_col` is renumbered.
# NOTE(review): before re-enabling, confirm that `mode` is imported (it is not
# among the imports at the top of this notebook) and that `id_col` / `time_col`
# already exist in `data` -- they are only inserted by a later cell.

# data[target_col] = data[target_col].astype(str)
# data[time_col] = data[time_col].astype(str)


# def quantize_df(data: pd.DataFrame , id_col:str , steps: int):

#     def aggregate_group(group, steps=steps):

#         numeric_cols = group.select_dtypes(include=[np.number]).columns
#         object_cols = group.select_dtypes(exclude=[np.number]).columns
        
#         aggregation_functions = {col: 'mean' for col in numeric_cols}
#         aggregation_functions.update({col: lambda x: mode(x) for col in object_cols})
        
#         grouped = group.groupby(np.arange(len(group)) // steps).agg(aggregation_functions)
#         grouped[time_col] = range(len(grouped))
#         return grouped

#     dfs = []
#     cols = list(data.columns.values)

#     data_grouped = data.groupby(id_col)
#     for name, group in data_grouped:
#         aggregated_group = aggregate_group(group)
#         dfs.append(aggregated_group)

#     data_quantized = pd.concat(dfs).reset_index(drop=True)
#     data_quantized = data_quantized[cols] #to keep the columns order
#     return data_quantized


# data = quantize_df(data, id_col, steps=100)

## Insert id and time columns

In [21]:
# Every row gets the same id (0) and a running integer time index
# 0 .. len(data) - 1, placed as the first two columns.
#
# DataFrame.insert raises ValueError when the column already exists, so
# running this cell a second time used to fail. Dropping any previous copies
# first makes the cell safe to re-run; on a fresh kernel nothing is dropped
# and the resulting frame is the same as before.
data = data.drop(columns=[id_col, time_col], errors='ignore')
data.insert(0, id_col, 0)
data.insert(1, time_col, range(len(data)))

## Train/Test split

In [25]:
# Seed the stdlib RNG. Nothing in this cell draws random numbers: the split
# below is purely positional (first rows -> train, last rows -> test).
random.seed(42)
test_size = 0.2

# Size of each partition; the test share is truncated to a whole row count.
test_length = int(test_size * len(data))
train_length = len(data) - test_length

# Leading rows form the training set, the remaining rows the test set.
train_df, test_df = data.iloc[:train_length], data.iloc[train_length:]

# Keep the test labels (with id and time) in a separate key, then remove
# the label column from the test frame itself.
test_key = test_df.loc[:, [id_col, time_col, target_col]]
test_df = test_df.drop(columns=target_col)

## Save data

In [26]:
# Write each frame to its CSV file without the pandas index column.
# Order: full data, train split, test split (no labels), test labels (key).
for frame, destination in (
    (data, full_outp_fname),
    (train_df, train_outp_fname),
    (test_df, test_outp_fname),
    (test_key, test_key_outp_fname),
):
    frame.to_csv(destination, index=False)