# import statements

In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

# load data

In [3]:
eeg_data = arff.loadarff("/Users/sjw19206/7135CEM/eye-state-classification-dataset/EEG_Eye_State.arff")

In [4]:
eeg_df = pd.DataFrame(eeg_data[0])

In [5]:
# rename column
eeg_df = eeg_df.rename(columns={"eyeDetection":"EYEST"})

In [6]:
eeg_df.head(5)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,EYEST
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,b'0'
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,b'0'
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,b'0'
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,b'0'
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,b'0'


In [7]:
eeg_df.isnull().values.any()

False

In [8]:
# if scaler is not using
eeg_df_iv = eeg_df.drop('EYEST', axis = 1)
eeg_df_tv = eeg_df['EYEST'].astype(int)
eeg_df_scaled = pd.concat([eeg_df_iv, eeg_df_tv], axis = 1)

# scale using standardization

In [None]:
std_scaler = StandardScaler()
eeg_df_iv = eeg_df.drop('EYEST', axis = 1)
eeg_df_tv = eeg_df['EYEST'].astype(int)
eeg_df_scaled = std_scaler.fit_transform(eeg_df_iv.to_numpy())
eeg_df_scaled = pd.DataFrame(eeg_df_scaled, columns=['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4'])
eeg_df_scaled = pd.concat([eeg_df_scaled, eeg_df_tv], axis = 1)

In [9]:
eeg_df_scaled.head(5)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,EYEST
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0


In [10]:
eeg_df_scaled.query("EYEST == 0").count()

AF3      8257
F7       8257
F3       8257
FC5      8257
T7       8257
P7       8257
O1       8257
O2       8257
P8       8257
T8       8257
FC6      8257
F4       8257
F8       8257
AF4      8257
EYEST    8257
dtype: int64

In [11]:
eeg_df_scaled.query("EYEST == 1").count()

AF3      6723
F7       6723
F3       6723
FC5      6723
T7       6723
P7       6723
O1       6723
O2       6723
P8       6723
T8       6723
FC6      6723
F4       6723
F8       6723
AF4      6723
EYEST    6723
dtype: int64

In [12]:
eeg_df_eye_zero = eeg_df_scaled.query("EYEST == 0")

In [13]:
eeg_df_eye_zero.head(5)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,EYEST
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0


In [14]:
eeg_df_eye_one = eeg_df_scaled.query("EYEST == 1")

In [15]:
eeg_df_eye_one.head(5)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,EYEST
188,4445.13,4017.95,4292.82,4121.54,4325.13,4591.79,4077.44,4628.72,4229.23,4273.33,4316.92,4350.77,4810.26,4552.82,1
189,4438.46,4005.64,4287.69,4120.51,4321.03,4589.74,4070.77,4628.21,4224.1,4272.82,4319.49,4356.41,4811.28,4546.67,1
190,4432.31,3998.46,4281.54,4115.38,4314.87,4583.59,4063.59,4621.03,4204.1,4255.38,4304.1,4351.28,4795.9,4533.85,1
191,4427.18,3990.26,4281.03,4107.18,4310.77,4574.87,4061.03,4608.21,4188.21,4238.97,4297.44,4343.08,4783.59,4530.77,1
192,4426.67,3987.69,4285.13,4108.21,4314.36,4575.9,4063.59,4594.87,4181.54,4235.38,4301.54,4345.13,4784.1,4533.33,1


In [16]:
eeg_df_eye_one.count()

AF3      6723
F7       6723
F3       6723
FC5      6723
T7       6723
P7       6723
O1       6723
O2       6723
P8       6723
T8       6723
FC6      6723
F4       6723
F8       6723
AF4      6723
EYEST    6723
dtype: int64

In [17]:
eeg_df_eye_zero.count()

AF3      8257
F7       8257
F3       8257
FC5      8257
T7       8257
P7       8257
O1       8257
O2       8257
P8       8257
T8       8257
FC6      8257
F4       8257
F8       8257
AF4      8257
EYEST    8257
dtype: int64

In [18]:
eeg_df_train = eeg_df_eye_one.iloc[:4000]
eeg_df_test = eeg_df_eye_one.iloc[4000:]

In [19]:
eeg_df_train_tmp = eeg_df_eye_zero.iloc[0:4000]
eeg_df_test_tmp = eeg_df_eye_zero.iloc[4000:]

In [20]:
eeg_df_train = pd.concat([eeg_df_train, eeg_df_train_tmp])
eeg_df_test = pd.concat([eeg_df_test, eeg_df_test_tmp])

In [21]:
eeg_df_train = eeg_df_train.sample(frac=1)
eeg_df_test = eeg_df_test.sample(frac=1)

In [22]:
eeg_df_train.count()

AF3      8000
F7       8000
F3       8000
FC5      8000
T7       8000
P7       8000
O1       8000
O2       8000
P8       8000
T8       8000
FC6      8000
F4       8000
F8       8000
AF4      8000
EYEST    8000
dtype: int64

In [23]:
eeg_df_train.head(10)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,EYEST
1900,4297.95,4013.33,4270.26,4120.0,4333.33,4616.92,4075.38,4608.21,4195.9,4226.15,4189.74,4283.08,4604.62,4360.51,0
4870,4292.31,4002.05,4264.62,4107.69,4343.59,4623.08,4082.56,4634.87,4215.38,4250.26,4216.92,4287.69,4607.18,4361.54,0
1707,4272.31,3995.38,4250.77,4118.46,4334.36,4620.0,4084.1,4615.9,4213.85,4229.23,4195.9,4276.92,4590.26,4335.38,0
7129,4261.54,3969.23,4244.1,4105.13,4320.51,4597.44,4049.74,4626.15,4205.13,4211.79,4196.41,4273.85,4584.1,4342.05,1
5178,4313.85,4003.08,4262.56,4110.26,4335.38,4621.03,4075.38,4607.69,4200.0,4254.36,4206.15,4285.64,4609.74,4367.69,0
9407,4299.49,3987.69,4254.87,4125.13,4338.97,4612.82,4051.79,4604.62,4195.9,4227.18,4197.44,4267.69,4599.49,4357.44,0
857,4264.62,4024.1,4248.72,4126.15,4335.38,4621.03,4098.97,4601.03,4188.21,4187.18,4150.26,4255.9,4520.51,4298.97,1
9453,4300.0,3990.26,4262.56,4108.21,4343.59,4624.1,4065.13,4614.36,4212.31,4247.69,4213.33,4272.82,4609.23,4363.08,0
2980,4281.03,3976.92,4253.33,4107.18,4316.41,4614.87,4081.03,4617.95,4205.13,4239.49,4197.44,4280.0,4589.74,4353.33,0
5583,4289.23,4001.54,4270.77,4128.21,4338.46,4616.92,4097.44,4632.31,4209.74,4243.08,4206.67,4281.03,4613.33,4360.51,1


In [24]:
eeg_df_test.count()

AF3      6980
F7       6980
F3       6980
FC5      6980
T7       6980
P7       6980
O1       6980
O2       6980
P8       6980
T8       6980
FC6      6980
F4       6980
F8       6980
AF4      6980
EYEST    6980
dtype: int64

In [25]:
eeg_df_test.head(10)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,EYEST
7994,4284.62,3986.67,4260.51,4102.56,4338.97,4616.41,4068.72,4610.77,4193.85,4226.67,4166.15,4269.74,4593.33,4350.26,1
14837,4275.38,4026.15,4244.1,4120.51,4334.87,4606.15,4053.85,4615.38,4207.18,4225.64,4203.59,4267.18,4607.18,4339.49,0
7965,4286.15,3992.31,4251.28,4100.0,4335.38,4604.62,4047.18,4611.28,4194.87,4225.64,4195.38,4269.74,4584.62,4347.18,1
10605,4306.67,4006.67,4281.54,4143.08,4361.54,4644.62,4070.77,4635.38,4220.51,4256.41,4221.54,4290.26,4621.03,4369.74,0
13099,4261.03,3986.67,4249.74,4107.18,4328.72,4600.51,4052.31,4608.21,4193.85,4210.26,4172.31,4255.9,4570.77,4322.05,0
11731,4390.26,4075.9,4342.56,4188.21,4425.13,4699.49,4153.85,4680.0,4270.26,4304.1,4291.79,4353.85,4698.46,4464.62,1
11299,4304.1,4009.23,4271.28,4116.92,4354.36,4629.74,4110.26,4627.18,4203.59,4242.56,4219.49,4279.49,4617.44,4375.9,1
8990,4284.62,4014.87,4260.51,4123.08,4342.05,4614.87,4064.62,4613.85,4191.79,4223.59,4194.87,4269.23,4585.13,4358.97,1
14644,4264.1,4028.21,4236.41,4108.21,4330.26,4601.03,4057.44,4588.72,4169.23,4185.64,4164.62,4243.08,4552.31,4310.26,0
11201,4291.79,3975.9,4267.69,4106.15,4355.9,4631.28,4083.08,4643.08,4223.59,4252.31,4222.56,4287.69,4630.77,4379.49,1


In [26]:
eeg_df_test.query("EYEST == 1").count()

AF3      2723
F7       2723
F3       2723
FC5      2723
T7       2723
P7       2723
O1       2723
O2       2723
P8       2723
T8       2723
FC6      2723
F4       2723
F8       2723
AF4      2723
EYEST    2723
dtype: int64

In [27]:
df_x_train = eeg_df_train[['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4']]
df_y_train = eeg_df_train[['EYEST']]

In [28]:
np_x_train = df_x_train.values

In [29]:
np_y_train = df_y_train.values.ravel()

In [30]:
np_x_train.shape

(8000, 14)

In [31]:
np_y_train.shape

(8000,)

In [32]:
df_x_test = eeg_df_test[['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4']]
df_y_test = eeg_df_test[['EYEST']]

In [33]:
np_x_test = df_x_test.values

In [34]:
np_y_test = df_y_test.values.ravel()

In [49]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
np_x_test_pca = pca.fit_transform(np_x_test)
np_x_train_pca = pca.fit_transform(np_x_train)

In [46]:
np_x_train_pca.shape

(8000, 5)

In [56]:
# model
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB

# clf = GaussianNB()
clf = BernoulliNB()
# clf = ComplementNB()
# clf = MultinomialNB()
clf.fit(np_x_train_pca, np_y_train)

In [57]:
prediction = clf.predict(np_x_test_pca)

In [58]:
from sklearn.metrics import accuracy_score
accuracy_score(np_y_test, prediction)

0.5789398280802293