In [75]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np


def _clean_column_names(names, first_band, tail_widths):
    """Return shortened copies of the raw spreadsheet column names.

    Parameters
    ----------
    names : sequence of str
        Raw column names as read from the spreadsheet.
    first_band : int
        Position of the first frequency-band column.
    tail_widths : tuple of (int, int)
        Number of leading characters to keep for the second-to-last and the
        last column name, in that order.

    Returns
    -------
    list of str
        Every name from ``first_band`` up to (not including) the last two is
        cut at its first comma and loses its first 7 characters; the first
        five of those are additionally truncated to 5 characters. The last two
        names are truncated to ``tail_widths``. Other names are left untouched.
    """
    cleaned = list(names)
    # Keep the text before the first comma and strip the 7-character prefix,
    # which leaves the frequency range (e.g. '0-0.5').
    for i in range(first_band, len(cleaned) - 2):
        cleaned[i] = cleaned[i].split(',')[0][7:]
    # The first five band names still carry a trailing unit after the step
    # above; keeping 5 characters removes it.
    for i in range(first_band, first_band + 5):
        cleaned[i] = cleaned[i][0:5]
    cleaned[-2] = cleaned[-2][:tail_widths[0]]
    cleaned[-1] = cleaned[-1][:tail_widths[1]]
    return cleaned


def _drop_units_and_cast(df):
    """Drop the units row, replace NaN with 0 and cast non-'Class' columns to float."""
    df = df.drop([0])  # first row contains units [muV^2], not data
    df = df.fillna(0)
    for col in df.loc[:, df.columns != 'Class']:
        df[col] = df[col].astype(float)
    return df


def preprocess(filename):
    """Clean one raw recording spreadsheet and save it as CSV.

    Reads ``data/test/<filename>.xls``, shortens the column names, converts the
    label column (if any) to integer category codes, drops the units row, fills
    NaN with 0 and casts every non-label column to float. The result is written
    to ``data/preprocessed/<filename>_preprocessed.csv`` and returned.

    Three spreadsheet layouts are recognised by the name of the first column:

    * ``"Rodent Sleep"``: labels in column 0, bands start at column 1.
    * ``"10 second Epochs"``: the first column is discarded; in the remaining
      columns the bands start at position 2.
    * anything else: no label column, bands start at column 0.

    Parameters
    ----------
    filename : str
        File name without directory or extension.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame (row index starts at 1 because the units row
        with index 0 was dropped).
    """
    file = "data/test/" + filename + ".xls"
    df = pd.read_excel(file)
    cols = df.columns
    layout = cols[0]

    if layout == "Rodent Sleep":
        df.columns = _clean_column_names(cols, 1, (5, 8))
        labelled = True
    elif layout == "10 second Epochs":
        del df[df.columns[0]]  # discard the epoch column
        # NOTE(review): the tail widths are swapped here compared with the two
        # other layouts (8 then 5 instead of 5 then 8). Kept as in the original;
        # confirm against the real column order of this layout.
        df.columns = _clean_column_names(cols[1:], 2, (8, 5))
        labelled = True
    else:
        df.columns = _clean_column_names(cols, 0, (5, 8))
        labelled = False

    if labelled:
        df = df.rename(columns={"Rodent Sleep": "Class"})
        # Encode labels as integer codes before the units row is dropped, so
        # class proportions can be analysed downstream; a missing label
        # (e.g. in the units row) becomes -1.
        df["Class"] = pd.Categorical(df["Class"]).codes

    df = _drop_units_and_cast(df)

    # os.makedirs is portable and does not fail when the directory exists.
    os.makedirs("data/preprocessed", exist_ok=True)
    df.to_csv("data/preprocessed/" + filename + "_preprocessed.csv", index=False)
    return df
def window(target_filename):
    """Build overlapping 5-row windows from a preprocessed recording.

    Reads ``data/preprocessed/<target_filename>_preprocessed.csv``. Each run of
    five consecutive rows becomes one output row: the feature values of the
    five rows flattened row by row, preceded by the most frequent ``Class``
    value in the window (the smallest class wins a tie). The result is written
    to ``data/windowed/<target_filename>_windowed.csv`` and returned.

    Parameters
    ----------
    target_filename : str
        File name without directory, suffix or extension.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``'Class', 1, 2, ...`` and one row per window, or
        ``None`` (nothing written) when the first column of the input is not
        ``'Class'``. An input with fewer than five rows yields an empty frame.
    """
    df = pd.read_csv("data/preprocessed/" + target_filename + "_preprocessed.csv")
    if df.columns[0] != "Class":
        return None  # unlabelled recording: nothing to window

    width = 5  # number of consecutive rows per window
    labels = df["Class"].to_numpy()
    features = df.drop(columns="Class").to_numpy()

    # Collect the rows in a list and build the frame once; concatenating a
    # DataFrame inside the loop is quadratic in the number of windows.
    rows = []
    for start in tqdm(range(len(df) - (width - 1))):
        stop = start + width
        majority = np.argmax(np.bincount(labels[start:stop]))
        rows.append(np.insert(features[start:stop].flatten(), 0, majority))

    if rows:
        data = np.vstack(rows)
    else:
        # Fewer rows than one window: keep the expected column layout.
        data = np.empty((0, 1 + width * features.shape[1]))

    df_win = pd.DataFrame(data).rename(columns={0: "Class"})
    df_win["Class"] = df_win["Class"].astype(int)

    # os.makedirs is portable and does not fail when the directory exists.
    os.makedirs("data/windowed", exist_ok=True)
    df_win.to_csv("data/windowed/" + target_filename + "_windowed.csv", index=False)
    return df_win

In [76]:
## Preprocess
# Run preprocess() on the raw files named "0" through "21".
for i in range(0, 22):
    preprocess(f"{i}")

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [58]:
## Windowing
# Run window() on the preprocessed files named "0" through "21".
for i in range(0, 22):
    window(f"{i}")

Index(['0-0.5', '0.5-1', '1-1.5', '1.5-2', '2-2.5', '2.5-3', '3-3.5', '3.5-4',
       '4-4.5', '4.5-5', '5-5.5', '5.5-6', '6-6.5', '6.5-7', '7-7.5', '7.5-8',
       '8-8.5', '8.5-9', '9-9.5', '9.5-10', '10-10.5', '10.5-11', '11-11.5',
       '11.5-12', '12-12.5', '12.5-13', '13-13.5', '13.5-14', '14-14.5',
       '14.5-15', '15-15.5', '15.5-16', '16-16.5', '16.5-17', '17-17.5',
       '17.5-18', '18-18.5', '18.5-19', '19-19.5', '19.5-20', 'EEG 2',
       'Activity'],
      dtype='object')
Index(['0-0.5', '0.5-1', '1-1.5', '1.5-2', '2-2.5', '2.5-3', '3-3.5', '3.5-4',
       '4-4.5', '4.5-5', '5-5.5', '5.5-6', '6-6.5', '6.5-7', '7-7.5', '7.5-8',
       '8-8.5', '8.5-9', '9-9.5', '9.5-10', '10-10.5', '10.5-11', '11-11.5',
       '11.5-12', '12-12.5', '12.5-13', '13-13.5', '13.5-14', '14-14.5',
       '14.5-15', '15-15.5', '15.5-16', '16-16.5', '16.5-17', '17-17.5',
       '17.5-18', '18-18.5', '18.5-19', '19-19.5', '19.5-20', 'EEG 2',
       'Activity'],
      dtype='object')
Index(['Rode

In [172]:
# Load the windowed data first so the class counts below describe this file
# and not whatever `df` an earlier cell happened to leave in the kernel.
# NOTE(review): window() writes to data/windowed/, but this cell reads from
# data/ -- confirm where the windowed files are expected to live.
df = pd.read_csv("data/"+target_filename+"_windowed.csv")
p,s,w = class_count(df)

## Balancing
# TODO : balancing algorithm
ps = df.loc[df["Class"]==0]
ss = df.loc[df["Class"]==1]
ws = df.loc[df["Class"]==2]
# Oversample class 0 by appending int(w/p) extra copies of its rows in a
# single concat; skip oversampling when class 0 is absent (p == 0) instead of
# dividing by zero.
n_copies = int(w/p) if p else 0
df = pd.concat([df] + [ps]*n_copies)
p,s,w = class_count(df)

df.to_csv("data/"+target_filename+"_windowed_balanced.csv",index=False)

Examples:
    Total: 8636
    P: 685 (7.93% of total)
    S: 4159 (48.16% of total)
    W: 3792 (43.91% of total)

Examples:
    Total: 12061
    P: 4110 (34.08% of total)
    S: 4159 (34.48% of total)
    W: 3792 (31.44% of total)



In [173]:
# Reload the balanced data set and split it into labels and feature matrix.
df = pd.read_csv(f"data/{target_filename}_windowed_balanced.csv")
# pop() returns the 'Class' column and removes it from df, so the columns
# left in df are the features.
test_labels = np.array(df.pop('Class'))
test_features = np.array(df)
test_features

array([[ 3.85654342e+01,  5.70729515e+01,  7.20343959e+01, ...,
         1.70714166e+00, -5.51311502e+00,  5.01099073e-01],
       [ 4.25379527e+01,  3.47616829e+01,  5.70236791e+01, ...,
         1.96111906e+00,  6.23864651e+00,  2.00895988e-01],
       [ 2.63087151e+01,  8.63925026e+01,  2.79903929e+01, ...,
         2.39541245e+00, -3.68290997e-01,  5.00999871e-01],
       ...,
       [ 3.51125278e+01,  1.30516884e+01,  2.24620945e+01, ...,
         1.98929236e+00,  1.14147854e-01,  9.92078916e-04],
       [ 2.81393441e+01,  1.39130203e+01,  1.50494196e+01, ...,
         2.97737889e+00, -1.67937880e+00,  9.92078916e-04],
       [ 1.35788201e+01,  4.39082036e+01,  3.63237022e+01, ...,
         3.19278002e+00, -1.39679813e-01,  9.92078916e-04]])

In [177]:
from time import strftime

from tensorflow import keras

# Settings for this cell. `hln` is only referenced by the disabled
# evaluation/plotting code at the bottom.
BATCH_SIZE = 200
hln = 200

# Load the saved model from ./model and predict on the test features.
model = keras.models.load_model('./model')
test_predictions_baseline = model.predict(test_features, batch_size=BATCH_SIZE)

# Index of the highest-scoring output per example (displayed as cell output).
test_predictions_baseline.argmax(axis=1)
# Disabled evaluation / confusion-matrix plotting (needs tf, plot_cm and plt):
# baseline_results = model.evaluate(test_features, tf.one_hot(test_labels,depth=3),
#                                     batch_size=BATCH_SIZE, verbose=0)
# plot_cm(tf.one_hot(test_labels,depth=3).numpy().argmax(axis=1),test_predictions_baseline.argmax(axis=1),baseline_results,hln)
# date = strftime('%X %x').replace("/","").split()
# plt.savefig("figures/"+str(date[1])+"@"+str(date[0][:5].replace(":",""))+"_"+str(hln)+".png",bbox_inches='tight')

array([2, 2, 2, ..., 0, 0, 0])

In [186]:
import os

# Collect the preprocessed CSV files in data/, leaving out those whose names
# start with "control" or "deprivation".
filenames = []
for filename in os.listdir('data'):
    if not filename.endswith("preprocessed.csv"):
        continue
    if filename.startswith(("control", "deprivation")):
        continue
    filenames.append(filename)
