In [48]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
%matplotlib inline

import os

Based on the paper (The one in the code directory, "Signal Acquisition" section) channel oz is the best to identify users

In [49]:
DATA_DIR = 'dataset'
channel = 'Oz..'
dataSets = []
for fil in os.listdir(DATA_DIR):
    if fil.endswith('.csv'):
        datafile = pd.read_csv(os.path.join(DATA_DIR, fil))
        userId = fil[2:4]
        datafile['userId'] = userId
        dataSets.append(datafile)
data = pd.concat(dataSets)

In [50]:
#Ensemble Averaging
#The aim is to average the results of the 50 samples to smooth out the data. We can use a sliding window to smooth the data
averagingWindow = 50
for user in data.userId.unique():
    data.loc[data.userId == user] = pd.rolling_mean(data[data.userId == user], averagingWindow, center=False, min_periods=0)

	DataFrame.rolling(min_periods=0,window=50,center=False).mean()
  """


In [51]:
#Starting with frequency analysis and feature generation
#We will use 1 of data for checking user identity, with feautures created by 500 ms of data. Each such sample
#will be of length 15. There will be 80 samples in a second as frequency is 160 and we are taking 500 ms as the sample
#size, so if we slide a window of 80 i.e. 500 ms we should get 80 samples in a second

'''
Things to do:
1. split the data for a user into 500 ms samples
2. convert to fft and pass the low pass 60hz filter
3. Split the data in the 5 bands and calculate the mean, std and entropy
'''

featureOrder = ['delta_mean', 'delta_std', 'delta_entropy', 'theta_mean', 'theta_std', 'theta_entropy',
                'alpha_mean', 'alpha_std', 'alpha_entropy',
               'beta_mean', 'beta_std', 'beta_entropy', 'gamma_mean', 'gamma_std', 'gamma_entropy',]
def fftFeatureTransformation(windowData):
    #Calculating the FFT features
    Fs = 160.0
    y = windowData
    n = y.shape[0]
    k = np.arange(n)
    T = n/Fs
    frq = k/T # two sides frequency range
    frq = frq[range(n/2)] # one side frequency range

    Y = np.fft.fft(y)/n # fft computing and normalization
    Y = Y[range(n/2)]
    
    #using a low pass filter to filter out frequency above 60 hz
    Y = abs(Y[frq <= 60])
    frq = frq[frq <= 60]
    
    lowerBandEnd = 0
    bandSteps = [4, 8, 15, 30, 60]
    featureData = []
    for upperBand in bandSteps:
        bandData = Y[(frq >= lowerBandEnd) & (frq <= upperBand)]
        #calculating the mean of the band data
        featureData.append(bandData.mean())
        
        #calculating the std of the band data
        featureData.append(bandData.std())
        
        featureData.append( -1*(np.dot(np.square(bandData), np.log(np.square(bandData)))).sum() )
    return featureData

def createFeatures(df, le):
    dataSet = []
    df = df.reset_index(drop=True)
    for i in range(le): #1600 - 160):
        dataWindow = df.loc[i - 80: i]
        #print dataWindow.shape
        dataSet.append(fftFeatureTransformation(dataWindow))
    return dataSet

In [52]:
# Taking 1 second of data
#ozChannelData_u1 = ensembleAverage_user1[channel].reset_index(drop=True)[160:1600]
#ozChannelData_u2 = ensembleAverage_user2[channel].reset_index(drop=True)[160:1600]

transformedDataSets = []
for user in data.userId.unique():
    channelData = data[data.userId == user][channel].reset_index(drop=True)[160:]
    transformedDataset_user = pd.DataFrame(createFeatures(channelData, channelData.shape[0] - 160), columns = featureOrder)
    transformedDataset_user['userId'] = user
    transformedDataSets.append(transformedDataset_user)

transformedData = pd.concat(transformedDataSets)



In [53]:
'''
Need to now run the 2 experiments to determine which model to pick and deploy
'''

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [54]:
'''
Experiment 1:
Accept the correct user
1. Randomly sample 80 points from the dataset for the same user. And check if the algorithm works or not
2. Perform it for all the users
'''
#[('Random Forest', RandomForestClassifier(max_depth = 3))] Did not perform well in identifying same user
mdls = [('Linear SVM', LinearSVC(C = 0.01)), ('Logistic Regression', LogisticRegression(C = 0.01)), ('Naive Bayes', GaussianNB())] #, 
for user in transformedData.userId.unique():
    print 'Running for user', user
    mdlResults = {}
    for i in range(100):
        x = transformedData[transformedData.userId == user].reset_index(drop=True).loc[1:].sample(80) #.iloc[1:161]
        y = [1]*(x.shape[0]/2) + [0]*(x.shape[0]/2)
        for mdlName, mdl in mdls:
            mdl.fit(x, y)
            pred_y = mdl.predict(x)
            #print 'For Model', mdlName, 'the accuracy was', (pred_y == y).mean()
            mdlResults[mdlName] = mdlResults.get(mdlName, []) + [(pred_y == y).mean()]
        #print '-'*53
    for mdl in mdlResults.keys():
        print 'Average accuracy or model', mdl, 'is', np.array(mdlResults[mdl]).mean()
    print '**'*53

Running for user 1
Average accuracy or model Naive Bayes is 0.528125
Average accuracy or model Linear SVM is 0.520875
Average accuracy or model Logistic Regression is 0.580625
**********************************************************************************************************
Running for user 2
Average accuracy or model Naive Bayes is 0.5295
Average accuracy or model Linear SVM is 0.548875
Average accuracy or model Logistic Regression is 0.55975
**********************************************************************************************************
Running for user 3
Average accuracy or model Naive Bayes is 0.53425
Average accuracy or model Linear SVM is 0.52275
Average accuracy or model Logistic Regression is 0.578625
**********************************************************************************************************
Running for user 4
Average accuracy or model Naive Bayes is 0.5515
Average accuracy or model Linear SVM is 0.5385
Average accuracy or model Logistic Regress

In [55]:
'''
Experiment 2:
Distinguish between 2 different users
1. Randomly sample 40 points from one user and another 40 points from another user and check if it works or not
2. Perform it for all the pairs of users
'''
mdls = [('Linear SVM', LinearSVC(C = 0.01)), ('Logistic Regression', LogisticRegression(C = 0.01)), ('Naive Bayes', GaussianNB())] #, 
users = list(transformedData.userId.unique())
for i, userA in enumerate(users):
    for userB in users[i+1:]:
        print 'Running for users', userA, userB
        mdlResults = {}
        for i in range(100):
            x_a = transformedData[transformedData.userId == userA].reset_index(drop=True).loc[1:].sample(40) #.iloc[1:161]
            x_b = transformedData[transformedData.userId == userB].reset_index(drop=True).loc[1:].sample(40)
            x = x_a.append(x_b)
            y = [1]*(x_a.shape[0]) + [0]*(x_b.shape[0])
            for mdlName, mdl in mdls:
                mdl.fit(x, y)
                pred_y = mdl.predict(x)
                #print 'For Model', mdlName, 'the accuracy was', (pred_y == y).mean()
                mdlResults[mdlName] = mdlResults.get(mdlName, []) + [(pred_y == y).mean()]
            #print '-'*53
        for mdl in mdlResults.keys():
            print 'Average accuracy or model', mdl, 'is', np.array(mdlResults[mdl]).mean()
        print '**'*53

Running for users 1 2
Average accuracy or model Naive Bayes is 0.846375
Average accuracy or model Linear SVM is 0.82875
Average accuracy or model Logistic Regression is 0.8665
**********************************************************************************************************
Running for users 1 3
Average accuracy or model Naive Bayes is 0.787
Average accuracy or model Linear SVM is 0.57325
Average accuracy or model Logistic Regression is 0.871375
**********************************************************************************************************
Running for users 1 4
Average accuracy or model Naive Bayes is 0.917625
Average accuracy or model Linear SVM is 0.668
Average accuracy or model Logistic Regression is 0.878125
**********************************************************************************************************
Running for users 1 5
Average accuracy or model Naive Bayes is 0.9415
Average accuracy or model Linear SVM is 0.843375
Average accuracy or model Logisti

Average accuracy or model Naive Bayes is 0.95875
Average accuracy or model Linear SVM is 0.776125
Average accuracy or model Logistic Regression is 0.886
**********************************************************************************************************
Running for users 5 6
Average accuracy or model Naive Bayes is 1.0
Average accuracy or model Linear SVM is 0.570375
Average accuracy or model Logistic Regression is 0.6005
**********************************************************************************************************
Running for users 5 7
Average accuracy or model Naive Bayes is 1.0
Average accuracy or model Linear SVM is 0.6975
Average accuracy or model Logistic Regression is 0.73175
**********************************************************************************************************
Running for users 5 8
Average accuracy or model Naive Bayes is 0.998625
Average accuracy or model Linear SVM is 0.741625
Average accuracy or model Logistic Regression is 0.82075
*****

Unnamed: 0,delta_mean,delta_std,delta_entropy,theta_mean,theta_std,theta_entropy,alpha_mean,alpha_std,alpha_entropy,beta_mean,beta_std,beta_entropy,gamma_mean,gamma_std,gamma_entropy
1140,6.457475,5.651211,-1130.791623,4.401789,5.050249,-1132.792964,3.026227,4.369857,-1131.819715,1.716639,3.357823,-1129.706940,0.974684,2.531166,-1127.994592
305,5.892747,2.028969,-451.103828,4.626145,2.226444,-482.935922,3.426288,2.346361,-487.504167,2.101601,2.126026,-485.571494,1.288779,1.743408,-481.041245
119,22.705367,20.892818,-22098.782876,14.448275,19.087810,-22114.853350,9.508162,16.383511,-22118.064763,5.099249,12.395936,-22115.633589,2.820319,9.211390,-22111.415590
142,16.539879,16.528600,-11811.676968,10.654473,14.702914,-11826.000537,7.147917,12.474472,-11828.966301,3.907821,9.397780,-11826.574776,2.180782,6.983285,-11822.881278
1359,6.858732,3.644130,-805.769225,4.412985,4.118639,-805.388367,2.970265,3.751715,-804.355143,1.611870,2.980919,-802.954451,0.907675,2.261832,-801.580069
522,5.428865,2.439464,-408.144719,3.813136,2.748750,-411.945330,2.621845,2.662746,-410.892883,1.477136,2.204810,-409.022835,0.850013,1.711434,-407.336655
1174,11.578844,8.483995,-3697.349392,7.162301,8.513510,-3696.923381,4.633182,7.480806,-3696.027917,2.433184,5.729451,-3694.774215,1.293718,4.281111,-3694.293660
204,9.637350,10.044963,-3620.091771,6.035696,8.944271,-3619.372866,3.864861,7.606863,-3618.890995,1.975795,5.701131,-3618.539507,1.025978,4.211655,-3618.509097
1087,3.799880,3.595554,-336.395813,2.648018,3.122967,-336.198332,1.801761,2.704450,-335.590938,1.053324,2.055291,-333.907809,0.616978,1.543980,-332.597662
1056,3.228277,3.436164,-271.163665,2.541143,2.793818,-275.172828,1.877203,2.370526,-274.365473,1.110507,1.844970,-272.442037,0.660612,1.404656,-270.762074


In [16]:
train_x[40:].describe()

Unnamed: 0,delta_mean,delta_std,delta_entropy,theta_mean,theta_std,theta_entropy,alpha_mean,alpha_std,alpha_entropy,beta_mean,beta_std,beta_entropy,gamma_mean,gamma_std,gamma_entropy
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,17.894218,16.69224,-13433.154662,11.477378,15.139405,-13444.736081,7.579101,12.986001,-13446.224907,4.081351,9.827114,-13443.815904,2.260765,7.306363,-13440.403603
std,2.929637,2.02884,4287.101016,1.847977,2.023689,4289.695077,1.216851,1.787006,4290.754131,0.659628,1.377829,4290.804702,0.368644,1.033423,4290.153113
min,13.871458,13.416976,-21575.595094,8.556245,12.263231,-21589.004265,5.618425,10.411238,-21591.612863,2.966651,7.824994,-21589.206686,1.614258,5.792581,-21584.882954
25%,15.25006,15.00171,-16891.024478,9.984741,13.289878,-16909.29251,6.584019,11.387236,-16911.065835,3.519547,8.615703,-16908.432196,1.931661,6.403315,-16904.647938
50%,17.427865,16.796749,-12752.414347,11.203015,15.087248,-12765.772899,7.565654,12.818688,-12771.345191,4.099583,9.704839,-12768.787183,2.288744,7.218765,-12764.900788
75%,20.665306,18.097894,-9658.254959,13.317128,16.659747,-9671.167886,8.773486,14.418104,-9670.865878,4.668702,10.990847,-9668.370589,2.578749,8.185902,-9665.936215
max,22.584311,20.563811,-7645.718994,14.300013,18.891369,-7644.989848,9.391854,16.224534,-7644.315658,5.035148,12.272353,-7642.504663,2.794283,9.115567,-7640.943026


In [20]:
for user in transformedData['userId'].unique():
    print transformedData[transformedData.userId == user].shape

(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)
(9391, 16)


In [56]:
transformedData.to_csv('processedTransformedData.csv', index = False)

In [66]:
user1.shape

(9760, 65)