In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier as Knn
from sklearn.ensemble import RandomForestClassifier as rfc 
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.svm import SVC

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv(r"physionet_A_N.csv")
print(df.shape)
df.head()

(18370, 271)


Unnamed: 0,TS_0,TS_1,TS_2,TS_3,TS_4,TS_5,TS_6,TS_7,TS_8,TS_9,...,TS_261,TS_262,TS_263,TS_264,TS_265,TS_266,TS_267,TS_268,TS_269,CLASS
0,-127.0,-267.0,-241.0,-229.0,-215.0,-201.0,-253.0,-273.0,-226.0,-172.0,...,-29.0,-22.0,-19.0,-22.0,-43.0,-53.0,-23.0,22.0,-31.0,N
1,-41.0,663.0,-39.0,-113.0,-81.0,-67.0,-45.0,34.0,82.0,62.0,...,-22.0,42.0,104.0,32.0,2.0,2.0,-13.0,-16.0,6.0,N
2,7.0,7.0,12.0,18.0,36.0,80.0,0.0,-19.0,418.0,248.0,...,-22.0,-33.0,-31.0,-34.0,-11.0,1.0,-8.0,12.0,62.0,N
3,128.0,268.0,244.0,209.0,194.0,234.0,203.0,188.0,211.0,228.0,...,-70.0,-62.0,-44.0,393.0,-74.0,-84.0,-65.0,-30.0,-6.0,N
4,13.0,54.0,80.0,56.0,-36.0,1469.0,-199.0,-180.0,-177.0,-126.0,...,-15.0,-37.0,-50.0,-21.0,30.0,17.0,1571.0,-165.0,-201.0,N


a 1)  
Time series data  
Categorical labels and continuous featues


In [3]:
corr = df.corr()>0.5
corr.values.sum()//2

298

a 2)

Features are correlatded, we'll use pca

In [4]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]


a 3)

In [5]:
pca = PCA(n_components = 'mle')
Xt = pca.fit_transform(X)
Xt.shape[1]

183

a 4)

In [18]:
yt = pd.factorize(y)[0]  # for convention
X_train, X_test, y_train, y_test = train_test_split(Xt, yt, test_size=0.2, random_state=42,stratify=yt);

In [7]:
def metrics(y_pred,y_pred_train):
    train_acc = accuracy_score(y_train,y_pred_train)  # train predictions
    acc = accuracy_score(y_test,y_pred)
    precisiont, recallt, fscoret,_ = score(y_train,y_pred_train,average = 'macro',zero_division=0)
    precision, recall, fscore,_ = score(y_test,y_pred,average = 'macro',zero_division=0)
    scores = pd.DataFrame( columns = ['metric','Train','Test'])
    scores.metric =  ['Accuarcy','Precision','Recall','F1']
    scores.Train = [train_acc,precisiont, recallt, fscoret]
    scores.Test = [acc,precision, recall, fscore]
    print(scores)

b)

In [8]:
knn  = Knn(n_neighbors = 10) 
knn.fit(X_train,y_train); 
y_pred_train = knn.predict(X_train)  
y_pred = knn.predict(X_test)   # test predictions
metrics(y_pred,y_pred_train)

      metric     Train      Test
0   Accuarcy  0.872754  0.872891
1  Precision  0.769753  0.936428
2     Recall  0.500990  0.501068
3         F1  0.468147  0.468188


In [9]:
rf = rfc()
rf.fit(X_train,y_train)
y_pred_train = rf.predict(X_train)  
y_pred = rf.predict(X_test)  
metrics(y_pred,y_pred_train)

      metric  Train      Test
0   Accuarcy    1.0  0.871802
1  Precision    1.0  0.436257
2     Recall    1.0  0.499532
3         F1    1.0  0.465755


C)  
As we can see even though the accuracy is good. Precision and Recall are both bad, we can use F1 to get a general sense of the model's performance.    
Now considering we want to make sure that a positive case is not labeled negative we should minimize false negatives. We will use recall from here on

In [10]:
np.unique(yt,return_counts=True)

(array([0, 1], dtype=int64), array([16030,  2340], dtype=int64))

The data is imbalanced, the number of positive cases are very low, so we should either upsample positive class or downsample negative class.

d )  
We can oversample our positive class if we want to work on the data.  
Otherwise we can put some weights to our model to penalize wrong classification of the positive class more 

Generally Random Forest is capable of determining feature importances, so here we'll show how giving weights to classes affect the SVM classifier

## Setting Class Weight

In [19]:
print("SVM")
svm = SVC(kernel = 'rbf')
svm.fit(X_train, y_train)
y_pred_train = svm.predict(X_train)  
y_pred = svm.predict(X_test)  
metrics(y_pred,y_pred_train)

SVM
      metric     Train      Test
0   Accuarcy  0.879491  0.872618
1  Precision  0.939329  0.436309
2     Recall  0.526976  0.500000
3         F1  0.518896  0.465988


In [12]:
print('Weighted SVM')
svm = SVC(kernel = 'rbf',class_weight='balanced', probability=True)
svm.fit(X_train, y_train)
y_pred_train = svm.predict(X_train)  
y_pred = svm.predict(X_test)  
metrics(y_pred,y_pred_train)

Weighted SVM
      metric     Train      Test
0   Accuarcy  0.774565  0.680457
1  Precision  0.672823  0.557841
2     Recall  0.848702  0.614349
3         F1  0.685106  0.544350


while the results are not satisfactory, they are still an improvement from the last case. Where one clases was not even predicted

Now we upsample the positive class and see how the classifier changes.

## Upsampling

In [13]:
class1 = y_train==1
X_train = np.append(X_train,X_train[class1].repeat(6,axis=0),axis=0)
y_train = np.append(y_train,y_train[class1].repeat(6,axis=0),axis=0)

In [14]:
print('Knn')
knn  = Knn(n_neighbors = 10,weights='uniform', metric='minkowski',p=2) 
knn.fit(X_train,y_train); 
y_pred_train = knn.predict(X_train)  
y_pred = knn.predict(X_test)   # test predictions
metrics(y_pred,y_pred_train)


print('SVM')
svm = SVC(kernel = 'rbf')
svm.fit(X_train, y_train)
y_pred_train = svm.predict(X_train)  
y_pred = svm.predict(X_test)  
metrics(y_pred,y_pred_train)

print('Random Forest')

rf = rfc(bootstrap=  True,class_weight="balanced_subsample")
rf.fit(X_train,y_train)
y_pred_train = rf.predict(X_train)  
y_pred = rf.predict(X_test)  
metrics(y_pred,y_pred_train)

Knn
      metric     Train      Test
0   Accuarcy  0.901612  0.710942
1  Precision  0.918524  0.539980
2     Recall  0.900538  0.568859
3         F1  0.900425  0.535486
SVM
      metric     Train      Test
0   Accuarcy  0.906279  0.744692
1  Precision  0.914243  0.571641
2     Recall  0.905542  0.618308
3         F1  0.905704  0.576342
Random Forest
      metric  Train      Test
0   Accuarcy    1.0  0.872618
1  Precision    1.0  0.436309
2     Recall    1.0  0.500000
3         F1    1.0  0.465988


In [17]:
print("Adaboost")
abc = ABC(learning_rate=0.1, n_estimators=100)
abc.fit(X_train,y_train)
y_pred_train = abc.predict(X_train)  
y_pred = abc.predict(X_test)  
metrics(y_pred,y_pred_train)

Adaboost
      metric     Train      Test
0   Accuarcy  0.643937  0.583016
1  Precision  0.646075  0.555535
2     Recall  0.643193  0.624210
3         F1  0.641878  0.498803


This is the best we are getting.  
Generally getting great results on ECG/EEG signal is hard without some form of feature engineering. And as we are conisdering this to be a static wave, we are not trying to extract features.    
Deep learning classifiers can better extract features and classify.