In [65]:
import csv
import os
import biosppy.signals.ecg as ecg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

In [66]:
# Load the signal table from X_test.csv, using its 'id' column as the row index.
# NOTE(review): this reads the *test* file, while the labels loaded in the next
# cell come from y_train.csv — confirm the two are meant to be used together.
X = pd.read_csv('X_test.csv', index_col='id')

In [3]:
# Load the labels from y_train.csv, using its 'id' column as the row index.
y = pd.read_csv('y_train.csv', index_col='id')

In [67]:
# Display the (rows, columns) shape of the loaded table.
X.shape

(3411, 17842)

In [68]:
# Turn every row of X into its own float32 array with the NaN entries dropped.
# NOTE(review): the NaNs are presumably padding for rows of unequal length — confirm.
# Rows are selected by position (.iloc) because range(len(X)) enumerates
# positions; .loc would look the numbers up as 'id' labels, which only works
# when the ids happen to be exactly 0..len(X)-1.
data = [
    X.iloc[pos].dropna().to_numpy(dtype='float32')
    for pos in range(len(X))
]

# Signal Processing Features

In [128]:
# Accumulator for the signal-processing feature groups; the cells below each
# append one or more lists that hold one entry per signal in `data`.
Features = []

In [129]:
from scipy.signal import periodogram
from scipy.stats import entropy
from scipy.fft import fft, ifft

In [130]:
# Strongest spectral components of every signal: the N largest FFT magnitudes
# (in descending order) and the bin indices at which those magnitudes occur.
# The indices are ranked on the magnitude spectrum. Calling argsort on the raw
# complex FFT output would order bins by real part (then imaginary part), so
# the resulting indices would not correspond to the values stored in fft_mag.
N_FFT_COMPONENTS = 100

fft_mag = []
fft_index = []
for signal in data:
    spectrum = np.abs(fft(signal))  # computed once and reused for both features
    strongest = spectrum.argsort()[-N_FFT_COMPONENTS:][::-1]
    fft_mag.append(spectrum[strongest])
    fft_index.append(strongest)

Features.append(fft_mag)
Features.append(fft_index)

    

In [131]:
from scipy.signal import argrelmin
# Indices of the first (up to) five relative minima of each signal.
rel_min = [argrelmin(signal)[0][:5] for signal in data]
Features.append(rel_min)

In [132]:
from scipy.signal import argrelmax
# Indices of the first (up to) five relative maxima of each signal.
rel_max = [argrelmax(signal)[0][:5] for signal in data]
Features.append(rel_max)

In [133]:
from scipy.signal import find_peaks
# Indices of the first (up to) five peaks that find_peaks reports per signal.
# `peaks` is kept in the same order as `data`; the following cells pair them up.
peaks = [find_peaks(signal)[0][:5] for signal in data]
Features.append(peaks)

In [134]:
from scipy.signal import peak_prominences
# For the retained peaks of every signal, collect the three arrays returned by
# peak_prominences: the prominences and the left / right base values.
peak_prominence, left_basis, right_basis = [], [], []
for signal, signal_peaks in zip(data, peaks):
    prominence, left, right = peak_prominences(signal, signal_peaks)
    peak_prominence.append(prominence)
    left_basis.append(left)
    right_basis.append(right)
Features.extend([peak_prominence, left_basis, right_basis])

In [135]:
from scipy.signal import peak_widths
# For the retained peaks of every signal, collect the four arrays returned by
# peak_widths: widths, the heights they were evaluated at, and the left /
# right positions delimiting each width.
widths, width_heights, left_ips, right_ips = [], [], [], []
for signal, signal_peaks in zip(data, peaks):
    width, height, left, right = peak_widths(signal, signal_peaks)
    widths.append(width)
    width_heights.append(height)
    left_ips.append(left)
    right_ips.append(right)
Features.extend([widths, width_heights, left_ips, right_ips])

In [136]:
# Periodogram-based power features for every signal (periodogram called with
# fs=300): mean / total / max / min power, a thresholded power ratio, the
# entropy of the normalised spectrum, and the summed power in seven bands.
band_edges = [0, 2, 4, 8, 16, 32, 64, 128]

power_periodogram = []
for signal in data:
    sample_freq, psd = periodogram(signal, fs=300)

    # Scale each band edge by len(signal) / 300 to obtain a PSD bin index,
    # then sum the bins between consecutive edges.
    bin_edges = [int(edge * len(signal) / 300) for edge in band_edges]
    band_power = [sum(psd[lo:hi]) for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]

    # Split the spectrum by bin value: bins below 5 go to signal_power,
    # all others to noise_power.
    signal_power = 0
    noise_power = 0
    for bin_power in psd:
        if bin_power < 5:
            signal_power += bin_power
        else:
            noise_power += bin_power
    SNR = signal_power / noise_power

    # Entropy of the spectrum normalised to sum to one.
    Shannon = entropy(psd / psd.sum())

    row = [np.mean(psd), np.sum(psd), max(psd), min(psd), SNR, Shannon]
    row.extend(band_power)
    power_periodogram.append(row)
Features.append(power_periodogram)

In [137]:
from scipy.signal import welch
# Welch-PSD power features for every signal (welch called with fs=300):
# mean / total / max / min power, a thresholded power ratio, the entropy of
# the normalised spectrum, and the summed power in seven frequency bands.
WELCH_BAND_EDGES_HZ = [0, 2, 4, 8, 16, 32, 64, 128]

power_welch = []
for signal in data:
    sample_freq, psd = welch(signal, fs=300)

    mean_power = np.mean(psd)
    sum_power = np.sum(psd)
    max_power = max(psd)
    min_power = min(psd)

    # Locate the band edges on the frequency axis that welch returned. Its bin
    # spacing is set by the segment length rather than by len(signal), so
    # scaling the edges by len(signal) / 300 would select the wrong (and
    # mostly out-of-range, hence empty) slices of psd.
    bin_edges = np.searchsorted(sample_freq, WELCH_BAND_EDGES_HZ)
    band_power = [psd[lo:hi].sum() for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]

    # Reset the accumulators for each signal. Without this they would keep
    # adding to whatever value the names held before, mixing every signal's
    # totals into one running sum.
    signal_power = 0
    noise_power = 0
    for bin_power in psd:
        if bin_power < 5:
            signal_power += bin_power
        else:
            noise_power += bin_power
    SNR = signal_power / noise_power

    # Entropy of the spectrum normalised to sum to one.
    Shannon = entropy(psd / psd.sum())

    row = [mean_power, sum_power, max_power, min_power, SNR, Shannon]
    row.extend(band_power)
    power_welch.append(row)
Features.append(power_welch)

In [138]:
# Join the 14 signal-processing feature groups column-wise (axis=1), in the
# order they were appended to Features.
X = np.concatenate([Features[group] for group in range(14)], axis=1)

In [139]:
# Display the shape of the concatenated feature matrix.
X.shape

(3411, 276)

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [312]:
# Random forest with an explicit weight for each of the class labels 0-3.
class_weights = {
    0: 0.4254677754677755,
    1: 2.842361111111111,
    2: 0.859151973131822,
    3: 7.468978102189781,
}
clf = RandomForestClassifier(class_weight=class_weights)

In [313]:
from sklearn.model_selection import cross_val_score

In [314]:
# 5-fold cross-validation of clf on X against the flattened label values.
scores = cross_val_score(estimator=clf, X=X, y=y.values.ravel(), cv=5)

In [315]:
# Display the per-fold cross-validation scores.
scores

array([0.59765625, 0.60253906, 0.59824047, 0.59237537, 0.60117302])

# Statistical Features

In [52]:
# Accumulator for the statistical features; each cell below appends one array
# holding one value per signal in `data`.
statfeature = []

In [53]:
# Number of samples in each signal.
count = [len(signal) for signal in data]
statfeature.append(np.array(count))
len(count)

3411

In [54]:
# Mean value of each signal.
mean = [signal.mean() for signal in data]
statfeature.append(np.array(mean))
len(mean)

3411

In [55]:
# Largest value of each signal.
max_ = [signal.max() for signal in data]
statfeature.append(np.array(max_))
len(max_)

3411

In [56]:
# Smallest value of each signal.
min_ = [signal.min() for signal in data]
statfeature.append(np.array(min_))
len(min_)

3411

In [57]:
# Variance of each signal.
var = [signal.var() for signal in data]
statfeature.append(np.array(var))
len(var)

3411

In [58]:
# Skewness of each signal, as computed by scipy.stats.skew.
skeness = [stats.skew(signal) for signal in data]
statfeature.append(np.array(skeness))
len(skeness)

3411

In [59]:
# Kurtosis of each signal, as computed by scipy.stats.kurtosis.
kurtosis = [stats.kurtosis(signal) for signal in data]
statfeature.append(np.array(kurtosis))
len(kurtosis)

3411

In [60]:
# 25th, 50th and 75th percentile of each signal, one feature array per percentile.
perc25 = [np.percentile(signal, 25) for signal in data]
perc50 = [np.percentile(signal, 50) for signal in data]
perc75 = [np.percentile(signal, 75) for signal in data]
for percentile_values in (perc25, perc50, perc75):
    statfeature.append(np.array(percentile_values))

In [61]:
# Display how many statistical feature arrays have been collected.
len(statfeature)

10

In [62]:
# Convert the list of per-feature arrays into a single array (one row per
# feature); it is transposed where it is joined onto the other features.
statfeature = np.array(statfeature)

In [63]:
# Join the 14 signal-processing feature groups and the transposed statistical
# features column-wise (axis=1), signal-processing groups first.
feature_blocks = [Features[group] for group in range(14)]
feature_blocks.append(statfeature.T)
X = np.concatenate(feature_blocks, axis=1)



ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 2 has 1 dimension(s)

In [89]:
from sklearn.ensemble import RandomForestClassifier

In [90]:
# Random forest with an explicit weight for each of the class labels 0-3.
class_weights = {
    0: 0.4254677754677755,
    1: 2.842361111111111,
    2: 0.859151973131822,
    3: 7.468978102189781,
}
clf = RandomForestClassifier(class_weight=class_weights)

In [91]:
from sklearn.model_selection import cross_val_score

In [92]:
# 5-fold cross-validation of clf on X against the flattened label values;
# the per-fold scores are shown as the cell output.
scores = cross_val_score(estimator=clf, X=X, y=y.values.ravel(), cv=5)
scores

array([0.60546875, 0.60351562, 0.59726295, 0.6001955 , 0.60312805])

In [None]:
# Write the feature matrix to CSV. X is a NumPy array once a concatenate cell
# has run, and ndarrays have no .to_csv method, so it is wrapped in a
# DataFrame for the write; the wrap is harmless if X is already a DataFrame.
pd.DataFrame(X).to_csv('signal_stats_test.csv')

In [None]:
# Wrap the feature matrix in a DataFrame and display it.
X = pd.DataFrame(X)
X

In [64]:
# Inspect the first feature group stored in Features.
Features[0]

[array([391358.28 , 391358.28 , 347621.1  , 347621.1  , 301414.4  ,
        301414.4  , 278118.8  , 278118.8  , 274919.9  , 274919.9  ,
        270515.   , 260546.92 , 260546.92 , 253335.83 , 253335.83 ,
        244101.89 , 244101.89 , 240558.58 , 240558.58 , 234986.83 ,
        234986.83 , 226875.39 , 226875.39 , 216847.16 , 216847.16 ,
        210151.97 , 210151.97 , 200753.9  , 200753.9  , 191104.89 ,
        191104.89 , 181706.83 , 181706.83 , 179991.69 , 179991.69 ,
        179255.39 , 179255.39 , 167211.88 , 167211.88 , 166368.44 ,
        166368.44 , 163814.77 , 163814.77 , 155714.53 , 155714.53 ,
        152391.95 , 152391.95 , 149233.06 , 149233.06 , 148964.23 ,
        148964.23 , 147979.69 , 147979.69 , 144668.47 , 144668.47 ,
        140321.77 , 140321.77 , 139947.03 , 139947.03 , 139426.62 ,
        139426.62 , 139013.02 , 139013.02 , 138800.94 , 138800.94 ,
        137547.38 , 137547.38 , 137473.62 , 137473.62 , 134841.44 ,
        134841.44 , 134774.08 , 134774.08 , 1295