In [None]:
import pandas as pd
from matplotlib import pyplot as plt

import numpy as np
import statsmodels.tsa.stattools as ts

## read example train features

In [None]:
# Row stride used by my_load when slicing each loaded frame (frame[::downsample_pts]);
# a value of 1 keeps every row.
downsample_pts = 1

In [None]:
def my_load(subj_ids:list, series_ids:list, downsample:int=None, data_dir:str='data/raw/train'):
    """Load and stack the feature and event CSVs for the given subjects and series.

    For every (subject, series) pair two files are read from ``data_dir``:
    ``subj<S>_series<N>_data.csv`` (features) and
    ``subj<S>_series<N>_events.csv`` (targets). Each frame gets ``subj_id``
    and ``series_id`` columns, is indexed by ``(subj_id, series_id, id)``,
    cast to int16 and then row-downsampled.

    Parameters
    ----------
    subj_ids : list
        Subject numbers to load.
    series_ids : list
        Series numbers to load for every subject.
    downsample : int, optional
        Row stride; every ``downsample``-th row is kept. When omitted, the
        notebook-level ``downsample_pts`` is used, as before.
    data_dir : str
        Directory that holds the CSV files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(features_all, targets_all)``, each the row-wise concatenation of
        the per-file frames in load order.

    Raises
    ------
    ValueError
        If the selection is empty, so that there is nothing to concatenate.
    """
    # Only fall back to the notebook global when no explicit stride is given,
    # so the function can be used without hidden state.
    step = downsample_pts if downsample is None else downsample

    features_all = []
    targets_all = []
    for i1 in subj_ids:
        for i2 in series_ids:
            for i3, suffix, bucket in [
                ('features', 'data', features_all),
                ('targets', 'events', targets_all),
            ]:
                fn = '%s/subj%i_series%i_%s.csv' % (data_dir, i1, i2, suffix)
                print('status:', i1, i2, i3)
                frame = pd.read_csv(fn)
                frame['subj_id'] = i1
                frame['series_id'] = i2
                frame = frame.set_index(['subj_id', 'series_id', 'id']).astype('int16')
                bucket.append(frame[::step]) # downsample

    if not features_all:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError('my_load needs at least one subject id and one series id')

    return pd.concat(features_all, axis=0), pd.concat(targets_all, axis=0)

In [None]:
# Load series 1 through 8 of subject 1, then display the shapes of both frames.
train_features, train_targets = my_load(subj_ids=[1], series_ids=list(range(1, 9)))
(train_features.shape, train_targets.shape)

## min-max scale features to 0-1 (global min/max)

In [None]:
# Per-column minimum and maximum over all loaded rows, each written to disk as a pickle.
# NOTE(review): presumably persisted so that other data can be scaled with the
# training range — confirm against whatever reads these files.
train_features_min = train_features.min(axis=0)
train_features_max = train_features.max(axis=0)
train_features_min.to_pickle('data/processed/train_features_min.pkl')
train_features_max.to_pickle('data/processed/train_features_max.pkl')

In [None]:
def scale_df(y):
    """Min-max scale ``y`` to the 0-1 range along axis 0.

    Parameters
    ----------
    y : pandas.DataFrame or pandas.Series
        Values to scale; for a DataFrame every column is scaled independently.

    Returns
    -------
    Same type as ``y``
        ``(y - min) / (max - min)``. A column whose values are all equal has a
        zero range; it is returned as all zeros rather than 0/0 = NaN.
    """
    lo = y.min(axis=0)
    span = y.max(axis=0) - lo
    # Guard the zero-range case so constant columns do not turn into NaN.
    if isinstance(span, pd.Series):
        span = span.mask(span == 0, 1)
    elif span == 0:
        span = 1
    return (y - lo) / span # scale to 0-1
# Rebinds train_features to its min-max scaled version; after this cell the raw
# values are no longer reachable under this name.
train_features = scale_df(train_features)

## plot all

In [None]:
# Draw the first 10*1000 rows of a feature column as a wide line plot titled with
# the column name. The `break` ends the loop after the first column, so only one
# figure is produced.
for k in train_features.columns:
    x = train_features[k].head(n=10*1000)
    x.plot(figsize=(20,3))
    plt.title(k)
    plt.show()
    break

## correlation matrix

In [None]:
# Pairwise Pearson correlation between the columns of train_features; the
# trailing expression displays the matrix shape.
corr_df = train_features.corr(method='pearson')
corr_df.shape

In [None]:
# Boolean mask of the columns whose correlation with 'Fp1' exceeds 0.7.
corr_df.loc['Fp1'] > .7

In [None]:
# Render the full correlation matrix as a heatmap.
import seaborn as sns
sns.heatmap(corr_df)
plt.show()

In [None]:
# Row labels of the correlation matrix; rm_mean_corr iterates over these.
corr_df.index

In [None]:
def rm_mean_corr(s, corr_df, threshold=0.7, verbose=True):
    """Subtract from each feature the mean of the features it correlates with.

    For every label in ``corr_df.index`` the columns whose correlation with
    that feature exceeds ``threshold`` form a group. If the group holds more
    than one column, the row-wise (unweighted) mean of the group is subtracted
    from the feature; otherwise the feature is passed through unchanged.

    Parameters
    ----------
    s : pandas.DataFrame
        Feature values; must contain a column for every label in
        ``corr_df.index``.
    corr_df : pandas.DataFrame
        Correlation matrix; row ``corr_df.loc[f]`` is indexed by column labels
        of ``s``.
    threshold : float
        Correlation above which a column joins the feature's group.
    verbose : bool
        When true, print each feature label and its group, as before.

    Returns
    -------
    pandas.DataFrame
        One column per label of ``corr_df.index``, in that order, each named
        after its own feature.
    """
    cleaned = []
    for feat_i in corr_df.index:
        row = corr_df.loc[feat_i]
        group = row[row > threshold].index
        if verbose:
            print(feat_i)
            print(group)

        if len(group) <= 1:
            # Nothing but (at most) the feature itself: keep it untouched.
            col = s[feat_i]
        else:
            # Unweighted mean of the correlated group; a correlation-weighted
            # mean was considered by the original author and not implemented.
            col = s[feat_i] - s[group].mean(axis=1)

        # Name each column by its own feature instead of relabelling
        # positionally with s.columns afterwards, which mislabels the data
        # whenever corr_df.index is ordered differently from s.columns.
        cleaned.append(col.rename(feat_i))

    return pd.concat(cleaned, axis=1)

# Apply the correlated-group mean subtraction to the scaled features; the
# trailing expression displays the resulting shape.
new_features = rm_mean_corr(train_features, corr_df)
new_features.shape

In [None]:
# Preview the first rows of the transformed features.
new_features.head()

In [None]:
# Replace new_features with a frame loaded from disk.
# NOTE(review): this discards the new_features computed by rm_mean_corr above, and
# nothing visible in this notebook writes this pickle — confirm where it comes from
# and that its rows match train_features.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
new_features = pd.read_pickle('data/processed/0.5-features_minus_spatial_noise.pkl')

In [None]:
# Plot the first 10000 rows of a transformed feature column, titled with the
# column name. The `break` stops after the first column, so one figure is drawn.
for k in new_features.columns:
    new_features[k].head(n=10000).plot(figsize=(20,3))
    plt.title(k)
    plt.show()
    break

## save

## compare to PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA that keeps as many components as there are feature columns and
# project the scaled features onto those components.
pca = PCA(n_components=train_features.shape[1]) # same number of features
pca_features = pca.fit_transform(train_features)
# Label the rows with train_features.index (not new_features.index): the
# projection was computed from train_features, whereas new_features is reloaded
# from a pickle and is not guaranteed to have the same rows.
pca_features = pd.DataFrame(pca_features, index=train_features.index)

In [None]:
# Display the explained-variance ratio reported by the fitted PCA for each component.
pca.explained_variance_ratio_

In [None]:
# Display the shape of the PCA-projected frame.
pca_features.shape

In [None]:
# One figure per PCA component: the first 10000 rows, titled with the
# component's column label.
for k in pca_features.columns:
    pca_features[k].head(n=10000).plot(figsize=(20,3))
    plt.title(k)
    plt.show()

In [None]:
# plot complete but downsampled
for k in pca_features.columns:
    pca_features[k].iloc[::100].plot(figsize=(20,3))
    plt.title(k)
    plt.show()