In [1]:
import sys
sys.path.insert(0, '../')

import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import sklearn

from typing import Tuple
from pickle import load, dump
from glob import glob
from sklearn.model_selection import train_test_split
from typing import Tuple
import scipy.stats as stats

from src.vg2signal import read_raw_vg_as_df, make_smoother, make_shoulder_getter, make_detilter, find_first_derivative_peaks, make_signal_getter
from src.load_dataset import load_dataset
from src.generate_dataset import make_xlsx_str

In [2]:
def v2signal_extra_features(vg_filename: str,
             do_log: bool,
             smoothing_bw: float,
             vcenter: float,
             vwidth: float,
             stiffness: float,
             noise=None,
             flip_signal=False,
             clip_signal=False):
    """Read one raw voltammogram file and extract scalar features from it.

    Pipeline: read the file, optionally take log2 of the current, smooth,
    locate the shoulder voltage, detilt in a window of width ``vwidth``
    centred on that shoulder, then compute peak, area, derivative and
    summary statistics on the detilted curve.

    Parameters
    ----------
    vg_filename : path passed to ``read_raw_vg_as_df``.
    do_log : if True the smoothing input is ``log2(I)`` (stored as "logI"),
        otherwise the raw "I" column.
    smoothing_bw : argument for ``make_smoother``.
    vcenter : kept for interface compatibility only. The passed value is
        never used; it is replaced by the shoulder voltage found by the
        shoulder getter before the detilting window is computed.
    vwidth : full width of the detilting / signal window.
    stiffness : argument for ``make_detilter``.
    noise : only the value ``'gaussian'`` has an effect; it adds
        normal(0.0005, 0.001) noise to the detilted samples with
        0.93 < V < 1.15 and prints their standard deviation beforehand.
    flip_signal : if True, prints ``vg_filename`` when the detilted minimum
        is larger in magnitude than the maximum. The signal itself is NOT
        flipped (that step is disabled).
    clip_signal : if True, negative detilted values are clipped to 0.

    Returns
    -------
    A 14-element tuple, always in this order:
        (peakarea, peak_signal, peak_v, vg_df, vcenter, peak_height,
         signal_mean, signal_std, dS_dV_max_peak, dS_dV_min_peak,
         dS_dV_peak_diff, dS_dV_max_V, dS_dV_min_V, dS_dV_area)

    * If the file contains any negative current, every element except
      ``vg_df`` is None.
    * If ``clip_signal`` leaves an all-zero curve, every element except
      ``vg_df`` and ``vcenter`` is 0.
    """
    vg_df = read_raw_vg_as_df(vg_filename)

    # Reject files with negative current. vg_df stays in its usual slot
    # (index 3) so callers can unpack every return path the same way.
    if (vg_df['I'].to_numpy() < 0).any():
        return (None, None, None, vg_df) + (None,) * 10

    if do_log:
        cur_var_name = "logI"
        vg_df[cur_var_name] = np.log2(vg_df["I"])
    else:
        cur_var_name = "I"

    smoother = make_smoother(smoothing_bw)

    vg_df["smoothed"] = smoother(vg_df["V"], vg_df[cur_var_name].to_numpy())

    shoulder_getter = make_shoulder_getter(1, 1.1)
    (peak_signal, peak_v_shoulder) = shoulder_getter(vg_df["V"],
                                                     vg_df["smoothed"])

    # The window is always centred on the detected shoulder; the vcenter
    # argument is deliberately overwritten here.
    vcenter = peak_v_shoulder
    vstart = vcenter - 0.5*vwidth
    vend   = vcenter + 0.5*vwidth

    detilter = make_detilter(vstart, vend, stiffness)
    vg_df["detilted"] = detilter(vg_df["V"].to_numpy(),
                                 vg_df["smoothed"].to_numpy())

    if flip_signal:
        # Report-only: files whose detilted curve is predominantly negative
        # are printed; the flip itself is intentionally not applied.
        if abs(vg_df['detilted'].min()) > abs(vg_df['detilted'].max()):
            print(vg_filename)

    if clip_signal:
        vg_df['detilted'] = vg_df['detilted'].clip(lower=0.0)
        if vg_df['detilted'].sum()==0:
            return 0, 0, 0, vg_df, vcenter, 0, 0, 0,\
           0, 0, 0, 0, 0, 0

    # Add noise to the dataset
    if noise=='gaussian':
        noise_window = (vg_df['V']<1.15) & (vg_df['V']>0.93)
        print(vg_df.loc[noise_window, "detilted"].std())
        # Single .loc assignment instead of chained indexing, so the write
        # reaches vg_df (chained writes are dropped under Copy-on-Write).
        vg_df.loc[noise_window, "detilted"] += np.random.normal(0.0005, 0.001, int(noise_window.sum()))

    signal_getter = make_signal_getter(vstart, vend)
    (peak_signal_return, peak_v_return) = signal_getter(vg_df["V"], vg_df["detilted"])

    # np.argmax yields a position, so read the value back positionally.
    ymaxidx = np.argmax(vg_df["detilted"])
    peak_height = vg_df["detilted"].iloc[ymaxidx]

    peakarea = sklearn.metrics.auc(vg_df["V"], vg_df["detilted"])*1000

    V, dS_dV, dS_dV_max_peak, dS_dV_min_peak, dS_dV_peak_diff, \
    dS_dV_max_V, dS_dV_min_V, dS_dV_area        = find_first_derivative_peaks(vg_df["V"].values, vg_df["detilted"].values)

    signal_std  = vg_df["detilted"].values.std()
    signal_mean = vg_df["detilted"].values.mean()

    return  peakarea, peak_signal_return, peak_v_return, vg_df, vcenter, peak_height, signal_mean, signal_std,\
           dS_dV_max_peak, dS_dV_min_peak, dS_dV_peak_diff, dS_dV_max_V, dS_dV_min_V, dS_dV_area


def run_vg2(folderpaths: list, 
            do_log:bool, 
            recenter:bool, 
            smoothing_bw:float, 
            stiffness:float, 
            vcenter:float, 
            vwidth1:float, 
            vwidth2:float,
            noise=None,
            flip_signal=False,
            clip_signal=False,
            savefilename = 'extracted_features') -> pd.DataFrame:
    """Extract features from a batch of voltammogram files and save them to Excel.

    Every entry of ``folderpaths`` ending in ``txt`` is processed with
    ``v2signal_extra_features``; files for which it reports no peak area or
    no peak signal are skipped. All files are assumed to live in the
    directory of ``folderpaths[0]``, which is also where the output
    workbooks are written.

    File names are expected to contain ``cbz<conc>_<replicate>.<ext>``; a
    ``p`` inside ``<conc>`` is read as a decimal point (``7p5`` -> 7.5).

    Three workbooks are written:
      * ``"stats" + data_str``     – per-concentration average / std / CV /
        Welch t-statistic against the next lower concentration / peak-V stats
      * ``f"{savefilename}.xlsx"`` – one row of features per file
      * ``"dataframe" + data_str`` – the concatenated per-sample curves
    where ``data_str`` comes from ``make_xlsx_str``.

    ``recenter`` and ``vwidth2`` are only used to build ``data_str``.

    Returns
    -------
    pandas.DataFrame with one row of features per processed file (the
    content of ``f"{savefilename}.xlsx"``, columns unnamed).
    """
    # get filenames to save
    data_str = make_xlsx_str(do_log, recenter, smoothing_bw, stiffness, vcenter, vwidth1, vwidth2)

    signal_lst = []       # Store all the features
    conc_dict = dict()    # [cbz concentration]: (peak signal, peak V) tuples
    curve_frames = []     # per-file curve tables, concatenated once at the end

    # Work inside the data directory, but restore the caller's working
    # directory afterwards so relative paths elsewhere keep resolving.
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(folderpaths[0]) or os.curdir)
    try:
        for filename in folderpaths:

            if not filename.endswith('txt'):
                continue

            # The per-file centre goes into its own name so the `vcenter`
            # argument is not overwritten between iterations.
            (peak_signal, peak_curvature, peak_v, vg_df, file_vcenter, ph, signal_mean, signal_std,\
                dS_dV_max_peak, dS_dV_min_peak, dS_dV_peak_diff, dS_dV_max_V, \
                dS_dV_min_V, dS_dV_area) = v2signal_extra_features(os.path.basename(filename),
                                                                    do_log,
                                                                    smoothing_bw,
                                                                    vcenter,
                                                                    vwidth1,
                                                                    stiffness,
                                                                    noise,
                                                                    flip_signal,
                                                                    clip_signal)

            if peak_signal is None or peak_curvature is None:
                continue

            idx1 = filename.rfind("cbz")
            idx2 = filename[idx1:].find("_")
            conc = filename[idx1 + 3:idx1 + idx2]
            replicate = filename[idx1 + idx2 + 1:filename.rfind(".")]

            # for 7p5 concentration
            conc = conc.replace('p', '.', 1)

            concstrxl   = str(float(conc))
            concxl      = [concstrxl] * len(vg_df["V"])
            replicatexl = [replicate] * len(vg_df["V"])

            curve_rows = [concxl, replicatexl, vg_df["V"], vg_df["I"]]
            if do_log:
                curve_rows.append(vg_df["logI"])
            curve_rows += [vg_df["smoothed"], vg_df["detilted"]]
            curve_frames.append(pd.DataFrame(curve_rows).transpose())

            if peak_v is None: peak_v = 0           # if find no peak V

            # add text filename & features to signal list
            signal_lst.append([filename, round(peak_signal, 4), round(peak_curvature, 4), round(peak_v, 4), round(file_vcenter, 4), ph, round(signal_mean, 4), round(signal_std, 4),\
                              round(dS_dV_max_peak, 4), round(dS_dV_min_peak, 4), round(dS_dV_peak_diff, 4), round(dS_dV_max_V, 4), round(dS_dV_min_V, 4), round(dS_dV_area, 4)])

            # group peak signals by concentration
            conc_dict.setdefault(conc, []).append((peak_signal, peak_v))

        signal_df = pd.DataFrame(signal_lst)
        dfxl = pd.concat(curve_frames) if curve_frames else pd.DataFrame()

        # Concentrations in ascending numeric order; each one is compared
        # with its lower neighbour (the lowest is compared with itself).
        concs_targetlst = sorted(conc_dict, key=float)
        conc_list = []
        for position, key in enumerate(concs_targetlst):
            vals       = conc_dict[key]                                  # all the signals for conc
            avgval     = round(np.average([val[0] for val in vals]), 2)  # avg signal for conc
            stdval     = round(np.std([val[0] for val in vals]), 2)      # std of signals for conc
            avgpeakval = round(np.average([val[1] for val in vals]), 2)  # avg peak voltage for conc
            stdpeakval = round(np.std([val[1] for val in vals]), 2)      # std of peak voltage for conc

            # if average is 0, make CV 0
            cvval = round(stdval / avgval, 3) if avgval != 0 else 0

            concstr = str(float(key)) + " \u03BCM"
            lowervals = vals if position == 0 else conc_dict[concs_targetlst[position - 1]]
            ttest = round(stats.ttest_ind([val[0] for val in vals], [val[0] for val in lowervals], equal_var=False)[0], 2)
            conc_list.append([concstr, avgval, stdval, cvval, ttest, avgpeakval, stdpeakval])  # add stats for conc

        conc_df = pd.DataFrame(conc_list)

        # save stats list to excel
        stats_str     = "stats" + data_str
        signal_str    = f"{savefilename}.xlsx"
        dataframe_str = "dataframe" + data_str

        conc_df.to_excel(stats_str, index=False,
                         header=["conc", "average", "std", "CV", "T-Statistic", "avg peak", "std peak"])
        signal_df.to_excel(signal_str, index=False,
                           header=["file", "peak area", "peak curvature", "peak V", "vcenter", "PH", "signal_mean", "signal_std", \
                                  "dS_dV_max_peak", "dS_dV_min_peak", "dS_dV_peak_diff", "dS_dV_max_V", "dS_dV_min_V", "dS_dV_area"])  # save signal list to excel
        if do_log:
            dfxl.to_excel(dataframe_str, index=False,
                          header=["conc", "replicate", "V", "I", "logI", "smoothed", "detilted"])
        else:
            dfxl.to_excel(dataframe_str, index=False, header=["conc", "replicate", "V", "I", "smoothed", "detilted"])
    finally:
        os.chdir(original_cwd)

    return signal_df

In [None]:
# Feature-extraction settings used when inspecting single files below
do_log        = True
recenter      = False
smoothing_bw  = 0.006
stiffness     = 0
vcenter       = 1.04
vwidth1       = 0.15

indx          = 0
# glob returns files in arbitrary order; sort so that all_files[indx]
# refers to the same file on every run.
all_files     = sorted(glob('/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/*.txt'))

In [None]:
# Step to the next file each time this cell is run
indx += 1

# The detected centre is unpacked into `vcenter_found` so that the configured
# `vcenter` setting is not overwritten by running this cell.
(peak_signal, peak_curvature, peak_v, df, vcenter_found, ph, signal_mean, signal_std,\
                dS_dV_max_peak, dS_dV_min_peak, dS_dV_peak_diff, dS_dV_max_V, \
                dS_dV_min_V, dS_dV_area) = v2signal_extra_features(all_files[indx],
                                                                    do_log,
                                                                    smoothing_bw,
                                                                    vcenter,
                                                                    vwidth1,
                                                                    stiffness,
                                                                    noise='none')


print(indx, os.path.basename(all_files[indx]), ph, peak_curvature)

# Rejected files (negative current) come back without a detilted curve.
if isinstance(df, pd.DataFrame) and 'detilted' in df:
    plt.plot(df['detilted'])
else:
    print('No detilted signal available for this file; nothing to plot.')

In [3]:
# Load the ML1, ML2 and ML4 datasets with one shared set of options
normalization    = False
standardize_type = 'none' if not normalization else 'mean_std'
split            = True
combat_norm      = False
showFileName     = True

# Keyword arguments common to every load_dataset call below
load_options = dict(normalization=normalization,
                    standardize_type=standardize_type,
                    split=split,
                    showFileName=showFileName)

ml1_dir = '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1'
ml2_dir = '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_22_ML2'
ml4_dir = '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML4'

if not split:
    # Unsplit mode: one (X, y) pair per dataset
    ML1_X, ML1_y = load_dataset(ml1_dir, **load_options)
    ML2_X, ML2_y = load_dataset(ml2_dir, **load_options)
    ML4_X, ML4_y = load_dataset(ml4_dir, **load_options)

else:
    # Split mode: train/test partitions; the second returned item is discarded
    (ML1_X_train, ML1_X_test, ML1_y_train, ML1_y_test), _  = load_dataset(ml1_dir, **load_options)
    (ML2_X_train, ML2_X_test, ML2_y_train, ML2_y_test), _  = load_dataset(ml2_dir, **load_options)
    (ML4_X_train, ML4_X_test, ML4_y_train, ML4_y_test), _  = load_dataset(ml4_dir, **load_options)

######Data Distribution:#########
Training {'2024_02_19_cbz16_34.txt': 1, '2024_02_19_cbz16_28.txt': 1, '2024_02_19_cbz16_21.txt': 1, '2024_02_19_cbz00_35.txt': 1, '2024_02_19_cbz08_03.txt': 1, '2024_02_19_cbz16_41.txt': 1, '2024_02_19_cbz00_41.txt': 1, '2024_02_19_cbz16_20.txt': 1, '2024_02_19_cbz16_02.txt': 1, '2024_02_19_cbz16_16.txt': 1, '2024_02_19_cbz08_04.txt': 1, '2024_02_19_cbz00_38.txt': 1, '2024_02_19_cbz16_07.txt': 1, '2024_02_19_cbz00_28.txt': 1, '2024_02_19_cbz00_34.txt': 1, '2024_02_19_cbz00_01.txt': 1, '2024_02_19_cbz08_21.txt': 1, '2024_02_19_cbz00_22.txt': 1, '2024_02_19_cbz08_37.txt': 1, '2024_02_19_cbz00_25.txt': 1, '2024_02_19_cbz00_29.txt': 1, '2024_02_19_cbz08_45.txt': 1, '2024_02_19_cbz16_13.txt': 1, '2024_02_19_cbz16_37.txt': 1, '2024_02_19_cbz16_33.txt': 1, '2024_02_19_cbz16_10.txt': 1, '2024_02_19_cbz08_32.txt': 1, '2024_02_19_cbz00_31.txt': 1, '2024_02_19_cbz08_27.txt': 1, '2024_02_19_cbz00_21.txt': 1, '2024_02_19_cbz08_25.txt': 1, '2024_02_19_cbz08_42.txt':

In [4]:
def _resolved_paths(file_names, folder):
    """Prefix each file name with `folder` and resolve it to a real path."""
    return file_names.apply(lambda name: os.path.realpath(folder + name)).values.tolist()


# Folder of each dataset, in the order ML1, ML2, ML4
_dataset_folders = ['../dataset/ML1_ML2/2024_02_19_ML1/',
                    '../dataset/ML1_ML2/2024_02_22_ML2/',
                    '../dataset/ML4/']

# One list of resolved file paths per dataset
training_data_file = [_resolved_paths(names, folder)
                      for names, folder in zip([ML1_y_train, ML2_y_train, ML4_y_train], _dataset_folders)]
testing_data_file  = [_resolved_paths(names, folder)
                      for names, folder in zip([ML1_y_test, ML2_y_test, ML4_y_test], _dataset_folders)]

In [5]:
# Sanity check: show the first resolved path of the first dataset
training_data_file[0][0]

'/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz00_16.txt'

In [12]:
# Per-dataset feature tables returned by run_vg2, in the order ML1, ML2, ML4
all_df = []

do_log        = True
recenter      = False
smoothing_bw  = 0.006
stiffness     = 0
vcenter       = 1.04
vwidth1       = 0.15
vwidth2       = None

for data in testing_data_file:
    df = run_vg2(data,
                do_log=do_log,
                recenter=recenter, 
                smoothing_bw=smoothing_bw, 
                stiffness=stiffness, 
                vcenter=vcenter, 
                vwidth1=vwidth1, 
                vwidth2=vwidth2,
                noise='',
                clip_signal=False,
                savefilename='feature_extraction_vwidth_0.15_testing')
    # Keep every dataset's result; previously only the last one survived in `df`.
    all_df.append(df)
    

In [9]:
# Display every resolved training path (one inner list per dataset)
training_data_file

[['/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz00_16.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz08_08.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz08_41.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz00_12.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz16_13.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz16_25.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz16_15.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz16_02.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramreg/dataset/ML1_ML2/2024_02_19_ML1/2024_02_19_cbz00_05.txt',
  '/Users/sangam/Desktop/Epilepsey/Code/vgramr