[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/repos-especializacion-UdeA/data-raw/blob/main/notebooks/03_features_extraction.ipynb)

# Extracción de características

El siguiente notebook carga el dataset filtrado (archivo CSV) con las señales EMG, lo segmenta en ventanas de tiempo y extrae características de cada ventana (por ahora, únicamente el valor RMS de cada electrodo).

In [12]:
try:
    import scipy.io
except ImportError:
    !pip install scipy

## 1. Librerias y configuraciones previas

In [14]:
import sys
import os
import zipfile

# Location of the notebook expressed relative to the current working
# directory ("." is a relative path, not an absolute one).
notebook_path = "."
print(notebook_path)
try:
    # The import only succeeds where the google.colab package is installed
    # (presumably only inside Google Colab), so it doubles as environment detection.
    import google.colab
    # Clone the repository only when the notebooks folder is not already present.
    if not(os.path.exists("/content/data-raw/notebooks")):
        !git clone https://github.com/repos-especializacion-UdeA/data-raw.git
    # Move into the cloned notebooks folder so relative paths resolve from there.
    %cd /content/data-raw/notebooks   
    %pwd
    # Base path of the notebooks; also added to sys.path so modules stored
    # next to the notebooks can be imported.
    ruta_base = '/content/data-raw/notebooks/'
    sys.path.append(ruta_base)
except ImportError:
    # google.colab is not available: work from the current directory.
    ruta_base = './'

.


In [15]:
# command to view figures in Jupyter notebook
# %matplotlib inline 

# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np
import scipy as sc

# Almacenar en caché los resultados de funciones en el disco
# ==============================================================================
import joblib


# Gestion de librerias
# ==============================================================================
from importlib import reload

# Matemáticas y estadísticas
# ==============================================================================
import math

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

# Formateo y estilo
# ==============================================================================
from IPython.display import Markdown, display

# Biblioteca scipy y componentes
# ==============================================================================
import scipy.io
from scipy import signal


## 2. Funciones

In [49]:
# Funciones de utilidad
# ==============================================================================

# Enventanado
def segmentar_data_set(data_set,
                       window_size=None,
                       overlap_size=None):
    """Split a 13-column dataset into fixed-length, optionally overlapping windows.

    The frame is read positionally: column 0 is the subject, columns 1-10 are
    the EMG channels, column -2 is the repetition and column -1 is the label.
    A new window starts every ``window_size - overlap_size`` rows, and a window
    is kept only when its label column holds exactly one distinct value.

    Only the label is checked: a window can still contain rows from different
    subjects or repetitions when they share the same label.
    NOTE(review): confirm whether such windows should also be discarded.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame with exactly 13 columns laid out as described above.
    window_size : int
        Number of rows in each window. Required; must be positive.
    overlap_size : int, optional
        Number of rows shared by two consecutive windows. ``None`` means no
        overlap. Must be smaller than ``window_size``; a negative value leaves
        a gap between windows.

    Returns
    -------
    list of pandas.DataFrame or int
        The kept windows, each with its index reset to ``0..window_size-1``
        and the same 13 columns as ``data_set``. Returns ``-1`` (after
        printing a message) when ``data_set`` does not have 13 columns.

    Raises
    ------
    TypeError
        If ``window_size`` is ``None``.
    ValueError
        If ``window_size`` is not positive or ``overlap_size`` is not smaller
        than ``window_size`` (the window start would never advance).
    """
    if len(data_set.columns) != 13:
        print("Dataset incompleto, revise las columnas")
        return -1

    if window_size is None:
        raise TypeError("window_size es obligatorio")
    if overlap_size is None:
        overlap_size = 0
    if window_size <= 0:
        raise ValueError("window_size debe ser mayor que cero")

    step_size = window_size - overlap_size
    if step_size <= 0:
        raise ValueError("overlap_size debe ser menor que window_size")

    postura_data = data_set.iloc[:, -1]     # label

    ventanas = []
    for inicio in range(0, len(data_set) - window_size + 1, step_size):
        fin = inicio + window_size
        # Discard windows that contain a label transition.
        if postura_data.iloc[inicio:fin].nunique() == 1:
            # Subject, the 10 EMG channels, rep and label are exactly the 13
            # columns in their original order, so a plain row slice is enough.
            ventanas.append(data_set.iloc[inicio:fin].reset_index(drop=True))
    return ventanas

# Valor RMS
def rms_value(emg_values):
    """Compute the root-mean-square value of every column of a window.

    Parameters
    ----------
    emg_values : pandas.DataFrame
        Window of samples; each column is reduced independently.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) with one RMS value per input column,
        under the same column labels.

    Raises
    ------
    TypeError
        If ``emg_values`` is not a ``pandas.DataFrame``.
    """
    # Validate the type before touching the argument, so every invalid input
    # gets this message rather than an unrelated error.
    if not isinstance(emg_values, pd.DataFrame):
        raise TypeError("La ventana deben ser un dataframe")
    # Column-wise sqrt(mean(x^2)).
    rms_emg_values = emg_values.apply(lambda x: np.sqrt(np.mean(np.square(x))), axis=0)
    # Wrap the per-column Series in a list so it becomes one row, not one column.
    return pd.DataFrame([rms_emg_values])

# Valor MAV
def mav_value(emg_values):
    """Compute the mean absolute value (MAV) of every column of a window.

    Parameters
    ----------
    emg_values : pandas.DataFrame
        Window of samples; each column is reduced independently.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) with one MAV per input column.

    Raises
    ------
    TypeError
        If ``emg_values`` is not a ``pandas.DataFrame``.
    """
    if isinstance(emg_values, pd.DataFrame):
        def _mean_abs(channel):
            # Average of the rectified samples of one column.
            return np.mean(np.abs(channel))

        per_channel = emg_values.apply(_mean_abs, axis=0)
        # A list with one Series yields one row instead of one column.
        return pd.DataFrame([per_channel])
    raise TypeError("La ventana deben ser un dataframe")

def features_data_set(data_set_windows):
    """Build the feature table with one row per window.

    For now the only feature is the RMS value of each signal column. Each
    window is read positionally: column 0 is the subject, column -2 the
    repetition, column -1 the label, and the columns in between are the
    signals passed to ``rms_value``. Subject, repetition and label are taken
    from the first row of the window.

    Parameters
    ----------
    data_set_windows : list of pandas.DataFrame
        Windows such as those returned by ``segmentar_data_set``.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``s``, the signal columns (RMS
        values), ``rep`` and ``label``, indexed ``0..n-1``.

    Raises
    ------
    ValueError
        If ``data_set_windows`` is empty.
    TypeError
        If ``data_set_windows`` has no length (for instance the ``-1`` that
        ``segmentar_data_set`` returns for an invalid dataset).
    """
    if len(data_set_windows) == 0:
        raise ValueError("No hay ventanas para calcular las caracteristicas")

    # Collect plain records and build the frame once, instead of creating and
    # concatenating a pair of one-row DataFrames per window.
    filas = []
    for ventana in data_set_windows:
        fila = {'s': ventana.iloc[0, 0]}
        # rms_value returns a one-row frame; its only row holds the features.
        fila.update(rms_value(ventana.iloc[:, 1:-2]).iloc[0].to_dict())
        fila['rep'] = ventana.iloc[0, -2]
        fila['label'] = ventana.iloc[0, -1]
        filas.append(fila)
    return pd.DataFrame(filas)


**Por hacer**:
- [ ] Migrar el enventanado usando Window functions del modulo [`scipy.signal.windows`](https://docs.scipy.org/doc/scipy/reference/signal.windows.html) 

## 3. Carga del dataset

Inicialmente se verifica que el archivo csv esté disponible.

In [22]:
# Check that the compressed filtered dataset is available and unpack it

DATASETS_PATH = "./datasets/"
filter_dataset_zip = "filter_dataset.zip"
filter_dataset_csv = "filter_dataset.csv"

zip_path = DATASETS_PATH + filter_dataset_zip
if not os.path.exists(zip_path):
    print("El dataset no existe - Ejecute el notebook \"preprocess_raw_signal.ipynb\" antes de seguir")
else:
    # Unpack into the current directory.
    # NOTE(review): the CSV is later read from DATASETS_PATH, so the archive
    # presumably stores its members under "datasets/" - confirm.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(".")

Si el dataset no existe, ejecute el notebook [`preprocess_raw_signal.ipynb`](preprocess_raw_signal.ipynb) para poder seguir. 

A continuación se realiza la carga del dataset completo

In [23]:
# Carga del dataset
df = pd.read_csv(DATASETS_PATH + filter_dataset_csv)

A continuación se verifica la carga del dataset:

In [24]:
# Mostrar las primeras filas del DataFrame
#if 't' in df.columns:
#    df.drop(columns=['t'], inplace=True)
df.head()

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
0,1,0.067523,0.002402,0.002399,0.002403,0.002395,0.0098,0.0024,0.050357,0.0024,0.034192,0,0
1,1,0.066284,0.002402,0.002402,0.002404,0.002395,0.009474,0.0024,0.049707,0.0024,0.03282,0,0
2,1,0.065045,0.002403,0.002405,0.002404,0.002395,0.009149,0.0024,0.04905,0.0024,0.031452,0,0
3,1,0.06381,0.002403,0.002409,0.002405,0.002395,0.008827,0.0024,0.04839,0.0024,0.030091,0,0
4,1,0.062581,0.002404,0.002412,0.002406,0.002395,0.008507,0.0024,0.047726,0.0024,0.028743,0,0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2731393 entries, 0 to 2731392
Data columns (total 13 columns):
 #   Column  Dtype  
---  ------  -----  
 0   s       int64  
 1   emg_1   float64
 2   emg_2   float64
 3   emg_3   float64
 4   emg_4   float64
 5   emg_5   float64
 6   emg_6   float64
 7   emg_7   float64
 8   emg_8   float64
 9   emg_9   float64
 10  emg_10  float64
 11  rep     int64  
 12  label   int64  
dtypes: float64(10), int64(3)
memory usage: 270.9 MB


Hay un total de 13 columnas y ninguna tiene registros faltantes (missing values). Debido a esto, no nos tendremos que preocupar por realizar una imputación de datos. Pero hay muchos datos.

In [28]:
len(df.columns)

13

In [26]:
# Convertir a categorico
# df['s'] = pd.Categorical(df['s'])
# df['rep'] = pd.Categorical(df['rep'])
# df['label'] = pd.Categorical(df['label'])

Se verifica que los cambios en el dataframe se hayan efectuado.

In [11]:
# Lists of categorical and numeric columns of the dataframe
catCols = df.select_dtypes(include = ['object', 'category']).columns.tolist()
print(f"Variables categoricas: {catCols}")
numCols = df.select_dtypes(include = ['float64','int32','int64']).columns.tolist()
# This list holds the numeric columns, so it must not be labelled as categorical.
print(f"Variables numericas: {numCols}")

Variables categoricas: []
Variables categoricas: ['s', 'emg_1', 'emg_2', 'emg_3', 'emg_4', 'emg_5', 'emg_6', 'emg_7', 'emg_8', 'emg_9', 'emg_10', 'label']


## Extracción de caracteristicas del dataset

### Segmentacion de la señal

La segmentación consiste en extraer porciones de la señal mediante ventanas de tiempo (que pueden ir solapadas o no) para realizar un análisis local más focalizado. La siguiente figura resume el procedimiento:

![overlap_window](overlap_window.png)

### Extracción de caracteristicas

Existen diferentes características, pero por ahora solo nos centraremos en el valor RMS.

In [None]:
# Number of subjects whose data goes to the model, plus the window length
# and the overlap between consecutive windows (both counted in rows)
numero_de_sujetos = 10
w_size = 30
o_size = 10

# Keep only the rows whose subject id is within the selected range
sujetos_muestra = df.loc[df['s'] <= numero_de_sujetos]

segments = segmentar_data_set(sujetos_muestra, w_size, o_size)

Ahora vamos a verificar por encima que la segmentacion se haya realizado como se espera.

```
data_set --> [ window_0, window_1, window_2, ... window_n]  
         --> [ data_set[0:29], 
               data_set[20:49], 
               data_set[40:69], ...
             ]
```

Es importante resaltar que aquellos segmentos en los cuales hay una transición (por ejemplo, un cambio en el label) son descartados, de modo que no es de extrañar que haya menos segmentos que el resultado teórico esperado.

In [33]:
df.head(50)

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
0,1,0.067523,0.002402,0.002399,0.002403,0.002395,0.0098,0.0024,0.050357,0.0024,0.034192,0,0
1,1,0.066284,0.002402,0.002402,0.002404,0.002395,0.009474,0.0024,0.049707,0.0024,0.03282,0,0
2,1,0.065045,0.002403,0.002405,0.002404,0.002395,0.009149,0.0024,0.04905,0.0024,0.031452,0,0
3,1,0.06381,0.002403,0.002409,0.002405,0.002395,0.008827,0.0024,0.04839,0.0024,0.030091,0,0
4,1,0.062581,0.002404,0.002412,0.002406,0.002395,0.008507,0.0024,0.047726,0.0024,0.028743,0,0
5,1,0.061362,0.002404,0.002415,0.002407,0.002395,0.008192,0.0024,0.047061,0.0024,0.02741,0,0
6,1,0.060156,0.002405,0.002419,0.002407,0.002396,0.007881,0.0024,0.046395,0.0024,0.026096,0,0
7,1,0.058966,0.002406,0.002422,0.002408,0.002396,0.007576,0.0024,0.045729,0.0024,0.024805,0,0
8,1,0.057793,0.002406,0.002425,0.002409,0.002396,0.007278,0.0024,0.045064,0.0024,0.023539,0,0
9,1,0.056641,0.002407,0.002428,0.00241,0.002396,0.006986,0.0024,0.044399,0.0024,0.0223,0,0


Veamos los dos primeros segmentos

In [35]:
segments[0]

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
0,1,0.067523,0.002402,0.002399,0.002403,0.002395,0.0098,0.0024,0.050357,0.0024,0.034192,0,0
1,1,0.066284,0.002402,0.002402,0.002404,0.002395,0.009474,0.0024,0.049707,0.0024,0.03282,0,0
2,1,0.065045,0.002403,0.002405,0.002404,0.002395,0.009149,0.0024,0.04905,0.0024,0.031452,0,0
3,1,0.06381,0.002403,0.002409,0.002405,0.002395,0.008827,0.0024,0.04839,0.0024,0.030091,0,0
4,1,0.062581,0.002404,0.002412,0.002406,0.002395,0.008507,0.0024,0.047726,0.0024,0.028743,0,0
5,1,0.061362,0.002404,0.002415,0.002407,0.002395,0.008192,0.0024,0.047061,0.0024,0.02741,0,0
6,1,0.060156,0.002405,0.002419,0.002407,0.002396,0.007881,0.0024,0.046395,0.0024,0.026096,0,0
7,1,0.058966,0.002406,0.002422,0.002408,0.002396,0.007576,0.0024,0.045729,0.0024,0.024805,0,0
8,1,0.057793,0.002406,0.002425,0.002409,0.002396,0.007278,0.0024,0.045064,0.0024,0.023539,0,0
9,1,0.056641,0.002407,0.002428,0.00241,0.002396,0.006986,0.0024,0.044399,0.0024,0.0223,0,0


In [36]:
segments[1]

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
0,1,0.045784,0.002419,0.002461,0.002423,0.002402,0.004338,0.0024,0.037171,0.0024,0.010982,0,0
1,1,0.044986,0.00242,0.002464,0.002425,0.002403,0.004152,0.0024,0.036524,0.0024,0.010183,0,0
2,1,0.044221,0.002422,0.002467,0.002426,0.002404,0.003976,0.0024,0.035881,0.0024,0.009423,0,0
3,1,0.043488,0.002423,0.00247,0.002428,0.002405,0.003808,0.0024,0.035243,0.0024,0.008701,0,0
4,1,0.042789,0.002425,0.002474,0.002429,0.002406,0.003649,0.0024,0.034611,0.0024,0.008017,0,0
5,1,0.042121,0.002426,0.002477,0.002431,0.002407,0.0035,0.0024,0.033986,0.0024,0.00737,0,0
6,1,0.041485,0.002428,0.00248,0.002432,0.002408,0.003358,0.0024,0.033369,0.0024,0.00676,0,0
7,1,0.04088,0.002429,0.002484,0.002434,0.00241,0.003225,0.0024,0.032763,0.0024,0.006185,0,0
8,1,0.040305,0.002431,0.002487,0.002435,0.002411,0.0031,0.0024,0.032168,0.0024,0.005646,0,0
9,1,0.03976,0.002432,0.002491,0.002437,0.002413,0.002984,0.0024,0.031586,0.0024,0.00514,0,0


In [38]:
df.iloc[40:70]

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
40,1,0.035728,0.002449,0.002534,0.002452,0.002435,0.002151,0.0024,0.026515,0.0024,0.001529,0,0
41,1,0.03554,0.00245,0.002538,0.002453,0.002438,0.00211,0.0024,0.026208,0.0024,0.001349,0,0
42,1,0.035379,0.002451,0.002542,0.002454,0.00244,0.002073,0.0024,0.02593,0.0024,0.00119,0,0
43,1,0.035246,0.002452,0.002546,0.002455,0.002443,0.002041,0.0024,0.025682,0.0024,0.001049,0,0
44,1,0.035139,0.002453,0.002549,0.002455,0.002446,0.002013,0.0024,0.025462,0.0024,0.000926,0,0
45,1,0.035058,0.002454,0.002553,0.002455,0.002449,0.001988,0.0024,0.025271,0.0024,0.00082,0,0
46,1,0.035,0.002455,0.002556,0.002456,0.002452,0.001967,0.0024,0.025109,0.0024,0.00073,0,0
47,1,0.034966,0.002455,0.002559,0.002455,0.002455,0.00195,0.0024,0.024974,0.0024,0.000654,0,0
48,1,0.034954,0.002456,0.002562,0.002455,0.002458,0.001936,0.0024,0.024865,0.0024,0.000592,0,0
49,1,0.034961,0.002456,0.002564,0.002455,0.002461,0.001924,0.0024,0.024782,0.0024,0.000544,0,0


In [39]:
segments[2]

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
0,1,0.035728,0.002449,0.002534,0.002452,0.002435,0.002151,0.0024,0.026515,0.0024,0.001529,0,0
1,1,0.03554,0.00245,0.002538,0.002453,0.002438,0.00211,0.0024,0.026208,0.0024,0.001349,0,0
2,1,0.035379,0.002451,0.002542,0.002454,0.00244,0.002073,0.0024,0.02593,0.0024,0.00119,0,0
3,1,0.035246,0.002452,0.002546,0.002455,0.002443,0.002041,0.0024,0.025682,0.0024,0.001049,0,0
4,1,0.035139,0.002453,0.002549,0.002455,0.002446,0.002013,0.0024,0.025462,0.0024,0.000926,0,0
5,1,0.035058,0.002454,0.002553,0.002455,0.002449,0.001988,0.0024,0.025271,0.0024,0.00082,0,0
6,1,0.035,0.002455,0.002556,0.002456,0.002452,0.001967,0.0024,0.025109,0.0024,0.00073,0,0
7,1,0.034966,0.002455,0.002559,0.002455,0.002455,0.00195,0.0024,0.024974,0.0024,0.000654,0,0
8,1,0.034954,0.002456,0.002562,0.002455,0.002458,0.001936,0.0024,0.024865,0.0024,0.000592,0,0
9,1,0.034961,0.002456,0.002564,0.002455,0.002461,0.001924,0.0024,0.024782,0.0024,0.000544,0,0


En efecto, al parecer la segmentación es correcta. Veamos el numero de segmentos resultante:

In [40]:
len(segments)

46925

#### Valor RMS

Por ahora la única característica a emplear es el valor RMS de la señal de cada electrodo, calculado sobre cada una de las ventanas.

$$\text{RMS} = \sqrt{\frac{1}{N} \sum_{i=1}^{N} x_i^2} $$

Donde:
* $x_i$: es cada valor individual de la señal dentro de la ventana.
* $N$: es el número total de muestras en la ventana.

In [56]:
# s = [segments[0].copy(),segments[1].copy()]
# len(s)
df_features_dataset = features_data_set(segments)

In [57]:
df_features_dataset.head()

Unnamed: 0,s,emg_1,emg_2,emg_3,emg_4,emg_5,emg_6,emg_7,emg_8,emg_9,emg_10,rep,label
0,1,0.05251,0.002414,0.002445,0.002417,0.0024,0.006204,0.0024,0.041218,0.0024,0.019526,0,0
1,1,0.038543,0.00244,0.002513,0.002443,0.002426,0.002803,0.0024,0.029789,0.0024,0.005035,0,0
2,1,0.035662,0.002448,0.002564,0.002446,0.002478,0.001975,0.0024,0.025287,0.0024,0.000813,0,0
3,1,0.037038,0.002425,0.002542,0.00242,0.002526,0.002129,0.0024,0.026216,0.0024,0.001485,0,0
4,1,0.035718,0.002404,0.002478,0.002401,0.002542,0.002346,0.0024,0.026433,0.0024,0.002234,0,0


In [58]:
df_features_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46925 entries, 0 to 46924
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   s       46925 non-null  int64  
 1   emg_1   46925 non-null  float64
 2   emg_2   46925 non-null  float64
 3   emg_3   46925 non-null  float64
 4   emg_4   46925 non-null  float64
 5   emg_5   46925 non-null  float64
 6   emg_6   46925 non-null  float64
 7   emg_7   46925 non-null  float64
 8   emg_8   46925 non-null  float64
 9   emg_9   46925 non-null  float64
 10  emg_10  46925 non-null  float64
 11  rep     46925 non-null  int64  
 12  label   46925 non-null  int64  
dtypes: float64(10), int64(3)
memory usage: 4.7 MB


In [59]:
df_features_dataset.isna().sum()

s         0
emg_1     0
emg_2     0
emg_3     0
emg_4     0
emg_5     0
emg_6     0
emg_7     0
emg_8     0
emg_9     0
emg_10    0
rep       0
label     0
dtype: int64

#### Otras caracteristicas

Esta parte queda como trabajo futuro

### Guardado del archivo

In [60]:
# Export the feature table to CSV, unless a file with that name already exists.
features_dataset_name = "features_data_set"
features_dataset_csv = features_dataset_name + ".csv"
dest_dir_datasets = "./datasets/"
features_dataset_path = dest_dir_datasets + features_dataset_csv
if not(os.path.exists(features_dataset_path)):
    # The file does not exist yet: write the dataframe to CSV.
    print(f"Generando archivo {features_dataset_csv}")
    # Make sure the destination folder is there before writing.
    os.makedirs(dest_dir_datasets, exist_ok=True)
    df_features_dataset.to_csv(features_dataset_path, index=False)
    stat_dataset = os.stat(features_dataset_path)
    # Report the size of the file just written (not of the input dataset).
    print(f"Tamaño del dataset {features_dataset_csv}: {stat_dataset.st_size/((1024 * 1024))} MB")
else:
    # An existing file is never overwritten; delete it to regenerate the features.
    print("No se hace nada el archivo ya existe")

Generando archivo features_data_set.csv
Tamaño del dataset filter_dataset.csv: 9.554224967956543 MB


## Referencias

* https://github.com/chuawt/eda-starter
* https://www.kaggle.com/code/bextuychiev/my-6-part-powerful-eda-template
* https://community.ibm.com/community/user/ai-datascience/blogs/shivam-solanki1/2020/02/19/eda-exploratory-data-analysis-with-example-in-jupy
* https://github.com/Saba-Gul/Exploratory-Data-Analysis-and-Statistical-Analysis-Notebooks
* https://www.datacamp.com/es/tutorial/pandas-profiling-ydata-profiling-in-python-guide
* https://docs.profiling.ydata.ai/latest/
* https://github.com/Saba-Gul/Exploratory-Data-Analysis-and-Statistical-Analysis-Notebooks/blob/main/Statistics_for_ML.ipynb
* https://github.com/Saba-Gul/Exploratory-Data-Analysis-and-Statistical-Analysis-Notebooks/blob/main/Online_Ed_Adaptability.ipynb
* https://github.com/Saba-Gul/Exploratory-Data-Analysis-and-Statistical-Analysis-Notebooks/blob/main/Heart_Failure_Survival_Classification.ipynb
* https://github.com/akueisara/audio-signal-processing/blob/master/week%204/A4/A4Part2.py
* https://docs.profiling.ydata.ai/latest/