In [43]:
import h5py
import numpy as np
import pandas as pd

# Load the feature matrix and the feature names from the .Mat file.
# The file is opened through h5py, so it is expected to be HDF5-based.
with h5py.File('RespCoupling.Mat', 'r') as file:
    # Read the full dataset into a NumPy array. `dset[()]` is the h5py idiom
    # for this and avoids routing the read through np.array(), which is the
    # line that emitted a warning in the recorded output of this cell.
    data_matrix = file['fArray'][()]

    # 'fNames' holds one object reference per feature; each reference points
    # to a dataset of character codes that spells the feature name.
    feature_names = []
    for ref in file['fNames']:
        # Dereference the object reference
        name_data = file[ref[0]]
        # Join the character codes of the first column into a Python string
        name = ''.join(chr(code) for code in name_data[:, 0])
        feature_names.append(name)

# Create a DataFrame; the matrix is transposed so that each feature name
# labels one column.
resp_df = pd.DataFrame(data_matrix.T, columns=feature_names)

# Display the first few rows and info of the DataFrame
print(resp_df.head())
# The original literal was split by a backslash line-continuation, which
# silently dropped the intended leading newline.
print("\nDataFrame Info:")
resp_df.info()

# Save the DataFrame to a CSV file
csv_filename = 'RespCoupling.csv'
resp_df.to_csv(csv_filename, index=False)
print(f"\nData saved to {csv_filename}")

print("Done")



  data_matrix = np.array(file['fArray'])


    SN  Label  breathNumber  taskOrder  loadOrderCT  timeOfBreath   Age  Sex  \
0  3.0    1.0           2.0        2.0          1.0      3.527500  27.0  0.0   
1  3.0    1.0           3.0        2.0          1.0      6.355000  27.0  0.0   
2  3.0    1.0           4.0        2.0          1.0      9.388333  27.0  0.0   
3  3.0    1.0           5.0        2.0          1.0     13.041667  27.0  0.0   
4  3.0    1.0           6.0        2.0          1.0     16.673333  27.0  0.0   

   Height  Weight  ...  lagCoef_{0.50}  lagCoef_{0.75}     % RCi  TP_{RC,0}  \
0   158.0    52.9  ...       -0.216789       -0.160364  0.668837   0.541774   
1   158.0    52.9  ...       -0.274332       -0.205534  0.724171   0.000000   
2   158.0    52.9  ...       -0.302754       -0.206114  0.753504   0.512530   
3   158.0    52.9  ...        0.086828        0.031575  0.438083   0.488389   
4   158.0    52.9  ...        0.102044        0.044175  0.465689   0.143559   

    TP_{AB,0}   TP_{RC,0.10}  TP_{AB,0.10}  

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the data
df = pd.read_csv('RespCoupling.csv')

# 1. Basic information about the dataset
print("1. Basic Information about the Dataset:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only adds a stray "None" line.
df.info()

# 2. Summary statistics
print("\n2. Summary Statistics:")
print(df.describe())

# 3. Check for missing values
print("\n3. Missing Values:")
print(df.isnull().sum())

# Replace column names.
# The first pattern used to be written as a string split by a backslash
# line-continuation, which made it match just 'abla '; the LaTeX-style
# '\nabla ' prefix only disappeared because the later '\n' removal cleaned up
# the leftover. The patterns are now spelled out as explicit literals
# (regex=False) so the result does not depend on the pandas default for `regex`.
df.columns = (
    df.columns
    .str.replace('\\nabla ', 'del_', regex=False)  # literal backslash + "nabla "
    .str.replace('\t', '', regex=False)            # tab characters
    .str.replace('\\n', '', regex=False)           # literal backslash + "n"
    .str.strip()
)

print("Column names have been updated.")
print("\nUpdated column names:")
print(df.columns.tolist())

print("Done")

# Replace +/-inf with NaN. Assign the result back instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place
# form raises a FutureWarning and stops modifying `df` in pandas 3.0.
df['Nicottine Use'] = df['Nicottine Use'].replace([np.inf, -np.inf], np.nan)

# Define a function to remove outliers using the IQR method with adjustable factors
def remove_outliers_iqr(df, column, lower_factor=1.5, upper_factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - lower_factor * IQR`` and ``Q3 + upper_factor * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``. Both fences
    are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column the fences are computed on.
    lower_factor, upper_factor : float
        Multipliers of the IQR for the lower and upper fence.

    Returns
    -------
    pandas.DataFrame
        The subset of rows that pass both comparisons (original index kept).
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - lower_factor * spread
    fence_high = third_quartile + upper_factor * spread

    # Keep a row only when it is inside both (inclusive) fences.
    inside = values.ge(fence_low) & values.le(fence_high)
    return df[inside]

# Columns whose extreme values are trimmed with the IQR rule
columns_to_clean = ['work', 'PTP', 'workPerLiter']

# Row count before any filtering, used for the retention report below
original_shape = len(df)

# Filter sequentially: each pass computes its fences on the rows that
# survived the previous pass.
for column in columns_to_clean:
    df = remove_outliers_iqr(df, column)

# Report how many rows were kept
print(
    f"Original dataset shape: {original_shape}",
    f"Cleaned dataset shape: {df.shape[0]}",
    f"Percentage of data retained: {df.shape[0] / original_shape * 100:.2f}%",
    sep="\n",
)

# Persist the filtered rows to a new CSV file
cleaned_file_name = 'RespCoupling_cleaned_v2.csv'
df.to_csv(cleaned_file_name, index=False)
print(f"Cleaned dataset saved as '{cleaned_file_name}'")

# 4. Distribution of 'work' variable
plt.figure(figsize=(10, 6))
sns.histplot(df['work'], kde=True)
plt.title('Distribution of Work of Breathing')
plt.xlabel('Work')
plt.savefig('work_distribution.png')
plt.close()

# 5. Box plot of 'work' for each Label
plt.figure(figsize=(12, 6))
sns.boxplot(x='Label', y='work', data=df)
plt.title('Work of Breathing by Label')
plt.savefig('work_by_label_boxplot.png')
plt.close()

# 6. Correlation matrix (computed once and reused for section 10 below)
corr_matrix = df.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# 7. Pairplot of key variables
key_vars = ['work', 'PTP', 'workPerLiter', 'Volume', 'T_i']
sns.pairplot(df[key_vars], diag_kind='kde')
plt.suptitle('Pairplot of Key Variables', y=1.02)
# The suptitle is placed above the figure canvas (y > 1); a tight bounding
# box is required so it is not cropped out of the saved image.
plt.savefig('key_variables_pairplot.png', bbox_inches='tight')
plt.close()

# 8. Time series plot of work for each label
# NOTE(review): rows are drawn in frame order; if one label spans several
# recordings whose timeOfBreath restarts, the line will jump back - confirm
# whether a per-subject split or a scatter plot is wanted.
plt.figure(figsize=(12, 6))
for label in df['Label'].unique():
    subset = df[df['Label'] == label]
    plt.plot(subset['timeOfBreath'], subset['work'], label=f'Label {label}')
plt.title('Work Over Time for Different Labels')
plt.xlabel('Time of Breath')
plt.ylabel('Work')
plt.legend()
plt.savefig('work_over_time.png')
plt.close()

# 9. Additional statistics
print("\n9. Unique values in 'Label' column:")
print(df['Label'].value_counts())

print("\n10. Correlation of 'work' with other variables:")
# Reuse the matrix from section 6 instead of recomputing df.corr();
# df is not modified in between, so the values are the same.
work_corr = corr_matrix['work'].sort_values(ascending=False)
print(work_corr)

print("\nAll plots have been generated and saved.")

1. Basic Information about the Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9961 entries, 0 to 9960
Data columns (total 43 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   SN                       9961 non-null   float64
 1   Label                    9961 non-null   float64
 2   breathNumber             9961 non-null   float64
 3   taskOrder                9961 non-null   float64
 4   loadOrderCT              9961 non-null   float64
 5   timeOfBreath             9961 non-null   float64
 6   Age                      9961 non-null   float64
 7   Sex                      9961 non-null   float64
 8   Height                   9961 non-null   float64
 9   Weight                   9961 non-null   float64
 10  BMI                      9961 non-null   float64
 11  Body fat                 9961 non-null   float64
 12  Visceral fat             9961 non-null   float64
 13  Muscle                   9961 non-null

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Nicottine Use'].replace([np.inf, -np.inf], np.nan, inplace=True)


Cleaned dataset saved as 'RespCoupling_cleaned_v2.csv'

9. Unique values in 'Label' column:
Label
2.0    2034
1.0    1982
4.0    1192
3.0    1180
5.0    1150
6.0    1130
Name: count, dtype: int64

10. Correlation of 'work' with other variables:
work                   1.000000
PTP                    0.879341
workPerLiter           0.859728
Volume                 0.382438
Label                  0.369402
C_{flow,RC}            0.349865
C_{flow,AB}            0.333556
T_i                    0.284254
MSSE_{flow}            0.209631
timeOfBreath           0.159046
breathNumber           0.142545
BMI                    0.098391
Visceral fat           0.098096
Body fat               0.092901
lagCoef_{0.25}         0.075643
% RCi                  0.073261
VR_{RC}                0.072690
loadOrderCT            0.063808
Waist Size             0.057996
Chest Circunference    0.057401
Age                    0.049693
lagCoef_{0.50}         0.049324
C_{del_RC,del_AB}      0.047335
Weight             

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Assuming 'work' is our target variable for regression.
# Identifier / ordering / timing columns are dropped from the predictors.
# NOTE(review): 'PTP' and 'workPerLiter' correlate ~0.88 / ~0.86 with 'work'
# in the correlation printout of the previous cell and may be derived from the
# target - confirm they are legitimate predictors rather than leakage.
X = df.drop(['work', 'SN', 'Label', 'breathNumber', 'taskOrder', 'loadOrderCT', 'timeOfBreath'], axis=1)
y = df['work']

# Convert categorical variables to numeric (one indicator column per level)
X = pd.get_dummies(X, columns=['Sex', 'Nicottine Use'])

# Split the data
# NOTE(review): the split is per row (per breath); if rows of one subject
# ('SN') must not appear in both sets, a grouped split is needed - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features; the scaler is fitted on the training part only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform LASSO for feature selection
lasso = Lasso(alpha=0.1, random_state=42)
selector = SelectFromModel(lasso, prefit=False)
selector.fit(X_train_scaled, y_train)

# Get selected feature names (boolean support mask applied to the column index)
selected_features = X.columns[selector.get_support()].tolist()

print("Selected features by LASSO:")
print(selected_features)
# The original literals were split by a backslash line-continuation, which
# silently dropped the intended leading newline.
print("\nNumber of selected features:", len(selected_features))

# Create a dataframe with feature importances
# (absolute LASSO coefficients on the standardized features)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(selector.estimator_.coef_)
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importance.head(10))

Selected features by LASSO:
['Muscle', 'C_{flow,AB}', 'MSSE_{RC}', 'MSSE_{AB}', 'T_i', 'Volume', '% RCi', 'PTP', 'workPerLiter']
Number of selected features: 9
Top 10 most important features:
         feature  importance
33  workPerLiter    3.271022
32           PTP    2.625097
23        Volume    2.377092
22           T_i    0.435593
6         Muscle    0.347831
18     MSSE_{RC}    0.250786
19     MSSE_{AB}    0.126741
12   C_{flow,AB}    0.085290
27         % RCi    0.054505
34       Sex_0.0    0.000000
