In [15]:
import h5py
import numpy as np
import pandas as pd

with h5py.File('RespCoupling.Mat', 'r') as file:
    # Extract the data matrix
    data_matrix = np.array(file['fArray'])
    
    # Extract the feature names
    feature_names = []
    for ref in file['fNames']:
        # Dereference the object reference
        name_data = file[ref[0]]
        # Convert the uint16 array to a string
        name = ''.join(chr(i[0]) for i in name_data[:])
        feature_names.append(name)

# Create a DataFrame
resp_df = pd.DataFrame(data_matrix.T, columns=feature_names)

# Display the first few rows and info of the DataFrame
print(resp_df.head())
print("\
DataFrame Info:")
resp_df.info()

# Save the DataFrame to a CSV file
csv_filename = 'RespCoupling.csv'
resp_df.to_csv(csv_filename, index=False)
print(f"\
Data saved to {csv_filename}")

print("Done")

  data_matrix = np.array(file['fArray'])


    SN  Label  breathNumber  taskOrder  loadOrderCT  timeOfBreath   Age  Sex  \
0  3.0    1.0           2.0        2.0          1.0      3.527500  27.0  0.0   
1  3.0    1.0           3.0        2.0          1.0      6.355000  27.0  0.0   
2  3.0    1.0           4.0        2.0          1.0      9.388333  27.0  0.0   
3  3.0    1.0           5.0        2.0          1.0     13.041667  27.0  0.0   
4  3.0    1.0           6.0        2.0          1.0     16.673333  27.0  0.0   

   Height  Weight  ...  lagCoef_{0.50}  lagCoef_{0.75}     % RCi  TP_{RC,0}  \
0   158.0    52.9  ...       -0.216789       -0.160364  0.668837   0.541774   
1   158.0    52.9  ...       -0.274332       -0.205534  0.724171   0.000000   
2   158.0    52.9  ...       -0.302754       -0.206114  0.753504   0.512530   
3   158.0    52.9  ...        0.086828        0.031575  0.438083   0.488389   
4   158.0    52.9  ...        0.102044        0.044175  0.465689   0.143559   

    TP_{AB,0}   TP_{RC,0.10}  TP_{AB,0.10}  

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the data
df = pd.read_csv('RespCoupling.csv')

# 1. Basic information about the dataset
print("1. Basic Information about the Dataset:")
print(df.info())

# 2. Summary statistics
print("\n2. Summary Statistics:")
print(df.describe())

# 3. Check for missing values
print("\n3. Missing Values:")
print(df.isnull().sum())

#replace column names
df.columns = df.columns.str.replace('\
abla ', 'del_', regex=True)
df.columns = df.columns.str.replace('\t', '', regex=True)
df.columns = df.columns.str.replace("\\n", '')
df.columns = df.columns.str.strip()

print("Column names have been updated.")
print("\
Updated column names:")
print(df.columns.tolist())

print("Done")

# 4. Distribution of 'work' variable
plt.figure(figsize=(10, 6))
sns.histplot(df['work'], kde=True)
plt.title('Distribution of Work of Breathing')
plt.xlabel('Work')
plt.savefig('work_distribution.png')
plt.close()

# 5. Box plot of 'work' for each Label
plt.figure(figsize=(12, 6))
sns.boxplot(x='Label', y='work', data=df)
plt.title('Work of Breathing by Label')
plt.savefig('work_by_label_boxplot.png')
plt.close()

# 6. Correlation matrix
corr_matrix = df.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

# 7. Pairplot of key variables
key_vars = ['work', 'PTP', 'workPerLiter', 'Volume', 'T_i']
sns.pairplot(df[key_vars], diag_kind='kde')
plt.suptitle('Pairplot of Key Variables', y=1.02)
plt.savefig('key_variables_pairplot.png')
plt.close()

# 8. Time series plot of work for each label
plt.figure(figsize=(12, 6))
for label in df['Label'].unique():
    subset = df[df['Label'] == label]
    plt.plot(subset['timeOfBreath'], subset['work'], label=f'Label {label}')
plt.title('Work Over Time for Different Labels')
plt.xlabel('Time of Breath')
plt.ylabel('Work')
plt.legend()
plt.savefig('work_over_time.png')
plt.close()

# 9. Additional statistics
print("\n9. Unique values in 'Label' column:")
print(df['Label'].value_counts())

print("\n10. Correlation of 'work' with other variables:")
work_corr = df.corr()['work'].sort_values(ascending=False)
print(work_corr)

print("\nAll plots have been generated and saved.")

1. Basic Information about the Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9961 entries, 0 to 9960
Data columns (total 43 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   SN                       9961 non-null   float64
 1   Label                    9961 non-null   float64
 2   breathNumber             9961 non-null   float64
 3   taskOrder                9961 non-null   float64
 4   loadOrderCT              9961 non-null   float64
 5   timeOfBreath             9961 non-null   float64
 6   Age                      9961 non-null   float64
 7   Sex                      9961 non-null   float64
 8   Height                   9961 non-null   float64
 9   Weight                   9961 non-null   float64
 10  BMI                      9961 non-null   float64
 11  Body fat                 9961 non-null   float64
 12  Visceral fat             9961 non-null   float64
 13  Muscle                   9961 non-null

  sqr = _ensure_numeric((avg - values) ** 2)
  diff_b_a = subtract(b, a)



9. Unique values in 'Label' column:
Label
2.0    2036
1.0    1987
3.0    1549
4.0    1499
6.0    1463
5.0    1427
Name: count, dtype: int64

10. Correlation of 'work' with other variables:
work                   1.000000
workPerLiter           0.857577
PTP                    0.837147
Volume                 0.513891
C_{flow,AB}            0.354106
Label                  0.340345
C_{flow,RC}            0.334530
BMI                    0.313816
Visceral fat           0.300882
Weight                 0.252271
Waist Size             0.237276
Resting Metabolism     0.230059
MSSE_{flow}            0.217893
T_i                    0.213887
Chest Circunference    0.199436
timeOfBreath           0.174839
Nicottine Use          0.172101
% RCi                  0.165509
Sex                    0.138751
VR_{RC}                0.115023
Body fat               0.109353
lagCoef_{0.25}         0.091027
Height                 0.073719
lagCoef_{0.50}         0.042106
Age                    0.037467
breathNumb

KeyError: "['C_{flow,\\t\\nabla RC}', 'C_{flow,\\t\\nabla AB}', 'C_{\\nabla RC,\\nabla AB}'] not in index"