### Libraries

In [11]:
import numpy as np

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error


### Dataset

In [5]:
# Load the simulated training and test arrays from disk
train_data = np.load('simu_20000_0.1_90_140_train.npy')
test_data = np.load('simu_10000_0.1_141_178_test.npy')

# Collect each array's shape and its first row for a quick sanity check
train_shape, test_shape = train_data.shape, test_data.shape
train_data_sample, test_data_sample = train_data[0], test_data[0]

# Leave the summary tuple as the cell's last expression so it is displayed
train_shape, test_shape, train_data_sample, test_data_sample


((20000, 1006),
 (10000, 1006),
 array([-2.45845714e-07, -2.06162897e-07,  1.56348382e-06, ...,
         1.90000000e+01,  9.10000000e+01,  9.50000000e+01]),
 array([-7.14960675e-08, -3.61296976e-08,  3.41059619e-07, ...,
         1.10000000e+01,  1.72000000e+02,  8.30000000e+01]))

In [6]:
# Separate the model inputs from the two regression targets.
# Inputs are the first 1000 columns of every row; the systolic target is read
# from the second-to-last column and the diastolic target from the last one.
X_train, y_train_systolic, y_train_diastolic = (
    train_data[:, :1000],
    train_data[:, -2],
    train_data[:, -1],
)
X_test, y_test_systolic, y_test_diastolic = (
    test_data[:, :1000],
    test_data[:, -2],
    test_data[:, -1],
)

# Display every shape to verify the split (last expression of the cell)
(
    X_train.shape,
    y_train_systolic.shape,
    y_train_diastolic.shape,
    X_test.shape,
    y_test_systolic.shape,
    y_test_diastolic.shape,
)


((20000, 1000), (20000,), (20000,), (10000, 1000), (10000,), (10000,))

In [7]:
# Apply PCA to reduce the 1000 input columns to 100 principal components.
# random_state is fixed because, depending on the scikit-learn version,
# svd_solver='auto' may select a randomized solver for this problem size;
# without a seed the components (and every score built on them) could differ
# between runs of the notebook.
pca = PCA(n_components=100, random_state=42)

# Fit the projection on the training inputs only, then reuse the same fitted
# projection for the test inputs so no test information leaks into the fit.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


### Ridge Regression

In [8]:
# One Ridge regressor per target, both with the same L2 penalty (alpha=1.0).
# NOTE(review): the Ridge penalty is not scale-invariant. The sample rows
# displayed in the dataset cell show input values of order 1e-7, so alpha=1.0
# may dominate the fit and push predictions towards a constant -- consider
# standardizing the inputs or tuning alpha; confirm against the data.
ridge_systolic, ridge_diastolic = Ridge(alpha=1.0), Ridge(alpha=1.0)

# Fit each model on the PCA-reduced training inputs
ridge_systolic.fit(X_train_pca, y_train_systolic)
ridge_diastolic.fit(X_train_pca, y_train_diastolic)

In [12]:
# Predict on the test set with the fitted Ridge models (one pass per target)
y_pred_systolic_ridge = ridge_systolic.predict(X_test_pca)
y_pred_diastolic_ridge = ridge_diastolic.predict(X_test_pca)

# The original cell ran the same predictions twice under two sets of names.
# Keep the older names as aliases so any cell that still refers to them keeps
# working, without paying for a second identical prediction pass.
y_predict_systolic = y_pred_systolic_ridge
y_predict_diastolic = y_pred_diastolic_ridge

# Calculate the MAE for both systolic and diastolic predictions
mae_systolic_ridge = mean_absolute_error(y_test_systolic, y_pred_systolic_ridge)
mae_diastolic_ridge = mean_absolute_error(y_test_diastolic, y_pred_diastolic_ridge)

# Last expression of the cell: displayed as (systolic MAE, diastolic MAE)
mae_systolic_ridge, mae_diastolic_ridge

(44.53694702295776, 10.205348634704755)

- These error values are higher than with Gradient Boosting, because Ridge Regression is a simpler model and we've also reduced the dimensionality of the data. However, the training time is significantly reduced with this approach.

- To balance accuracy and training time, we can experiment with different numbers of principal components in PCA or try other dimensionality reduction techniques.
- Using TensorFlow or other deep learning libraries, neural networks could potentially provide better accuracy without being overly time-consuming.

### Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression

# Fit one ordinary least-squares model per target on the full 1000-column
# inputs (no PCA in this baseline)
linreg_systolic = LinearRegression().fit(X_train, y_train_systolic)
linreg_diastolic = LinearRegression().fit(X_train, y_train_diastolic)

# Test-set predictions for each target
y_pred_systolic_linreg = linreg_systolic.predict(X_test)
y_pred_diastolic_linreg = linreg_diastolic.predict(X_test)

# Mean absolute error of each model against the held-out targets
mae_systolic_linreg = mean_absolute_error(y_test_systolic, y_pred_systolic_linreg)
mae_diastolic_linreg = mean_absolute_error(y_test_diastolic, y_pred_diastolic_linreg)

# Last expression of the cell: displayed as (systolic MAE, diastolic MAE)
mae_systolic_linreg, mae_diastolic_linreg


(25.27432752763387, 83.84633477096953)

- The error for the diastolic blood pressure is quite high compared to previous models. This suggests that the relationship between the input features and diastolic blood pressure might not be purely linear.

- The linear regression model does provide a quick baseline. We can also explore other simple models, as mentioned. For example, Support Vector Machines (SVM) for regression can capture non-linearities without being as computationally intensive as Gradient Boosting.

### SVR model

In [14]:
from sklearn.svm import SVR

# Use a 5000-row subset of the training data for faster results.
# The rows are drawn without replacement from a seeded generator so that the
# same subset (and therefore the same scores, here and in any later cell that
# reuses subset_indices) is obtained on every Restart & Run All.
subset_rng = np.random.default_rng(42)
subset_indices = subset_rng.choice(X_train.shape[0], size=5000, replace=False)
X_train_subset = X_train[subset_indices]
y_train_systolic_subset = y_train_systolic[subset_indices]
y_train_diastolic_subset = y_train_diastolic[subset_indices]

# Train RBF-kernel SVR models for systolic and diastolic pressures
svr_systolic = SVR(kernel='rbf', C=1.0, epsilon=0.2)
svr_diastolic = SVR(kernel='rbf', C=1.0, epsilon=0.2)

svr_systolic.fit(X_train_subset, y_train_systolic_subset)
svr_diastolic.fit(X_train_subset, y_train_diastolic_subset)

# Predict on the full test set
y_pred_systolic_svr = svr_systolic.predict(X_test)
y_pred_diastolic_svr = svr_diastolic.predict(X_test)

# Calculate the MAE for both systolic and diastolic predictions
mae_systolic_svr = mean_absolute_error(y_test_systolic, y_pred_systolic_svr)
mae_diastolic_svr = mean_absolute_error(y_test_diastolic, y_pred_diastolic_svr)

# Last expression of the cell: displayed as (systolic MAE, diastolic MAE)
mae_systolic_svr, mae_diastolic_svr


(36.243610709596666, 10.202642191617658)

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize the inputs: the scaler is fitted on the training inputs and the
# same fitted transform is then applied to the test inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVR per target, fitted on the scaled rows selected by
# subset_indices (the same training subset as the RBF models) for speed
svr_systolic_linear = SVR(kernel='linear', C=1.0, epsilon=0.2).fit(
    X_train_scaled[subset_indices], y_train_systolic_subset
)
svr_diastolic_linear = SVR(kernel='linear', C=1.0, epsilon=0.2).fit(
    X_train_scaled[subset_indices], y_train_diastolic_subset
)

# Test-set predictions on the scaled test inputs
y_pred_systolic_svr_linear = svr_systolic_linear.predict(X_test_scaled)
y_pred_diastolic_svr_linear = svr_diastolic_linear.predict(X_test_scaled)

# Mean absolute error of each model against the held-out targets
mae_systolic_svr_linear = mean_absolute_error(y_test_systolic, y_pred_systolic_svr_linear)
mae_diastolic_svr_linear = mean_absolute_error(y_test_diastolic, y_pred_diastolic_svr_linear)

# Last expression of the cell: displayed as (systolic MAE, diastolic MAE)
mae_systolic_svr_linear, mae_diastolic_svr_linear


(24.583030143282166, 48.70141800888808)