<a href="https://colab.research.google.com/github/sbhavya28/Customer-Churn-Prediction/blob/main/VT_Project2_ppg_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Objective:
Estimate Heart Rate (HR), Systolic Blood Pressure (SBP), and Diastolic Blood Pressure (DBP) from
smartphone-acquired signals stored in .txt files, using corresponding labels provided in a master .csv
file.

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt

In [None]:
master_df = pd.read_csv('/content/Subject_masterdata.csv')
master_df.head()

Unnamed: 0,Subject_ID,Age,Gender,Height,Weight,Pulse,SpO2,SBP,DBP,Diet
0,Sub1,23,M,180,98,82,98,128,71,Fasting since 2hrs
1,Sub2,20,M,167,69,77,98,135,91,Fasting since 2hrs
2,Sub3,21,M,190,90,82,98,125,81,Fasting since 2hrs
3,Sub4,20,F,157,47,100,98,125,78,Fasting since 2hrs
4,Sub5,21,M,175,75,75,98,121,66,Fasting since 2hrs


In [None]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Subject_ID  24 non-null     object
 1   Age         24 non-null     int64 
 2   Gender      24 non-null     object
 3   Height      24 non-null     int64 
 4   Weight      24 non-null     int64 
 5   Pulse       24 non-null     int64 
 6   SpO2        24 non-null     int64 
 7   SBP         24 non-null     int64 
 8   DBP         24 non-null     int64 
 9   Diet        24 non-null     object
dtypes: int64(7), object(3)
memory usage: 2.0+ KB


Master Data CSV: A master_data.csv file containing the mapping of each subject’s file name
with their corresponding HR, SBP, and DBP values.

In [None]:
sub1 = pd.read_csv('/content/Sub1.txt', header = None)
sub1

Unnamed: 0,0
0,0.581644
1,0.864051
2,3.322083
3,10.593866
4,36.053854
...,...
2395,176.139332
2396,175.865738
2397,175.105460
2398,175.423799


In [None]:

# --- Helper: Butterworth low-pass filter ---
def butter_lowpass_filter(data, cutoff=3.0, fs=60.0, order=4):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, data)

# --- Main signal cleaning function ---
def clean_signal(signal_array):
    signal = np.ravel(signal_array)  # flatten to 1D
    signal = pd.Series(signal).dropna().reset_index(drop=True).values  # remove NaNs

    # Apply Butterworth low-pass filter
    filtered_signal = butter_lowpass_filter(signal, cutoff=3.0, fs=60.0, order=4)

    # Normalize between 0 and 1
    min_val, max_val = np.min(filtered_signal), np.max(filtered_signal)
    normalized_signal = (filtered_signal - min_val) / (max_val - min_val + 1e-8)

    return normalized_signal

In [None]:
subjects = {}
for subject_id in range(1, 25):
    filename = f"Sub{subject_id}.txt"

    if not os.path.exists(filename):
        print(f"[!] File not found: {filename}")
        continue

    raw_signal = pd.read_csv(filename, header=None).values
    cleaned_signal = clean_signal(raw_signal)

    signal_df = pd.DataFrame([cleaned_signal], columns=[f'signal_{i}' for i in range(len(cleaned_signal))])

    # FIX: match Subject_ID like "Sub1"
    subject_key = f"Sub{subject_id}"
    subject_row = master_df[master_df['Subject_ID'] == subject_key]

    if subject_row.empty:
        print(f"[!] Metadata not found for {subject_key}")
        continue

    for col in ['Pulse', 'SBP', 'DBP']:
        if col in subject_row.columns:
            signal_df[col] = subject_row[col].values[0]

    signal_df['Subject_ID'] = subject_key

    subjects[subject_id] = signal_df


In [None]:
# === Concatenate all subjects into one DataFrame === #
merged_df = pd.concat(subjects.values(), ignore_index=True)

# === Save to CSV === #
output_filename = "final_data.csv"
merged_df.to_csv(output_filename, index=False)

print(f"✅ Merged data saved successfully to '{output_filename}'")
print("🔍 Shape of merged data:", merged_df.shape)


✅ Merged data saved successfully to 'final_data.csv'
🔍 Shape of merged data: (24, 2404)


In [None]:
df = pd.read_csv('/content/final_data.csv')
df.head()

Unnamed: 0,signal_0,signal_1,signal_2,signal_3,signal_4,signal_5,signal_6,signal_7,signal_8,signal_9,...,signal_2394,signal_2395,signal_2396,signal_2397,signal_2398,signal_2399,Pulse,SBP,DBP,Subject_ID
0,0.0,0.120789,0.239843,0.355408,0.46566,0.568742,0.662861,0.746431,0.818227,0.877515,...,0.897134,0.897943,0.898904,0.899936,0.900977,0.90198,82,128,71,Sub1
1,1.0,0.865379,0.735123,0.613327,0.503496,0.408314,0.329489,0.2677,0.222624,0.193057,...,0.059157,0.056536,0.052838,0.048231,0.042954,0.037299,77,135,91,Sub2
2,1.0,0.877297,0.757955,0.645117,0.541504,0.449245,0.36976,0.303708,0.251007,0.21092,...,0.018181,0.018132,0.017576,0.016584,0.015262,0.013731,82,125,81,Sub3
3,1.0,0.901714,0.804993,0.711158,0.621536,0.537426,0.46003,0.390355,0.329116,0.276662,...,0.050225,0.045124,0.03821,0.029777,0.020226,0.01003,100,125,78,Sub4
4,0.0,0.153089,0.301815,0.442049,0.570152,0.6832,0.779146,0.856924,0.916464,0.958624,...,0.773984,0.777236,0.781199,0.785658,0.790391,0.795182,75,121,66,Sub5


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Columns: 2404 entries, signal_0 to Subject_ID
dtypes: float64(2400), int64(3), object(1)
memory usage: 450.9+ KB


In [None]:
df.describe()

Unnamed: 0,signal_0,signal_1,signal_2,signal_3,signal_4,signal_5,signal_6,signal_7,signal_8,signal_9,...,signal_2393,signal_2394,signal_2395,signal_2396,signal_2397,signal_2398,signal_2399,Pulse,SBP,DBP
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,...,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,0.588772,0.579672,0.571277,0.564148,0.55867,0.555012,0.55311,0.552701,0.553368,0.554613,...,0.365823,0.363746,0.36197,0.360586,0.359632,0.359085,0.358881,95.291667,122.083333,77.291667
std,0.482809,0.375582,0.277554,0.198339,0.155216,0.161277,0.198178,0.24158,0.280471,0.311221,...,0.398224,0.398649,0.398802,0.398674,0.398304,0.397769,0.397171,15.779676,17.011292,12.019836
min,0.0,0.071964,0.145258,0.220927,0.216487,0.124937,0.057935,0.016469,0.0,0.006662,...,0.012858,0.017284,0.018132,0.017576,0.016584,0.014276,0.0,75.0,89.0,58.0
25%,0.0,0.150619,0.295184,0.392874,0.464012,0.476557,0.447019,0.378159,0.318471,0.268116,...,0.048666,0.046197,0.044485,0.041695,0.041217,0.042503,0.044947,82.0,109.25,68.5
50%,1.0,0.842477,0.690875,0.559068,0.570149,0.575248,0.610049,0.589602,0.577522,0.541519,...,0.111792,0.105471,0.103132,0.103498,0.104917,0.107278,0.110339,94.5,125.0,74.5
75%,1.0,0.909858,0.822379,0.739624,0.663363,0.670367,0.665273,0.740538,0.798281,0.864024,...,0.857716,0.849922,0.839239,0.826061,0.810935,0.807061,0.810402,105.25,135.5,87.0
max,1.0,0.967152,0.934247,0.901284,0.868325,0.835518,0.803109,0.876807,0.933543,0.971621,...,0.944276,0.945494,0.946717,0.9479,0.94901,0.950023,0.950929,126.0,154.0,105.0


In [None]:
import plotly.express as px

# Assuming final_data is already loaded into a pandas DataFrame named `df`

# Reshape the DataFrame for plotting
# We need a column for the signal value and a column for time index
# Since the signal columns are named 'signal_0', 'signal_1', etc., we can unpivot the DataFrame
signal_columns = [col for col in df.columns if col.startswith('signal_')]
df_melted = df.melt(id_vars=['Subject_ID', 'Pulse', 'SBP', 'DBP'],
                     value_vars=signal_columns,
                     var_name='Time_Index_Str',
                     value_name='PPG_Value')

# Extract the numerical time index from the column names (e.g., 'signal_0' -> 0)
df_melted['Time_Index'] = df_melted['Time_Index_Str'].apply(lambda x: int(x.split('_')[1]))

# Sort by Subject_ID and Time_Index to ensure the line is plotted correctly
df_melted = df_melted.sort_values(by=['Subject_ID', 'Time_Index']).reset_index(drop=True)

# Create the plot
fig = px.line(df_melted, x='Time_Index', y='PPG_Value', color='Subject_ID',
              title='PPG Signal vs. Time for Each Subject',
              labels={'Time_Index': 'Time (Arbitrary Units)', 'PPG_Value': 'Normalized PPG Signal'})

# Show the plot
fig.show()

In [None]:
# Select data for a single subject (e.g., Subject_ID 'Sub1')
single_subject_id = 'Sub1'
df_single_subject = df_melted[df_melted['Subject_ID'] == single_subject_id]

# Create the plot for the single subject
fig_single = px.line(df_single_subject, x='Time_Index', y='PPG_Value',
                     title=f'PPG Signal vs. Time for {single_subject_id}',
                     labels={'Time_Index': 'Time (Arbitrary Units)', 'PPG_Value': 'Normalized PPG Signal'})

# Show the plot for the single subject
fig_single.show()


Extract Features

In [None]:
from scipy.signal import find_peaks

def extract_features(ecg, ppg, sampling_rate=125):
    # --- Process ECG Signal ---
    ecg_signals, ecg_info = nk.ecg_process(ecg, sampling_rate=sampling_rate)
    r_peaks = ecg_info['ECG_R_Peaks']

    # --- Process PPG Signal ---
    ppg_signals = nk.ppg_process(ppg, sampling_rate=sampling_rate)[0]
    ppg_peaks, _ = find_peaks(ppg, distance=sampling_rate*0.5)  # ~0.5s between peaks

    # --- Heart Rate ---
    rr_intervals = np.diff(r_peaks) / sampling_rate
    heart_rate = 60 / rr_intervals
    mean_hr = np.mean(heart_rate)

    # --- Pulse Transit Time (PTT) ---
    # For each R-peak, find the next PPG peak (foot or max)
    ptt_values = []
    for r in r_peaks:
        ppg_candidates = ppg_peaks[ppg_peaks > r]
        if len(ppg_candidates) == 0:
            continue
        next_ppg_peak = ppg_candidates[0]
        ptt = (next_ppg_peak - r) / sampling_rate
        ptt_values.append(ptt)

    mean_ptt = np.mean(ptt_values) if len(ptt_values) > 0 else np.nan

    # --- PPG Morphological Features ---
    pulse_widths = []
    upstroke_times = []
    peak_amplitudes = []

    for i in range(1, len(ppg_peaks)-1):
        start = ppg_peaks[i-1]
        peak = ppg_peaks[i]
        end = ppg_peaks[i+1]
        segment = ppg[start:end]

        # Width at 50% amplitude
        half_amp = (np.max(segment) + np.min(segment)) / 2
        indices = np.where(segment >= half_amp)[0]
        if len(indices) > 1:
            width = (indices[-1] - indices[0]) / sampling_rate
        else:
            width = np.nan
        pulse_widths.append(width)

        # Upstroke time (foot to peak)
        foot_index = np.argmin(segment[:np.argmax(segment)])  # foot to peak
        upstroke = (np.argmax(segment) - foot_index) / sampling_rate
        upstroke_times.append(upstroke)

        # Peak amplitude
        peak_amplitudes.append(np.max(segment) - np.min(segment))

    # --- Create Feature Vector ---
    features = {
        'HR_mean': mean_hr,
        'PTT_mean': mean_ptt,
        'Pulse_Width_mean': np.nanmean(pulse_widths),
        'Upstroke_Time_mean': np.nanmean(upstroke_times),
        'Peak_Amplitude_mean': np.nanmean(peak_amplitudes)
    }

    return pd.DataFrame([features])

Feature Selection

In [None]:
df.head()

Unnamed: 0,signal_0,signal_1,signal_2,signal_3,signal_4,signal_5,signal_6,signal_7,signal_8,signal_9,...,signal_2394,signal_2395,signal_2396,signal_2397,signal_2398,signal_2399,Pulse,SBP,DBP,Subject_ID
0,0.0,0.120789,0.239843,0.355408,0.46566,0.568742,0.662861,0.746431,0.818227,0.877515,...,0.897134,0.897943,0.898904,0.899936,0.900977,0.90198,82,128,71,Sub1
1,1.0,0.865379,0.735123,0.613327,0.503496,0.408314,0.329489,0.2677,0.222624,0.193057,...,0.059157,0.056536,0.052838,0.048231,0.042954,0.037299,77,135,91,Sub2
2,1.0,0.877297,0.757955,0.645117,0.541504,0.449245,0.36976,0.303708,0.251007,0.21092,...,0.018181,0.018132,0.017576,0.016584,0.015262,0.013731,82,125,81,Sub3
3,1.0,0.901714,0.804993,0.711158,0.621536,0.537426,0.46003,0.390355,0.329116,0.276662,...,0.050225,0.045124,0.03821,0.029777,0.020226,0.01003,100,125,78,Sub4
4,0.0,0.153089,0.301815,0.442049,0.570152,0.6832,0.779146,0.856924,0.916464,0.958624,...,0.773984,0.777236,0.781199,0.785658,0.790391,0.795182,75,121,66,Sub5


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
df = df.drop(columns=['Subject_ID'])
df['sub_ID'] = df.index + 1


In [None]:
df.head()

Unnamed: 0,signal_0,signal_1,signal_2,signal_3,signal_4,signal_5,signal_6,signal_7,signal_8,signal_9,...,signal_2394,signal_2395,signal_2396,signal_2397,signal_2398,signal_2399,Pulse,SBP,DBP,sub_ID
0,0.0,0.120789,0.239843,0.355408,0.46566,0.568742,0.662861,0.746431,0.818227,0.877515,...,0.897134,0.897943,0.898904,0.899936,0.900977,0.90198,82,128,71,1
1,1.0,0.865379,0.735123,0.613327,0.503496,0.408314,0.329489,0.2677,0.222624,0.193057,...,0.059157,0.056536,0.052838,0.048231,0.042954,0.037299,77,135,91,2
2,1.0,0.877297,0.757955,0.645117,0.541504,0.449245,0.36976,0.303708,0.251007,0.21092,...,0.018181,0.018132,0.017576,0.016584,0.015262,0.013731,82,125,81,3
3,1.0,0.901714,0.804993,0.711158,0.621536,0.537426,0.46003,0.390355,0.329116,0.276662,...,0.050225,0.045124,0.03821,0.029777,0.020226,0.01003,100,125,78,4
4,0.0,0.153089,0.301815,0.442049,0.570152,0.6832,0.779146,0.856924,0.916464,0.958624,...,0.773984,0.777236,0.781199,0.785658,0.790391,0.795182,75,121,66,5


In [None]:
X = df.drop(columns=["SBP", "DBP"])
y_sbp = df["SBP"]
y_dbp = df["DBP"]
y_hr = df['Pulse']

Train a Machine Learning Model

In [None]:
##trainig model for SBP prediction
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_sbp, test_size=0.2, random_state=40)
rf_model_sbp = RandomForestRegressor(n_estimators=100, random_state=40)
rf_model_sbp.fit(X_train, y_train)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = rf_model_sbp.predict(X_test)
print('For SBP:')
print('MSE: ',mean_squared_error(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('R2: ',r2_score(y_test, y_pred))


For SBP:
MSE:  88.61228000000003
MAE:  8.540000000000001
R2:  0.5759366385911178


In [None]:
##trainig model for DBP prediction
X_train_dbp, X_test_dbp, y_train_dbp, y_test_dbp = train_test_split(X, y_dbp, test_size=0.2, random_state=40)
rf_model_dbp = RandomForestRegressor(n_estimators=100)
rf_model_dbp.fit(X_train_dbp, y_train_dbp)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred_dbp = rf_model_dbp.predict(X_test)
print('For DBP:')
print('MSE: ',mean_squared_error(y_test, y_pred_dbp))
print('MAE: ', mean_absolute_error(y_test, y_pred_dbp))
print('R2: ',r2_score(y_test, y_pred_dbp))

For DBP:
MSE:  1652.26414
MAE:  39.178
R2:  -6.907083365237367


In [None]:
##training model for hr
X_train_hr, X_test_hr, y_train_hr, y_test_hr = train_test_split(X, y_hr, test_size=0.3, random_state=40)
rf_model_hr = RandomForestRegressor(n_estimators=150, random_state=90)
rf_model_hr.fit(X_train_hr, y_train_hr)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred_hr = rf_model_hr.predict(X_test)
print('For HR:')
print('MSE: ',mean_squared_error(y_test, y_pred_hr))
print('MAE: ', mean_absolute_error(y_test, y_pred_hr))
print('R2: ',r2_score(y_test, y_pred_hr))

For HR:
MSE:  749.6739022222223
MAE:  23.176000000000002
R2:  -2.587643100221202


In [None]:
plot_df = master_df[['Subject_ID', 'Pulse', 'SBP', 'DBP']].copy()
plot_df_melted = plot_df.melt(id_vars=['Subject_ID'],
                              value_vars=['Pulse', 'SBP', 'DBP'],
                              var_name='Measurement_Type',
                              value_name='Value')
fig = px.line(plot_df_melted,
              x='Subject_ID',
              y='Value',
              color='Measurement_Type',
              title='Variation of HR, SBP, and DBP Across Subjects',
              labels={'Subject_ID': 'Subject ID', 'Value': 'Measurement Value'},
              markers=True) # Use markers to denote the data points

fig.update_layout(xaxis_tickangle=-45)
fig.show()


In [None]:
# Save heart rate model
joblib.dump(rf_model_hr, "model_hr.pkl")

# Save systolic blood pressure model
joblib.dump(rf_model_sbp, "model_sbp.pkl")

# Save diastolic blood pressure model
joblib.dump(rf_model_dbp, "model_dbp.pkl")


['model_dbp.pkl']

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 1. Load the Data
df = pd.read_csv('final_data.csv')

# Assume 'subject' column exists, drop it if needed
# This line might be unnecessary if 'Subject_ID' is the actual column name
# if 'subject' in df.columns:
#     df = df.drop(columns=['subject'])

# Remove non-numeric columns before scaling
# Select only columns with numerical data types
df_numeric = df.select_dtypes(include=[np.number])

# 2. Preprocess the Signals
# Drop missing values (if any) from the numerical DataFrame
df_numeric = df_numeric.dropna()

# 3. Normalize the Signals
scaler = StandardScaler()
# Fit and transform the numerical DataFrame
X = scaler.fit_transform(df_numeric)

# 4. Extract Features
# For now we use the signals directly as features
# Fake labels (simulate HR, SBP, DBP for training demo)
np.random.seed(0)
# The size of y should match the number of rows in df_numeric after dropping NaNs
HR = np.random.randint(60, 100, size=df_numeric.shape[0])
SBP = np.random.randint(100, 140, size=df_numeric.shape[0])
DBP = np.random.randint(60, 90, size=df_numeric.shape[0])
y = pd.DataFrame({'HR': HR, 'SBP': SBP, 'DBP': DBP})

# 5. Perform Feature Selection
selector = SelectKBest(score_func=f_regression, k=min(20, X.shape[1])) # Ensure k is not greater than the number of features
X_selected = selector.fit_transform(X, y['HR'])  # use HR for selection

# 6. Train a Machine Learning Model
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
models = {}
for col in y.columns:
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train[col])
    models[col] = model

# 7. Evaluate the Model
for col in y.columns:
    preds = models[col].predict(X_test)
    mae = mean_absolute_error(y_test[col], preds)
    print(f'{col} MAE:', round(mae, 2))

# 8. Test with Unseen Data
# Simulate new unseen signals by reusing some data
unseen_data = X[:5]
unseen_selected = selector.transform(unseen_data)
print("\nPredictions on new unseen data:")
for col in models:
    preds = models[col].predict(unseen_selected)
    print(f'{col}:', preds.round(1))

HR MAE: 8.71
SBP MAE: 9.81
DBP MAE: 6.13

Predictions on new unseen data:
HR: [70.  73.9 74.1 97.4 68.7]
SBP: [125.2 115.2 110.2 113.4 109.9]
DBP: [72.8 64.  68.3 75.4 69.9]
