In [5]:
import numpy as np
import pandas as pd

In [6]:
# Q1: Reading and Preprocessing the Data
# The raw DataFrame keeps its own name so that `data` is only ever a NumPy array
# (the original rebound `data` from DataFrame to ndarray inside the same cell).
raw_df = pd.read_csv('HeartFailureDataset 2.csv')  # Load the CSV file into a Pandas DataFrame
headers = np.array(raw_df.columns)  # Column names, index-aligned with the columns of `data`
data = np.array(raw_df)  # NumPy copy of the table; later cells index and mutate this array

# Column positions excluded from the numeric feature set
# (presumably identifier / categorical columns -- TODO confirm against `headers`).
non_numeric = np.array([0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13])

# Derive the column count from the data instead of hard-coding 50, so the index
# list stays valid if the CSV gains or loses columns.
numeric = np.setdiff1d(np.arange(data.shape[1]), non_numeric)  # Indices of numeric features

In [7]:
# Q2: Mean and standard deviation of heart rate, split by gender
# Column 3 is read as the gender code (1 assumed male, 2 assumed female) and
# column 14 as heart rate; NaN readings are ignored by the nan-aware reducers.
gender = data[:, 3]
heart_rate = data[:, 14]
male_hr = heart_rate[gender == 1]
female_hr = heart_rate[gender == 2]

males_mean, males_std = np.nanmean(male_hr), np.nanstd(male_hr)
females_mean, females_std = np.nanmean(female_hr), np.nanstd(female_hr)

print(f'Males Mean: {males_mean}')
print(f'Females Mean: {females_mean}')
print(f'Males std: {males_std}')
print(f'Females std: {females_std}')

# Report which group has the lower mean; any non-comparable result (e.g. NaN)
# falls through to the final branch, exactly as a failed >/< pair would.
if females_mean < males_mean:
    print('Females have a lower mean heart rate')
elif females_mean > males_mean:
    print('Males have a lower mean heart rate')
else:
    print('Both genders have the same mean heart rate')

Males Mean: 84.80731991110707
Females Mean: 84.36778837735726
Males std: 16.072054208238853
Females std: 15.954616665632308
Females have a lower mean heart rate


In [8]:
# Q3: Calculating Median and Interquartile Range for Numeric Features
numeric_values = data[:, numeric]  # Only the numeric feature columns
median = np.nanmedian(numeric_values, axis=0)  # Per-feature median, ignoring NaN
q1, q3 = np.nanpercentile(numeric_values, [25, 75], axis=0)  # 25th and 75th percentiles per feature
IQR = q3 - q1  # Interquartile range per numeric feature

# IQR has one entry per *numeric* column, so a position in IQR must be mapped
# back through `numeric` to get the real column index before looking up the
# header. Indexing `headers` with the raw argmin/argmax reports the wrong feature.
# The nan-aware variants skip features whose IQR is undefined (all-NaN columns).
min_IQR = headers[numeric[np.nanargmin(IQR)]]  # Feature with the smallest IQR
max_IQR = headers[numeric[np.nanargmax(IQR)]]  # Feature with the largest IQR
print(f'Median = {median}')
print(f'IQR = {IQR}')
print(f'Minimum IQR feature = {min_IQR}')
print(f'Maximum IQR feature = {max_IQR}')

Median = [7.70000000e+01 2.83124741e+01 8.36107994e+01 1.16128205e+02
 5.84615385e+01 2.03723077e+01 3.66507942e+01 9.64522727e+01
 1.67500000e+03 3.08000000e+01 3.49000000e+00 2.97500000e+01
 3.29857143e+01 9.00000000e+01 1.55062500e+01 9.68000000e+00
 2.22666667e+02 8.24666667e+01 3.00000000e-01 1.04750000e+01
 1.46333333e+01 1.30000000e+00 5.84000000e+03 8.92500000e+01
 1.28750000e+00 3.06666667e+01 1.36400000e+02 4.11538461e+00
 1.39250000e+02 8.50000000e+00 1.02500000e+02 1.36666667e+01
 2.09230769e+00 7.38000000e+00 2.65000000e+01 1.60000000e+00
 4.30000000e+01 5.50000000e+01]
IQR = [2.00000000e+01 9.30704799e+00 2.35358929e+01 2.32336957e+01
 1.32903727e+01 5.46550532e+00 7.35945805e-01 2.91750000e+00
 1.52000000e+03 6.85250000e+00 7.80000000e-01 2.99000000e+00
 1.81388889e+00 7.60714286e+00 2.47750000e+00 5.30000000e+00
 1.35340909e+02 1.26750000e+01 3.00000000e-01 8.81250000e+00
 5.64000000e+00 5.96363636e-01 1.27170000e+04 1.39187500e+02
 9.60000000e-01 2.44166667e+01 5.55625

In [9]:
# Q4: Calculating Total Blood Pressure and Identifying Top 10 Patients
total_blood_pressure = np.nansum(data[:, 15:17], axis=1)  # Summing Systolic and Diastolic blood pressure for each patient
highest_10_indices = np.argsort(total_blood_pressure)[-10:]  # Finding indices of top 10 total blood pressure values
highest_total_blood_pressure = data[highest_10_indices, 0]  # Extracting patient IDs with the highest total blood pressure
print(f'Top 10 Blood Pressure: {highest_total_blood_pressure}')

Top 10 Blood Pressure: [114085. 184453. 196981. 189112. 169263. 118932. 196856. 190823. 163199.
 178001.]


In [10]:
# Q5: Sorting Patients Based on Glucose Values and Printing Top 10 Patients
sort_by_glucose = data[np.argsort(data[:, 38])[::-1],]  # Sorting the dataset based on glucose values in descending order
print(f'Top 10 patients based on their glucose values: {sort_by_glucose[0:11, 0]}')  # Printing patient IDs of top 10 based on glucose

Top 10 patients based on their glucose values: [144806. 105820. 144894. 113423. 178833. 167919. 150854. 178684. 124926.
 196562. 194409.]


In [11]:
# Q6: Substituting Top 5 Creatinine Values with Median
top_5_indices = np.argsort(data[:, 37])[-5:]  # Finding the indices of the top 5 Creatinine values
data[top_5_indices, 37] = np.median(data[:, 37])  # Replacing top 5 Creatinine values with the median

In [12]:
# Q7: Identifying Top 100 Values for Each Numeric Feature and Calculating Mean
top_100_indices = np.argpartition(data[:, numeric], 100, axis=0)[-100:,]  # Finding indices of top 100 values for each numeric feature
top_100_patient_means = np.nanmean(data[top_100_indices], axis=1)  # Calculating mean for each patient using top 100 values
top_100_patient_means

array([[1.59791105e+05, 6.31578947e-01, 7.19210526e+01, ...,
        2.51086466e+00, 3.92178942e+01, 4.48684211e+01],
       [1.70065632e+05, 5.26315789e-01, 7.33421053e+01, ...,
        2.25162582e+00, 4.87129218e+01, 4.40789474e+01],
       [1.56839316e+05, 4.73684211e-01, 6.94473684e+01, ...,
        1.93865306e+00, 4.58373608e+01, 5.08421053e+01],
       ...,
       [1.45597158e+05, 2.63157895e-02, 7.58947368e+01, ...,
        6.88066667e-01, 6.37161752e+01, 5.38157895e+01],
       [1.25473947e+05, 5.26315789e-02, 7.42631579e+01, ...,
        6.21047619e-01, 6.85567988e+01, 3.89473684e+01],
       [1.30697737e+05, 0.00000000e+00, 5.88157895e+01, ...,
        1.71578947e+00, 4.00000000e+01, 5.48684211e+01]])

In [13]:
# Q8: Identifying Patient IDs with 2 Standard Deviations from Mean Respiratory Rate
mean_value = np.nanmean(data[:, 17], axis=0)  # Calculating mean respiratory rate
std_value = np.nanstd(data[:, 17], axis=0)  # Calculating std deviation of respiratory rate
respiratory_rate_ids = data[(data[:, 17] > mean_value + 2 * std_value) | (data[:, 17] < mean_value - 2 * std_value), 0]
print(respiratory_rate_ids)

[127360. 191289. 116888. 113812. 138440. 190054. 153366. 116367. 133975.
 152960. 145790. 108084. 107777. 126474. 107462. 141222. 130354. 128899.
 180135. 173649. 151364. 154468. 155273. 109577. 153207. 166585. 155044.
 110347. 120626. 128969. 133499. 192198. 110335. 175630. 166387. 149010.
 196357. 161407. 184331. 145202. 102433. 111327. 123701. 145333. 150871.
 111573. 126717. 161037. 180744. 145248. 113009. 182813. 146160. 159785.]


In [14]:
# Q9: Computing k-NN (k-Nearest Neighbors) for Each Patient
k = 3
dis = np.sqrt(np.nansum((data[:, np.newaxis, :] - data[np.newaxis, :, :]) ** 2, axis=-1))  # Computing Euclidean distances
knn = np.argpartition(dis, k, axis=1)[:k]  # Finding the indices of k-NN for each patient

In [15]:
# Q10: Computing Pearson Correlation Coefficient and Identifying Top Correlated Patients
correlation_matrix = np.corrcoef(data, rowvar=False)  # Computing Pearson correlation matrix
max_corr_index = np.unravel_index(np.argmax(correlation_matrix), correlation_matrix.shape)  # Finding indices of max correlation
patient1, patient2 = max_corr_index

print(f'Top correlated patients: {headers[patient1]} and {headers[patient2]}')
print('Correlation Matrix:')
print(correlation_matrix)

Top correlated patients: ID and outcome
Correlation Matrix:
[[ 1.                 nan -0.0265465  ...         nan         nan
   0.01554666]
 [        nan         nan         nan ...         nan         nan
          nan]
 [-0.0265465          nan  1.         ...         nan         nan
   0.05439953]
 ...
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [ 0.01554666         nan  0.05439953 ...         nan         nan
   1.        ]]
