In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv("../data/processed/standardized_data.csv")

#Convert into X and y, where X is the feautures/conditions of users, and Y is if heart disease is present 
X = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']


In [3]:
from sklearn.preprocessing import StandardScaler

# Standardize data to have mean=0 and variance=1
scaler = StandardScaler()
df[['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Exercise Hours', 'Stress Level']] = scaler.fit_transform(df[['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar','Exercise Hours', 'Stress Level']])

# Save the cleaned data to a new CSV file
df.to_csv("../data/processed/standardized_data.csv", index=False)


In [7]:
from sklearn.feature_selection import SelectKBest, f_classif

# Select the best features using SelectKBest
k= 10 
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(X, y)

selected_features = X.columns[selector.get_support()]
print(f"The {k} best features by SelectKBest are: {selected_features}")

The 10 best features by SelectKBest are: Index(['Age', 'Cholesterol', 'Gender_Male', 'Smoking_Former', 'Smoking_Never',
       'Alcohol Intake_Moderate', 'Family History_Yes', 'Obesity_Yes',
       'Chest Pain Type_Atypical Angina', 'Chest Pain Type_Non-anginal Pain'],
      dtype='object')


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Create a logistic regression model
model = LogisticRegression(max_iter=1000)

#Apply RFE with logistic regression
rfe = RFE(estimator=model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X, y)

#Get the selected features from RFE (recursive feature elimination)
selected_features_rfe = X.columns[rfe.support_]
print(f"The {k} best features are: {selected_features_rfe}")

#Ranking of features
ranking = pd.DataFrame({'Feature': X.columns, 'Rank': rfe.ranking_})
ranking = ranking.sort_values(by='Rank')

The 10 best features are: Index(['Age', 'Cholesterol', 'Gender_Male', 'Smoking_Former',
       'Family History_Yes', 'Diabetes_Yes', 'Exercise Induced Angina_Yes',
       'Chest Pain Type_Atypical Angina', 'Chest Pain Type_Non-anginal Pain',
       'Chest Pain Type_Typical Angina'],
      dtype='object')


In [12]:
import numpy as np
import skfuzzy as fuzz
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Assume 'cholesterol' is the column with cholesterol levels in the dataset
cholesterol_values = df['Cholesterol'].values  # Extract the cholesterol values

# Define the range of cholesterol values for fuzzification
max_cholesterol = df['Cholesterol'].max()
print(f"The largest cholesterol value is: {max_cholesterol}")

cholesterol_range = np.arange(0, 350, 1)

# Create fuzzy membership functions for cholesterol levels
low_chol = fuzz.trimf(cholesterol_range, [0, 0, 200])     # Low cholesterol: 0-200
medium_chol = fuzz.trimf(cholesterol_range, [150, 200, 250])  # Medium cholesterol: 200-240
high_chol = fuzz.trimf(cholesterol_range, [200, 300, 350])    # High cholesterol: 240+

# Fuzzify the cholesterol values from the dataset
low_chol_membership = fuzz.interp_membership(cholesterol_range, low_chol, cholesterol_values)
medium_chol_membership = fuzz.interp_membership(cholesterol_range, medium_chol, cholesterol_values)
high_chol_membership = fuzz.interp_membership(cholesterol_range, high_chol, cholesterol_values)

# Add the membership values as new columns in the DataFrame
df['Cholesterol_low'] = low_chol_membership
df['Cholesterol_medium'] = medium_chol_membership
df['Cholesterol_high'] = high_chol_membership

# Optionally, you can check the first few rows to confirm the new columns
print(df[['Cholesterol', 'Cholesterol_low', 'Cholesterol_medium', 'Cholesterol_high']].head())

# Save the fuzzified data to a new CSV file
df.to_csv("../data/processed/cleaned_data.csv", index=False)


The largest cholesterol value is: 349.0
   Cholesterol  Cholesterol_low  Cholesterol_medium  Cholesterol_high
0        228.0             0.00                0.44              0.28
1        204.0             0.00                0.92              0.04
2        234.0             0.00                0.32              0.34
3        192.0             0.04                0.84              0.00
4        172.0             0.14                0.44              0.00


In [None]:
import numpy as np
import skfuzzy as fuzz
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Extract the blood pressure values
blood_pressure_values = df['Blood Pressure'].values 

# Define the range of blood pressure values for fuzzification
max_blood_pressure = df['Blood Pressure'].max()
print(f"The largest Blood Pressure value is: {max_blood_pressure}")

# Define the blood pressure range (from 0 to 180)
blood_pressure_range = np.arange(0, 181, 1)

# Create fuzzy membership functions for blood pressure levels
low_bp = fuzz.trimf(blood_pressure_range, [0, 0, 80])     # Low blood pressure: 0-80
medium_bp = fuzz.trimf(blood_pressure_range, [60, 90, 120])  # Medium blood pressure: 80-120
high_bp = fuzz.trimf(blood_pressure_range, [100, 140, 180])    # High blood pressure: 120-180

# Fuzzify the blood pressure values from the dataset
low_bp_membership = fuzz.interp_membership(blood_pressure_range, low_bp, blood_pressure_values)
medium_bp_membership = fuzz.interp_membership(blood_pressure_range, medium_bp, blood_pressure_values)
high_bp_membership = fuzz.interp_membership(blood_pressure_range, high_bp, blood_pressure_values)

# Add the membership values as new columns in the DataFrame
df['Blood_Pressure_Low'] = low_bp_membership
df['Blood_Pressure_Medium'] = medium_bp_membership
df['Blood_Pressure_High'] = high_bp_membership

# Optionally, you can check the first few rows to confirm the new columns
print(df[['Blood Pressure', 'Blood_Pressure_Low', 'Blood_Pressure_Medium', 'Blood_Pressure_High']].head())

# Save the fuzzified data to a new CSV file
df.to_csv("../data/processed/cleaned_data.csv", index=False)


The largest Blood Pressure value is: 179.0
   Blood Pressure  Blood_Pressure_Low  Blood_Pressure_Medium  \
0           119.0                 0.0               0.033333   
1           165.0                 0.0               0.000000   
2            91.0                 0.0               0.966667   
3            90.0                 0.0               1.000000   
4           163.0                 0.0               0.000000   

   Blood_Pressure_High  
0                0.475  
1                0.375  
2                0.000  
3                0.000  
4                0.425  


In [22]:
import numpy as np
import skfuzzy as fuzz
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/processed/cleaned_data.csv")  

# Extract the heart rate values
heart_rate_values = df['Heart Rate'].values  # Extract the heart rate values

# Get the max heart rate from dataset
max_heart_rate = df['Heart Rate'].max()
print(f"The highest Heart Rate value is: {max_heart_rate}")

# Define the heart rate range (0 to 90, based on the highest value in dataset)
heart_rate_range = np.arange(0, 91, 1)  # Adjusted to match dataset

# Create fuzzy membership functions for heart rate levels
low_hr = fuzz.trimf(heart_rate_range, [0, 0, 60])       # Low heart rate: 0-60
medium_hr = fuzz.trimf(heart_rate_range, [50, 65, 75])  # Medium heart rate: 60-75
high_hr = fuzz.trimf(heart_rate_range, [70, 90, 90])    # High heart rate: 75-90

# Fuzzify the heart rate values from the dataset
low_hr_membership = fuzz.interp_membership(heart_rate_range, low_hr, heart_rate_values)
medium_hr_membership = fuzz.interp_membership(heart_rate_range, medium_hr, heart_rate_values)
high_hr_membership = fuzz.interp_membership(heart_rate_range, high_hr, heart_rate_values)

# Add the membership values as new columns in the DataFrame
df['Heart_Rate_Low'] = low_hr_membership
df['Heart_Rate_Medium'] = medium_hr_membership
df['Heart_Rate_High'] = high_hr_membership

# Display a sample of the updated data
print(df[['Heart Rate', 'Heart_Rate_Low', 'Heart_Rate_Medium', 'Heart_Rate_High']].head())

# Save the fuzzified dataset
df.to_csv("../data/processed/cleaned_data.csv", index=False)


The highest Heart Rate value is: 99.0
   Heart Rate  Heart_Rate_Low  Heart_Rate_Medium  Heart_Rate_High
0        66.0             0.0                0.9              0.0
1        62.0             0.0                0.8              0.0
2        67.0             0.0                0.8              0.0
3        72.0             0.0                0.3              0.1
4        93.0             0.0                0.0              0.0
