In [4]:
# Import necessary libraries
import pandas as pd
from scipy.stats import zscore

# Load the dataset
df = pd.read_csv("../data/raw/project 2.csv")

# Display first 5 rows of the dataset
print("First 5 Rows of the Dataset:")
print(df.head())

# Count missing values per column
missing_values = df.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Count fully duplicated rows
duplicates = df.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")

# Detect outliers using Z-scores (threshold ±3)
numerical_cols = ['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Stress Level']

# zscore() defaults to nan_policy='propagate': a single missing value in a
# column turns every z-score of that column into NaN, so the column could
# never flag an outlier. With 'omit' the mean/std are computed from the
# non-missing values only; missing cells keep a NaN z-score and are simply
# never flagged. Columns without missing values give the same result as before.
z_scores = df[numerical_cols].apply(lambda col: zscore(col, nan_policy='omit'))

# A row is an outlier if ANY of its numerical features lies beyond ±3 standard deviations
outliers = df[(z_scores.abs() > 3).any(axis=1)]

print("\nOutliers Detected Using Z-Score Method:")
print(outliers)

First 5 Rows of the Dataset:
   Age  Gender  Cholesterol  Blood Pressure  Heart Rate  Smoking  \
0   75  Female        228.0           119.0          66  Current   
1   48    Male        204.0           165.0          62  Current   
2   53    Male        234.0            91.0          67    Never   
3   69  Female        192.0            90.0          72  Current   
4   62  Female        172.0           163.0          93    Never   

  Alcohol Intake  Exercise Hours Family History Diabetes Obesity  \
0          Heavy               0             No       No     Yes   
1            NaN               5             No       No      No   
2          Heavy               3            Yes       No     Yes   
3            NaN               4             No      Yes      No   
4            NaN               6             No      Yes      No   

   Stress Level  Blood Sugar Exercise Induced Angina   Chest Pain Type  \
0             8          119                     Yes   Atypical Angina   
1    

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import KNNImputer

# Load the dataset
df = pd.read_csv("../data/raw/project 2.csv")

# Numerical columns that are imputed with KNN below
numerical_cols = ['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Stress Level']

# Columns that are one-hot encoded. 'Exercise Hours' holds numbers and
# 'Heart Disease' is the prediction target, so neither is listed here and both
# are carried through unchanged.
categorical_cols = ['Gender', 'Smoking', 'Alcohol Intake', 'Family History',
                    'Diabetes', 'Obesity', 'Exercise Induced Angina', 'Chest Pain Type']

# Fill missing categorical values with each column's most frequent category
# BEFORE encoding. pd.get_dummies ignores NaN by default (dummy_na=False), so
# with drop_first=True a missing value would become an all-False dummy row,
# indistinguishable from the dropped baseline category.
# .dropna() skips columns that have no non-missing value at all.
category_modes = df[categorical_cols].mode().iloc[0].dropna()
df_filled = df.fillna(category_modes)  # only the columns named in category_modes are filled

# One-hot encode the categorical variables
df_encoded = pd.get_dummies(df_filled, columns=categorical_cols, drop_first=True)

# Initialize KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Apply KNNImputer to the numerical columns only; neighbours are found using
# just these columns (the dummy columns are not passed to the imputer).
df_imputed = df_encoded.copy()
df_imputed[numerical_cols] = imputer.fit_transform(df_encoded[numerical_cols])

# The saved dataset keeps the one-hot encoded representation (nothing is decoded back)
df_final = df_imputed.copy()

# Save the cleaned data to a new CSV file
df_final.to_csv("../data/processed/cleaned_data.csv", index=False)

print("Missing values handled and saved to ../data/processed/cleaned_data.csv")

Missing values handled and saved to ../data/processed/cleaned_data.csv


In [6]:
# Sanity check: confirm that the cleaned-data CSV can be read back in.

# pandas is the only dependency of this cell
import pandas as pd

# Read the processed file into a DataFrame
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Preview: header line followed by the first five rows
print("First 5 Rows of the Dataset:", df.head(), sep="\n")

First 5 Rows of the Dataset:
    Age  Cholesterol  Blood Pressure  Heart Rate  Exercise Hours  \
0  75.0        228.0           119.0        66.0               0   
1  48.0        204.0           165.0        62.0               5   
2  53.0        234.0            91.0        67.0               3   
3  69.0        192.0            90.0        72.0               4   
4  62.0        172.0           163.0        93.0               6   

   Stress Level  Blood Sugar  Heart Disease  Gender_Male  Smoking_Former  \
0           8.0        119.0              1        False           False   
1           9.0         70.0              0         True           False   
2           5.0        196.0              1         True           False   
3           7.0        107.0              0        False           False   
4           2.0        183.0              0        False           False   

   Smoking_Never  Alcohol Intake_Moderate  Family History_Yes  Diabetes_Yes  \
0          False          

In [7]:
# Tally how many rows carry each value of the Gender_Male dummy column
gender_male_counts = df['Gender_Male'].value_counts()
print("Gender Encoding:", gender_male_counts, sep="\n")

Gender Encoding:
Gender_Male
False    503
True     497
Name: count, dtype: int64


In [8]:
# Write the current DataFrame to the processed-data folder (row index omitted)
df.to_csv(path_or_buf="../data/processed/encoded_data.csv", index=False)

# Confirmation message, preceded by a blank line
print("\nFinal encoded dataset saved to ../data/processed/encoded_data.csv")


Final encoded dataset saved to ../data/processed/encoded_data.csv


In [9]:
# Show the column labels of the DataFrame currently held in `df`
column_labels = df.columns
print(column_labels)

Index(['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Exercise Hours',
       'Stress Level', 'Blood Sugar', 'Heart Disease', 'Gender_Male',
       'Smoking_Former', 'Smoking_Never', 'Alcohol Intake_Moderate',
       'Family History_Yes', 'Diabetes_Yes', 'Obesity_Yes',
       'Exercise Induced Angina_Yes', 'Chest Pain Type_Atypical Angina',
       'Chest Pain Type_Non-anginal Pain', 'Chest Pain Type_Typical Angina'],
      dtype='object')


In [10]:
# Note from March 16, 7:05 pm: there are some mistakes with KNNImputer.

# Start again from the raw CSV
df = pd.read_csv("../data/raw/project 2.csv")

# Columns to expand into dummy indicators. The 'Heart Disease' target is not
# listed, so it stays in the frame untouched.
categorical_cols = [
    'Gender', 'Smoking', 'Alcohol Intake', 'Family History',
    'Diabetes', 'Obesity', 'Exercise Induced Angina', 'Chest Pain Type',
]
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Print the resulting column labels to confirm what the encoding produced
print("Encoded Columns:", df_encoded.columns, sep="\n")

Encoded Columns:
Index(['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Exercise Hours',
       'Stress Level', 'Blood Sugar', 'Heart Disease', 'Gender_Male',
       'Smoking_Former', 'Smoking_Never', 'Alcohol Intake_Moderate',
       'Family History_Yes', 'Diabetes_Yes', 'Obesity_Yes',
       'Exercise Induced Angina_Yes', 'Chest Pain Type_Atypical Angina',
       'Chest Pain Type_Non-anginal Pain', 'Chest Pain Type_Typical Angina'],
      dtype='object')


In [9]:
# Separate the target column (y) from the remaining feature columns (X)
target_col = 'Heart Disease'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

In [10]:
# Imputers used in this cell
from sklearn.impute import KNNImputer, SimpleImputer

# Reload the raw data
df = pd.read_csv("../data/raw/project 2.csv")

# Original (pre-encoding) column groups
numerical_cols = [
    'Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate',
    'Exercise Hours', 'Stress Level', 'Blood Sugar',
]
categorical_cols = [
    'Gender', 'Smoking', 'Alcohol Intake', 'Family History',
    'Diabetes', 'Obesity', 'Exercise Induced Angina',
    'Chest Pain Type',
]

# Numerical columns: fill gaps with a 5-neighbour KNN imputer
imputer_numerical = KNNImputer(n_neighbors=5)
numerical_filled = imputer_numerical.fit_transform(df[numerical_cols])
df[numerical_cols] = numerical_filled

# Categorical columns: fill gaps with each column's most frequent value
imputer_categorical = SimpleImputer(strategy='most_frequent')
categorical_filled = imputer_categorical.fit_transform(df[categorical_cols])
df[categorical_cols] = categorical_filled

# One-hot encode only after imputation, dropping the first level of each variable
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Write the cleaned, encoded dataset to disk (row index omitted)
df_encoded.to_csv(path_or_buf="../data/processed/cleaned_data.csv", index=False)