In [10]:
# Initial data-quality pass over the raw dataset: preview, missing values,
# duplicate rows, and z-score based outlier detection.
import pandas as pd
from scipy.stats import zscore

# Load the dataset
df = pd.read_csv("../data/raw/project 2.csv")

# Display first 5 rows of the dataset
print("First 5 Rows of the Dataset:")
print(df.head())

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")

# Detect outliers using Z-scores (threshold ±3)
Z_THRESHOLD = 3
numerical_cols = ['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Stress Level']

# zscore() defaults to nan_policy='propagate': one missing value in a column
# turns every z-score of that column into NaN, and NaN > threshold is False,
# so that column could never flag an outlier. 'omit' computes the mean and
# standard deviation from the observed values only; rows whose own value is
# missing still get NaN and are simply not flagged for that column.
z_scores = df[numerical_cols].apply(lambda col: zscore(col, nan_policy="omit"))

# A row is an outlier if ANY of its numerical columns exceeds the threshold
outliers = df[(z_scores.abs() > Z_THRESHOLD).any(axis=1)]

print("\nOutliers Detected Using Z-Score Method:")
print(outliers)

First 5 Rows of the Dataset:
   Age  Gender  Cholesterol  Blood Pressure  Heart Rate  Smoking  \
0   75  Female        228.0           119.0          66  Current   
1   48    Male        204.0           165.0          62  Current   
2   53    Male        234.0            91.0          67    Never   
3   69  Female        192.0            90.0          72  Current   
4   62  Female        172.0           163.0          93    Never   

  Alcohol Intake  Exercise Hours Family History Diabetes Obesity  \
0          Heavy               0             No       No     Yes   
1            NaN               5             No       No      No   
2          Heavy               3            Yes       No     Yes   
3            NaN               4             No      Yes      No   
4            NaN               6             No      Yes      No   

   Stress Level  Blood Sugar Exercise Induced Angina   Chest Pain Type  \
0             8          119                     Yes   Atypical Angina   
1    

In [11]:
# First cleaning attempt: one-hot encode the features, run KNN imputation on
# the encoded matrix, re-attach the target and write the result to disk.
import pandas as pd
from sklearn.impute import KNNImputer

# Read the raw file
df = pd.read_csv("../data/raw/project 2.csv")

# Keep the target out of the imputation step
X = df.drop(columns=["Heart Disease"])  # Features
y = df["Heart Disease"]  # Target

# Column groups (feature columns only; the target is not listed)
numerical_cols = ['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Stress Level']
# 'Exercise Hours' is listed here, so it is expanded into one dummy column
# per distinct value instead of staying a single numeric column.
categorical_cols = [
    'Gender', 'Smoking', 'Alcohol Intake', 'Exercise Hours', 'Family History',
    'Diabetes', 'Obesity', 'Exercise Induced Angina', 'Chest Pain Type',
]

# One-hot encode, dropping the first level of every categorical column.
# With get_dummies' default dummy_na=False a missing category produces zeros
# in all of that column's dummies, i.e. it looks like the dropped level.
# NOTE(review): the missing 'Alcohol Intake' entries may be the literal text
# "None" in the raw file, which read_csv parses as NaN by default -- confirm.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# KNNImputer is fitted on the whole encoded matrix (numeric and dummy
# columns alike); the resulting array is wrapped back into a DataFrame
# carrying the encoded column names.
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(
    imputer.fit_transform(X_encoded),
    columns=X_encoded.columns,
)

# Put the untouched target back next to the imputed features
df_final = pd.concat([X_imputed, y], axis=1)

# Persist the result
df_final.to_csv("../data/processed/cleaned_data.csv", index=False)
print("Missing values handled and saved to ../data/processed/cleaned_data.csv")

Missing values handled and saved to ../data/processed/cleaned_data.csv


In [12]:
# Sanity check: confirm that the cleaned CSV can be read back from disk.

import pandas as pd

# Read the cleaned dataset
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Heading line, then the first five rows
print("First 5 Rows of the Dataset:", df.head(), sep="\n")

First 5 Rows of the Dataset:
    Age  Cholesterol  Blood Pressure  Heart Rate  Stress Level  Blood Sugar  \
0  75.0        228.0           119.0        66.0           8.0        119.0   
1  48.0        204.0           165.0        62.0           9.0         70.0   
2  53.0        234.0            91.0        67.0           5.0        196.0   
3  69.0        192.0            90.0        72.0           7.0        107.0   
4  62.0        172.0           163.0        93.0           2.0        183.0   

   Gender_Male  Smoking_Former  Smoking_Never  Alcohol Intake_Moderate  ...  \
0          0.0             0.0            0.0                      0.0  ...   
1          1.0             0.0            0.0                      0.0  ...   
2          1.0             0.0            1.0                      0.0  ...   
3          0.0             0.0            0.0                      0.0  ...   
4          0.0             0.0            1.0                      0.0  ...   

   Exercise Hours_8  

In [13]:
# Count how many rows carry each value of the Gender_Male dummy column
gender_male_counts = df["Gender_Male"].value_counts()
print("Gender Encoding:", gender_male_counts, sep="\n")

Gender Encoding:
Gender_Male
0.0    503
1.0    497
Name: count, dtype: int64


In [14]:
# Write the current frame to the encoded-data CSV (row index omitted),
# then confirm on stdout.
df.to_csv(path_or_buf="../data/processed/encoded_data.csv", index=False)

print("\nFinal encoded dataset saved to ../data/processed/encoded_data.csv")


Final encoded dataset saved to ../data/processed/encoded_data.csv


In [15]:
# List every column label of the current frame
column_index = df.columns
print(column_index)

Index(['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Stress Level',
       'Blood Sugar', 'Gender_Male', 'Smoking_Former', 'Smoking_Never',
       'Alcohol Intake_Moderate', 'Exercise Hours_1', 'Exercise Hours_2',
       'Exercise Hours_3', 'Exercise Hours_4', 'Exercise Hours_5',
       'Exercise Hours_6', 'Exercise Hours_7', 'Exercise Hours_8',
       'Exercise Hours_9', 'Family History_Yes', 'Diabetes_Yes', 'Obesity_Yes',
       'Exercise Induced Angina_Yes', 'Chest Pain Type_Atypical Angina',
       'Chest Pain Type_Non-anginal Pain', 'Chest Pain Type_Typical Angina',
       'Heart Disease'],
      dtype='object')


In [16]:
# Author note (7:05 pm, March 16): the KNNImputer step had some mistakes,
# so this cell goes back to the raw file.
df = pd.read_csv("../data/raw/project 2.csv")

# Columns to one-hot encode; 'Exercise Hours' is not in this list and the
# target column stays in the frame.
categorical_cols = [
    'Gender', 'Smoking', 'Alcohol Intake', 'Family History',
    'Diabetes', 'Obesity', 'Exercise Induced Angina', 'Chest Pain Type',
]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Show the resulting column labels for verification
print("Encoded Columns:", df_encoded.columns, sep="\n")

Encoded Columns:
Index(['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Exercise Hours',
       'Stress Level', 'Blood Sugar', 'Heart Disease', 'Gender_Male',
       'Smoking_Former', 'Smoking_Never', 'Alcohol Intake_Moderate',
       'Family History_Yes', 'Diabetes_Yes', 'Obesity_Yes',
       'Exercise Induced Angina_Yes', 'Chest Pain Type_Atypical Angina',
       'Chest Pain Type_Non-anginal Pain', 'Chest Pain Type_Typical Angina'],
      dtype='object')


In [17]:
# Separate the label column (y) from the predictor columns (X).
# NOTE(review): X and y are derived from df_encoded as it exists when this
# cell runs; if df_encoded is rebuilt afterwards, re-run this split.
y = df_encoded['Heart Disease']
X = df_encoded.drop('Heart Disease', axis=1)

In [18]:
# Second cleaning attempt: impute first (KNN for numeric columns, most
# frequent value for categorical columns), then one-hot encode and save.
from sklearn.impute import KNNImputer, SimpleImputer

# Start again from the raw file
df = pd.read_csv("../data/raw/project 2.csv")

# Original column groups; 'Exercise Hours' is treated as numeric here
numerical_cols = [
    'Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate',
    'Exercise Hours', 'Stress Level', 'Blood Sugar',
]
categorical_cols = [
    'Gender', 'Smoking', 'Alcohol Intake', 'Family History',
    'Diabetes', 'Obesity', 'Exercise Induced Angina',
    'Chest Pain Type',
]

# Numeric gaps: filled by a 5-neighbour KNN imputer fitted on the numeric
# columns only
imputer_numerical = KNNImputer(n_neighbors=5)
df[numerical_cols] = imputer_numerical.fit_transform(df[numerical_cols])

# Categorical gaps: replaced by each column's most frequent value.
# NOTE(review): if the raw file stores a real category as the literal text
# "None", read_csv parses it as NaN by default and this step would overwrite
# that category with the mode -- confirm against the raw data.
imputer_categorical = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer_categorical.fit_transform(df[categorical_cols])

# Encode only after imputation, dropping the first level of each column
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Write the cleaned dataset (row index omitted)
df_encoded.to_csv("../data/processed/cleaned_data.csv", index=False)