# Machine Learning Project

Using the dataset from Kaggle.com titled "Logistic Regression to predict Heart Disease".

#### Source
The dataset is publicly available on the Kaggle website and comes from an ongoing cardiovascular study of residents of the town of Framingham, Massachusetts. The classification goal is to predict whether a patient has a 10-year risk of future coronary heart disease (CHD). The dataset provides the patients’ information: it includes over 4,000 records and 15 attributes.

#### Variables
Each attribute is a potential risk factor. The risk factors are demographic, behavioral, and medical.

## Introduction of dataset

## Imports and setup

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import warnings
# Ignore every Python warning raised after this point (presumably to keep cell output readable).
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
# Plot defaults: seaborn "whitegrid" theme and a 20x24-inch default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (20, 24)})

print("✓ All libraries imported successfully")

## Data loading

In [None]:
# Load the dataset, then report its dimensions
df = pd.read_csv('framingham.csv')

# shape is (rows, columns); the column count covers every column in the file
n_rows, n_cols = df.shape
print("✓ Dataset loaded successfully")
print(f"  - Shape: {df.shape}")
print(f"  - Instances: {n_rows}")
print(f"  - Features: {n_cols}")


### The first few rows


In [None]:
# Preview the top of the table (head() returns the first five rows by default)
first_rows = df.head()
print("First 5 rows of the dataset:")
print(first_rows)

## Initial data exploration

### Data types

In [None]:
# Column dtypes as inferred by pandas when the CSV was read
column_types = df.dtypes
print("\nDATA TYPES:")
print(column_types)
print()

### Missing Values

In [None]:
# Per-column null counts; only columns that actually contain nulls are listed
null_counts = df.isnull().sum()
missing_values = null_counts.loc[null_counts > 0]

print("MISSING VALUES:")
print(missing_values)
print(f"Total missing values: {null_counts.sum()}")
print()

### Basic Statistics

In [None]:
# Summary statistics produced by DataFrame.describe()
summary_stats = df.describe()
print("BASIC STATISTICS:")
print(summary_stats)
print()

### Target Variable Analysis

In [None]:
# Class balance of the target column: absolute count and share of all rows
target_counts = df['TenYearCHD'].value_counts()
total_rows = len(df)

print("TARGET VARIABLE ANALYSIS:")
print("Class Distribution:")
for class_label, description in ((0, "No CHD"), (1, "Has CHD")):
    class_count = target_counts[class_label]
    class_share = class_count / total_rows * 100
    print(f"  - Class {class_label} ({description}): {class_count} ({class_share:.2f}%)")
print()

## Visualization of dataset

### Visualizations


In [None]:
# Canvas that the plt.subplot(5, 3, k) calls below draw onto (20 in wide, 24 in tall)
canvas_width, canvas_height = 20, 24
fig = plt.figure(figsize=(canvas_width, canvas_height))

### Target Variable Distribution


In [None]:
# Target variable distribution: bar chart (slot 1) and pie chart (slot 2).
# value_counts() orders classes by frequency, not by label, so the counts are
# re-indexed to [0, 1] to guarantee they line up with the hard-coded labels
# and colours below (a class that is absent from the data is shown as 0).
class_labels = ['No CHD (0)', 'CHD (1)']
class_colors = ['skyblue', 'salmon']
ordered_counts = target_counts.reindex([0, 1], fill_value=0)

# Bar chart of absolute class counts
ax1 = plt.subplot(5, 3, 1)
ordered_counts.plot(kind='bar', ax=ax1, color=class_colors)
ax1.set_title('Target Variable Distribution (TenYearCHD)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Class')
ax1.set_ylabel('Count')
ax1.set_xticklabels(class_labels, rotation=0)

# Pie chart of class proportions (plain NumPy values are passed to ax.pie)
ax2 = plt.subplot(5, 3, 2)
ax2.pie(ordered_counts.values, labels=class_labels, autopct='%1.1f%%',
        colors=class_colors, startangle=90)
ax2.set_title('Target Variable Proportion', fontsize=12, fontweight='bold')

### Distributions of Numerical Features

In [None]:
# Distributions of key numerical features.
# The histograms are drawn on their own figure: placing eight of them at
# slots 4 + idx of the shared 5x3 grid runs on to slots 9-11, which the
# box-plot, binary-feature and missing-value panels also use.
key_numerical = ['age', 'totChol', 'sysBP', 'BMI', 'heartRate', 'cigsPerDay', 'diaBP', 'glucose']
hist_columns = [name for name in key_numerical if name in df.columns]

if hist_columns:
    # Remember the active figure (if one exists) so that later
    # plt.subplot(...) calls keep targeting it after this cell has run.
    active_fig = plt.gcf() if plt.get_fignums() else None

    hist_ncols = 4
    hist_nrows = -(-len(hist_columns) // hist_ncols)  # ceiling division
    hist_fig, hist_axes = plt.subplots(hist_nrows, hist_ncols,
                                       figsize=(20, 5 * hist_nrows), squeeze=False)
    flat_axes = hist_axes.ravel()

    for ax, col in zip(flat_axes, hist_columns):
        df[col].hist(bins=30, ax=ax, color='steelblue', edgecolor='black')
        ax.set_title(f'Distribution of {col}', fontsize=10, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide grid cells left over when the feature count is not a multiple of hist_ncols
    for ax in flat_axes[len(hist_columns):]:
        ax.set_visible(False)
    hist_fig.tight_layout()

    # Hand the "current figure" back to whatever was active before
    if active_fig is not None:
        plt.figure(active_fig.number)

### Box plots for numerical features (to show outliers)


In [None]:
# Box plots of the key numerical features in slot 9 (whiskers/fliers expose outliers)
ax9 = plt.subplot(5, 3, 9)
# Keep only the requested columns that the frame actually contains
existing_numerical = [name for name in key_numerical if name in df.columns]
if existing_numerical:
    numeric_subset = df[existing_numerical]
    numeric_subset.boxplot(ax=ax9)
    ax9.tick_params(axis='x', rotation=45)
    ax9.set_title('Box Plots of Key Numerical Features', fontsize=10, fontweight='bold')

### Categorical/Binary Features Distribution


In [None]:
# Horizontal bar chart (slot 10) of how many rows have each binary flag set
categorical_cols = ['male', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']
# Drop any requested column the frame does not contain
existing_categorical = [name for name in categorical_cols if name in df.columns]

ax10 = plt.subplot(5, 3, 10)
if not existing_categorical:
    # Nothing to plot: show a placeholder message in the panel instead
    ax10.text(0.5, 0.5, 'No categorical features', ha='center', va='center')
else:
    # Summing a 0/1 column counts its positive cases
    categorical_data = [df[name].sum() for name in existing_categorical]
    ax10.barh(existing_categorical, categorical_data, color='lightcoral')
    ax10.set_xlabel('Count of Positive Cases')
    ax10.set_title('Count of Binary Features (1 = Yes)', fontsize=10, fontweight='bold')


### Missing Values Visualization


In [None]:
# Slot 11: null count per column, restricted to columns that have nulls
ax11 = plt.subplot(5, 3, 11)
null_totals = df.isnull().sum()
missing_data = null_totals[null_totals > 0]
if missing_data.empty:
    # Complete data: show a placeholder message rather than an empty chart
    ax11.text(0.5, 0.5, 'No Missing Values', ha='center', va='center', fontsize=12)
    ax11.set_title('Missing Values Analysis', fontsize=10, fontweight='bold')
else:
    missing_data.plot.barh(ax=ax11, color='orange')
    ax11.set_title('Missing Values by Feature', fontsize=10, fontweight='bold')
    ax11.set_xlabel('Count')

### Correlation Heatmap


In [None]:
# Slot 12: pairwise correlations of the numeric columns as an un-annotated heatmap
correlation_matrix = df.corr(numeric_only=True)
ax12 = plt.subplot(5, 3, 12)
colorbar_options = {'label': 'Correlation'}
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    ax=ax12,
    cbar_kws=colorbar_options,
)
ax12.set_title('Correlation Matrix Heatmap', fontsize=10, fontweight='bold')


### Target vs Key Features

In [None]:
# Slot 13: mean age within each TenYearCHD class
ax13 = plt.subplot(5, 3, 13)
if {'age', 'TenYearCHD'}.issubset(df.columns):
    age_by_chd = df.groupby('TenYearCHD')['age'].mean()
    bar_positions = range(len(age_by_chd))
    ax13.bar(bar_positions, age_by_chd.values, color=['skyblue', 'salmon'])
    # One tick per class, labelled with the class value from the group index
    ax13.set_xticks(bar_positions)
    ax13.set_xticklabels(age_by_chd.index, rotation=0)
    ax13.set_ylabel('Average Age')
    ax13.set_title('Average Age by CHD Status', fontsize=10, fontweight='bold')

In [None]:
# Slot 14: mean BMI within each TenYearCHD class
ax14 = plt.subplot(5, 3, 14)
if all(name in df.columns for name in ('BMI', 'TenYearCHD')):
    bmi_by_chd = df.groupby('TenYearCHD')['BMI'].mean()
    bar_positions = range(len(bmi_by_chd))
    ax14.bar(bar_positions, bmi_by_chd.values, color=['skyblue', 'salmon'])
    # One tick per class, labelled with the class value from the group index
    ax14.set_xticks(bar_positions)
    ax14.set_xticklabels(bmi_by_chd.index, rotation=0)
    ax14.set_ylabel('Average BMI')
    ax14.set_title('Average BMI by CHD Status', fontsize=10, fontweight='bold')

In [None]:
# Slot 15: share of males within each TenYearCHD class (mean of the 0/1 'male' flag)
ax15 = plt.subplot(5, 3, 15)
if {'male', 'TenYearCHD'}.issubset(df.columns):
    male_by_chd = df.groupby('TenYearCHD')['male'].mean()
    bar_positions = range(len(male_by_chd))
    ax15.bar(bar_positions, male_by_chd.values, color=['skyblue', 'salmon'])
    # One tick per class, labelled with the class value from the group index
    ax15.set_xticks(bar_positions)
    ax15.set_xticklabels(male_by_chd.index, rotation=0)
    ax15.set_ylabel('Proportion Male')
    ax15.set_title('Proportion of Males by CHD Status', fontsize=10, fontweight='bold')


## Feature type identification

### Numerical Features

In [None]:
# Continuous/count features; names missing from the frame are dropped
numerical_candidates = [
    'age', 'cigsPerDay', 'totChol', 'sysBP',
    'diaBP', 'BMI', 'heartRate', 'glucose',
]
numerical_features = [name for name in numerical_candidates if name in df.columns]

# Report each feature with its observed minimum and maximum
print(f"\nNUMERICAL FEATURES ({len(numerical_features)}):")
for feat in numerical_features:
    lowest, highest = df[feat].min(), df[feat].max()
    print(f"  - {feat}: range [{lowest:.2f}, {highest:.2f}]")

### Categorical Features

In [None]:
# Binary indicator features; names missing from the frame are dropped
categorical_candidates = [
    'male', 'currentSmoker', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes',
]
categorical_features = [name for name in categorical_candidates if name in df.columns]

# Report each feature with the distinct values found in its column
print(f"CATEGORICAL FEATURES ({len(categorical_features)}):")
for feat in categorical_features:
    distinct_values = df[feat].unique()
    print(f"  - {feat}: values {distinct_values}")

### Ordinal Features

In [None]:
# Ordered-level features; names missing from the frame are dropped
ordinal_features = [name for name in ['education'] if name in df.columns]

print(f"ORDINAL FEATURES ({len(ordinal_features)}):")
for feat in ordinal_features:
    # The level description is a fixed label, not derived from the data
    print(f"  - {feat}: levels 1-4 (Some HS → HS/GED → Some College → College+)")

In [None]:
# Total number of features across the three groups identified above
feature_groups = (numerical_features, categorical_features, ordinal_features)
total_feature_count = sum(len(group) for group in feature_groups)
print(f"TOTAL FEATURES: {total_feature_count}")

### Additional analysis

#### Edits: 
##### Convert binary columns to the categorical datatype (stored values are unchanged; their meaning is listed below):
- Gender: Male/Female
- Current Smoker: Yes/No
- BP Meds: Yes/No
- Prevalent Stroke: Yes/No
- Prevalent Hyp: Yes/No
- Diabetes: Yes/No


In [None]:

# Binary indicator columns mapped to a description of what their two values mean
binary_columns = {
    'male': 'Male/Female',
    'currentSmoker': 'Yes/No',
    'BPMeds': 'Yes/No',
    'prevalentStroke': 'Yes/No',
    'prevalentHyp': 'Yes/No',
    'diabetes': 'Yes/No'
}

# Convert each indicator to the pandas `category` dtype.
# Only the dtype changes; the stored values are left as they are.
for col in binary_columns:
    if col in df.columns:
        df[col] = df[col].astype('category')


# Check the datatypes
print("Data types after conversion:")
print(df.dtypes)
print("\n")

# Display first few rows to verify
print("First few rows:")
print(df.head())

# Save the converted dataset to a separate file: writing back to
# 'framingham.csv' would overwrite the raw input that is loaded at the top.
# NOTE(review): CSV stores no dtype information, so the category dtype has to
# be re-applied after reading this file back.
df.to_csv('framingham_converted.csv', index=False)

### Data Quality Report

In [None]:
# Data quality summary: size, completeness and in-memory footprint
row_count = len(df)
complete_count = len(df.dropna())  # rows with no missing value in any column
memory_mb = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MiB

print(f"Total Instances: {row_count}")
print(f"Total Features: {len(df.columns)}")
print(f"Complete Cases (no missing): {complete_count}")
print(f"Incomplete Cases: {row_count - complete_count}")
print(f"Memory Usage: {memory_mb:.2f} MB")