1. Import libraries and load the dataset:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset into a DataFrame. The path is relative, so the CSV is
# looked up in the notebook's current working directory.
# NOTE(review): read_csv is called with its default (comma) delimiter —
# confirm the file is not ';'-separated.
DATA_PATH = "cardiovascular_disease.csv"
data = pd.read_csv(DATA_PATH)


2. Perform EDA:

In [None]:
# Quick textual overview of the dataset, printed in this order:
#   1. the first few rows,
#   2. the number of missing values per column,
#   3. summary statistics from describe().
eda_views = (
    data.head(),
    data.isna().sum(),  # isna() is the pandas alias of isnull()
    data.describe(),
)
for view in eda_views:
    print(view)


Visualizations

In [None]:

# Bar chart of how many records fall into each value of the "cardio" target.
# countplot returns the Axes it drew on, so labels are set on that object.
cardio_ax = sns.countplot(x="cardio", data=data)
cardio_ax.set(
    xlabel="Heart Disease (0 = Negative, 1 = Positive)",
    ylabel="Count",
    title="Positive and Negative Heart Disease Cases",
)
plt.show()

# Pie chart for cholesterol levels (normal, above normal, and very high).
#
# value_counts() orders its rows by frequency, not by category code, so a
# fixed list of labels passed by position can end up on the wrong wedge (and
# plt.pie raises if a category is absent and the lengths differ). Sorting by
# code and looking each label up from its code keeps wedge and label in sync.
#
# NOTE(review): assumes the encoding 1 = normal, 2 = above normal,
# 3 = very high — confirm against the dataset's data dictionary. Codes that
# are not in the mapping are shown verbatim instead of being mislabelled.
cholesterol_labels = {1: "Normal", 2: "Above Normal", 3: "Very High"}
cholesterol_counts = data["cholesterol"].value_counts().sort_index()
plt.pie(
    cholesterol_counts,
    labels=[cholesterol_labels.get(code, str(code)) for code in cholesterol_counts.index],
    autopct="%1.1f%%",
)
plt.title("Cholesterol Levels Distribution")
plt.show()

# Histogram of the "age" column (30 bins) with a kernel-density curve overlaid.
# histplot returns the Axes it drew on, so labels are set on that object.
age_ax = sns.histplot(data["age"], bins=30, kde=True)
age_ax.set(xlabel="Age", ylabel="Frequency", title="Age Distribution")
plt.show()

# Bar chart of how many records fall into each value of the "smoke" column,
# drawn on an explicitly created Axes.
smoke_fig, smoke_ax = plt.subplots()
sns.countplot(x="smoke", data=data, ax=smoke_ax)
smoke_ax.set_xlabel("Smoking (0 = Non-smoker, 1 = Smoker)")
smoke_ax.set_ylabel("Count")
smoke_ax.set_title("Smoker Distribution")
plt.show()

# Weight and height distributions.
# Both figures are built the same way (30-bin histogram plus KDE curve), so
# they are driven from one table of (column, x-axis label, title) entries.
# The figures are shown in table order: weight first, then height.
body_measure_plots = (
    ("weight", "Weight (kg)", "Weight Distribution"),
    ("height", "Height (cm)", "Height Distribution"),
)
for column, axis_label, heading in body_measure_plots:
    sns.histplot(data[column], bins=30, kde=True)
    plt.xlabel(axis_label)
    plt.ylabel("Frequency")
    plt.title(heading)
    plt.show()

# Gender split (in percent) among the records with cardio == 1.
#
# value_counts() orders its rows by frequency, so passing ["Men", "Women"] by
# position attaches "Men" to whichever gender code happens to be more common
# among positive cases. Sorting by code and looking each label up from its
# code ties every wedge to the gender it actually represents.
#
# NOTE(review): assumes the encoding 1 = women, 2 = men — this is NOT visible
# in the notebook; confirm against the dataset's data dictionary before
# trusting the chart. Codes that are not in the mapping are shown verbatim.
gender_labels = {1: "Women", 2: "Men"}
gender_disease = (
    data.loc[data["cardio"] == 1, "gender"].value_counts(normalize=True).sort_index() * 100
)
plt.pie(
    gender_disease,
    labels=[gender_labels.get(code, str(code)) for code in gender_disease.index],
    autopct="%1.1f%%",
)
plt.title("Men and Women Percentage with Heart Disease")
plt.show()

# Check for correlations between variables.
#
# Only numeric/boolean columns are correlated: since pandas 2.0, corr() no
# longer drops non-numeric columns silently and raises on them instead, so a
# single text column in the CSV would break this cell. For an all-numeric
# frame the selection is a no-op and the matrix is unchanged.
numeric_data = data.select_dtypes(include=["number", "bool"])

plt.figure(figsize=(10, 10))
sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm")
plt.title("Feature Correlation Matrix")  # a figure should stand alone when skimmed
plt.show()



3. Data preprocessing:

In [None]:
# Features (X) are every column except the target; the target (y) is "cardio".
# NOTE(review): any identifier-like column present in the CSV stays in X —
# confirm that no such column needs to be dropped here.
y = data["cardio"]
X = data.drop(columns="cardio")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
# From here on X_train / X_test hold the scaler's output rather than the
# original DataFrame slices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


4. Create and train a machine learning model:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters collected in one place: 100 trees, and a fixed seed so the
# fitted forest is repeatable between runs.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)

# Fit on the scaled training partition. Left as the cell's last expression so
# the notebook still displays the fitted estimator.
model.fit(X_train, y_train)


5. Evaluate the model:

In [None]:
# Predict labels for the held-out test partition.
y_pred = model.predict(X_test)

# Compute every metric from the same predictions first ...
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# ... then report them: accuracy, confusion matrix, classification report.
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{cm}")
print(f"Classification Report:\n{report}")
