In [None]:
# ============================
#  Decision Tree & Random Forest (All-in-One Code)
# ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

# -----------------------------------------------------
# 1. Load Dataset
# -----------------------------------------------------
# Point this path at your own copy of the CSV file.
df = pd.read_csv(r"C:\Users\VAIBHAVI\Downloads\heart.csv")

# The label lives in the "target" column; every other column is a feature.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible from run to run.
# NOTE(review): if the CSV contains repeated rows, copies of the same record
# can land on both sides of the split and inflate the test scores -- worth
# checking df.duplicated().sum() before trusting the numbers below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------------------------------
# 2. Decision Tree Classifier (Full Tree)
# -----------------------------------------------------
# No depth limit is given, so the tree grows under scikit-learn's default
# stopping rules. Compare its train and test accuracy to judge overfitting.
dt_full = DecisionTreeClassifier(random_state=42)
dt_full.fit(X_train, y_train)

print("=== DECISION TREE (FULL) ===")
print("Train Accuracy:", dt_full.score(X_train, y_train))
print("Test Accuracy:", dt_full.score(X_test, y_test))

# Visualization
plt.figure(figsize=(18, 12))
plot_tree(
    dt_full,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases validate this argument strictly and reject an Index.
    feature_names=list(X.columns),
    # Names are matched to the classes in ascending order.
    # NOTE(review): assumes target 0 = no disease and 1 = disease -- confirm
    # against the dataset's documentation.
    class_names=["No Disease", "Disease"],
    filled=True,
)
plt.title("Full Decision Tree")
plt.show()


# -----------------------------------------------------
# 3. Decision Tree with Depth Control (Prevent Overfitting)
# -----------------------------------------------------
# Same classifier and seed as the full tree, but growth is capped at four
# levels. fit() returns the estimator itself, so it can be chained here.
dt_limited = DecisionTreeClassifier(random_state=42, max_depth=4).fit(X_train, y_train)

print("\n=== DECISION TREE (MAX DEPTH = 4) ===")
# Report accuracy on the training rows first, then on the held-out rows.
for dt_label, dt_inputs, dt_truth in (
    ("Train Accuracy:", X_train, y_train),
    ("Test Accuracy:", X_test, y_test),
):
    print(dt_label, dt_limited.score(dt_inputs, dt_truth))


# -----------------------------------------------------
# 4. Random Forest Classifier
# -----------------------------------------------------
# An ensemble of 200 trees, seeded so that repeated runs build the same forest.
rf = RandomForestClassifier(random_state=42, n_estimators=200)
rf.fit(X_train, y_train)

# Score both partitions up front, then print the report in one go.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)

print("\n=== RANDOM FOREST ===")
print("Train Accuracy:", rf_train_accuracy)
print("Test Accuracy:", rf_test_accuracy)


# -----------------------------------------------------
# 5. Feature Importance
# -----------------------------------------------------
# Per-feature importance scores from the fitted forest, together with the
# column positions that rank them from highest to lowest.
importances = rf.feature_importances_
indices = np.flip(np.argsort(importances))

ranked_names = X.columns[indices]
ranked_scores = importances[indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(ranked_names, ranked_scores)
ax.set_title("Random Forest Feature Importance")
ax.set_xlabel("Importance Score")
# Horizontal bars are stacked bottom-up, so flip the axis to put the
# highest-ranked feature at the top of the chart.
ax.invert_yaxis()
plt.show()

print("\n=== FEATURE IMPORTANCE ===")
for feature_name, feature_score in zip(ranked_names, ranked_scores):
    print(f"{feature_name}: {feature_score:.4f}")


# -----------------------------------------------------
# 6. Cross-Validation on Random Forest
# -----------------------------------------------------
# 5-fold cross-validation of the forest configuration. This runs over the
# complete dataset (X, y), not only the training portion of the earlier split.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)

print("\n=== CROSS-VALIDATION (Random Forest) ===")
# Print the average first, then the individual per-fold scores.
for cv_label, cv_value in (
    ("CV Accuracy Mean:", cv_scores.mean()),
    ("CV All Scores:", cv_scores),
):
    print(cv_label, cv_value)


=== DECISION TREE (FULL) ===
Train Accuracy: 1.0
Test Accuracy: 0.9853658536585366
