In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn import tree
import matplotlib.pyplot as plt

# Load the dataset (adjust path to your actual file location)
file_path = '/content/heart_disease.xlsx'  # Replace with the correct path to your file
# Sheet to load: a position or a sheet name. 0 keeps pandas' default (first sheet).
# NOTE(review): the recorded run of this notebook read a 12-row sheet of column
# descriptions from sheet 0 -- point this at the sheet that holds the records.
sheet_name = 0
with pd.ExcelFile(file_path) as workbook:
    # Listing the sheets makes it obvious when the wrong one is being read.
    print("Available sheets:", workbook.sheet_names)
    df = workbook.parse(sheet_name=sheet_name)

# Display the first few rows to understand the structure of the data
print(df.head())

# Step 1: Check for missing values and data types
# df.info() prints its own report and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
df.info()
print(df.isnull().sum())  # Count of missing values in each column

# Step 2: Handle missing values
# Numerical columns - fill missing values with the column median
df.fillna(df.median(numeric_only=True), inplace=True)

# Categorical columns - fill missing values with the most frequent value
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    modes = df[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to fill with.
        continue
    # Assign back instead of calling fillna(inplace=True) on df[col]: the
    # chained in-place form acts on a temporary copy (pandas warns about it
    # and it stops working in pandas 3.0).
    df[col] = df[col].fillna(modes.iloc[0])

# Step 3: Encode categorical columns as integers
# Every text column must become numeric before it reaches the classifier, so
# binary columns are encoded too, not only those with more than two categories.
for col in categorical_columns:
    if col == 'Gender' and df[col].nunique() <= 2:
        # A binary 'Gender' column is turned into a dummy variable below.
        continue
    # Cast to str first so columns mixing value types (e.g. booleans and
    # strings) can still be sorted by LabelEncoder.
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Example of dummy encoding: if a 'Gender' column exists, replace it with
# indicator column(s), dropping the first level to avoid a redundant column.
if 'Gender' in df.columns:
    df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

# Step 4: Split the dataset into features (X) and target (y)
# Name of the column to predict. Replace 'target' with the actual column name
# in your data (e.g., 'HeartDisease'); it is defined once and reused below.
target_column = 'target'
if target_column not in df.columns:
    # Fail with the list of available columns so a wrong name (or a wrong
    # sheet/file being loaded) is easy to diagnose.
    raise KeyError(
        f"Target column {target_column!r} not found; "
        f"available columns: {list(df.columns)}"
    )
X = df.drop(columns=[target_column])  # Features: everything except the target
y = df[target_column]  # The column being predicted

# Step 5: Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Initialize the Decision Tree Classifier (fixed seed for reproducibility)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Step 7: Train the classifier
dt_classifier.fit(X_train, y_train)

# Step 8: Make predictions
y_pred = dt_classifier.predict(X_test)
# ROC-AUC ranks samples by score, so it needs probabilities rather than hard
# labels. Column 1 is the probability of the second entry of classes_, i.e.
# the positive class of a binary problem (matching average='binary' below).
y_score = dt_classifier.predict_proba(X_test)[:, 1]

# Step 9: Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')  # Use 'binary' for binary classification
recall = recall_score(y_test, y_pred, average='binary')  # Same for recall
f1 = f1_score(y_test, y_pred, average='binary')
roc_auc = roc_auc_score(y_test, y_score)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Step 10: Visualize the decision tree
plt.figure(figsize=(12, 8))
tree.plot_tree(
    dt_classifier,
    filled=True,
    # A plain list of names; some scikit-learn versions reject a pandas Index.
    feature_names=list(X.columns),
    # One label per class, in the classifier's own class order. Passing
    # str(np.unique(y)) would hand over a single string such as "[0 1]" and
    # the plot would label classes with its individual characters.
    class_names=[str(label) for label in dt_classifier.classes_],
    rounded=True,
)
plt.show()

# Step 11: Hyperparameter Tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
# Candidate settings; every combination is scored with 5-fold cross-validation.
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters from GridSearchCV
print("Best hyperparameters:", grid_search.best_params_)

# Step 12: Use the model trained with the best hyperparameters
# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so best_estimator_ is already trained and does not
# need another fit() call.
best_dt_classifier = grid_search.best_estimator_

# Evaluate the tuned model on the test set
y_pred_best = best_dt_classifier.predict(X_test)
# ROC-AUC needs scores, not hard labels: use the probability of the second
# entry of classes_, i.e. the positive class of a binary problem.
y_score_best = best_dt_classifier.predict_proba(X_test)[:, 1]

# Calculate performance metrics again
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best, average='binary')
recall_best = recall_score(y_test, y_pred_best, average='binary')
f1_best = f1_score(y_test, y_pred_best, average='binary')
roc_auc_best = roc_auc_score(y_test, y_score_best)

print(f"Best Accuracy: {accuracy_best:.4f}")
print(f"Best Precision: {precision_best:.4f}")
print(f"Best Recall: {recall_best:.4f}")
print(f"Best F1-Score: {f1_best:.4f}")
print(f"Best ROC-AUC: {roc_auc_best:.4f}")

# Visualize the decision tree with the best hyperparameters
plt.figure(figsize=(12, 8))
tree.plot_tree(
    best_dt_classifier,
    filled=True,
    # A plain list of names; some scikit-learn versions reject a pandas Index.
    feature_names=list(X.columns),
    # One label per class, in the classifier's own class order. Passing
    # str(np.unique(y)) would hand over a single string such as "[0 1]" and
    # the plot would label classes with its individual characters.
    class_names=[str(label) for label in best_dt_classifier.classes_],
    rounded=True,
)
plt.show()


        age                                       Age in years
0    Gender                       Gender ; Male - 1, Female -0
1        cp                                    Chest pain type
2  trestbps                             Resting blood pressure
3      chol                                cholesterol measure
4       fbs  (fasting blood sugar > 120 mg/dl) (1 = true; 0...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   age           12 non-null     object
 1   Age in years  12 non-null     object
dtypes: object(2)
memory usage: 324.0+ bytes
None
age             0
Age in years    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


KeyError: "['target'] not found in axis"