In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Load the dataset from the CSV stored under the mounted Drive path
CSV_PATH = "/content/drive/MyDrive/ML/Breast_Cancer_Diagnostic.csv"
df = pd.read_csv(CSV_PATH)

# Display basic information and summary statistics
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.describe())

# Check for null values (count of missing entries per column)
display(df.isnull().sum())

# Separate features (x) and target variable (y)
# NOTE(review): every column other than 'diagnosis' is used as a feature. If the
# CSV also carries an identifier column or an empty trailing column, it should be
# dropped here as well -- confirm against the file's actual columns.
x = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split the data into training and testing sets (30% held out, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.30, random_state=40)

# Train a Decision Tree Classifier
# The tree is seeded too: scikit-learn permutes the candidate features at every
# split, so without random_state equally good splits can be resolved differently
# from run to run, changing the tree, the score and the plots that follow.
model2 = DecisionTreeClassifier(random_state=40)
model2.fit(x_train, y_train)

# Evaluate the model (score() reports mean accuracy on the test split)
print("Model Score:", model2.score(x_test, y_test))

# Report the size of each split, followed by the per-class row counts in each
print("\nTraining data shape:", x_train.shape)
print("Testing data shape:", x_test.shape)

# Heading and counts are emitted on separate lines via sep="\n"
print("\nClass distribution in training set:", y_train.value_counts(), sep="\n")
print("\nClass distribution in test set:", y_test.value_counts(), sep="\n")

# Predict the test split once; the predictions feed both the text report and
# the confusion matrix below.
y_pred = model2.predict(x_test)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Build the confusion matrix with rows/columns ordered like the model's classes,
# then render it with the same labels on both axes.
cm = confusion_matrix(
    y_test,
    y_pred,
    labels=model2.classes_,
)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model2.classes_,
)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Plot the Decision Tree
# plot_tree documents feature_names / class_names as lists of str. Some
# scikit-learn releases reject a pandas Index or NumPy array for these
# parameters, and class labels that are not strings are not valid class names,
# so both are converted to plain lists of str before plotting.
plt.figure(figsize=(25, 15))
plot_tree(model2,
          feature_names=[str(column) for column in x.columns],
          class_names=[str(label) for label in model2.classes_],
          filled=True,
          rounded=True,
          fontsize=8)
plt.show()

# Pair each feature name with the fitted tree's importance score, order the
# scores from largest to smallest and keep the first ten for plotting.
feat_importances = (
    pd.Series(model2.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
    .head(10)
)

# Horizontal bars: importance on the x-axis, feature name on the y-axis
plt.figure(figsize=(8, 5))
sns.barplot(
    x=feat_importances,
    y=feat_importances.index,
)
plt.title("Top 10 Important Features (Decision Tree)")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()

# Count the rows per diagnosis label once, then show the counts as text and as bars
class_counts = df['diagnosis'].value_counts()
print("Class distribution:", class_counts, sep="\n")

class_counts.plot(kind="bar", color=['skyblue','salmon'])
plt.title("Class Distribution")
plt.xlabel("Diagnosis")
plt.ylabel("Count")
plt.show()

# Heatmap of the pairwise correlations between all columns except the target
plt.figure(figsize=(12, 8))
feature_corr = df.drop(columns='diagnosis').corr()
sns.heatmap(feature_corr, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# Take the first test row (kept as a one-row DataFrame so the model receives 2-D
# input) and list the nodes of the fitted tree recorded in its decision path.
sample = x_test.head(1)
print("Sample Features:\n", sample)

decision_path = model2.decision_path(sample)
print("\nDecision Path Indices:", decision_path.indices)