In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

# Notebook-wide setup: silence every warning and make 8x6 the default figure size.
warnings.simplefilter("ignore")
plt.rcParams.update({"figure.figsize": (8, 6)})

# Load the dataset
CSV_PATH = "/content/drive/MyDrive/ML/Breast_Cancer_Diagnostic.csv"
df = pd.read_csv(CSV_PATH)

# Quick inspection: first rows, schema, summary statistics and the number of
# missing values per column.
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() (that would only render a stray "None").
df.info()
display(df.describe())
display(df.isnull().sum())

# Separate the target from the predictors.
y = df['diagnosis']

# Besides the label, remove columns that cannot carry predictive signal:
# a record identifier and any column that is entirely empty. Both calls are
# no-ops when the CSV contains no such column.
# NOTE(review): assumes a column named `id`, if present, is a row identifier
# rather than a measurement — confirm against the CSV.
x = (
    df.drop(columns=['diagnosis'])
    .drop(columns=['id'], errors='ignore')
    .dropna(axis=1, how='all')
)

# Hold out 30% for testing. Stratifying on the label keeps the class
# proportions the same in both splits, so the test metrics are not skewed by
# an unlucky draw; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42, stratify=y
)

# Train Random Forest Classifier: 100 trees, no depth cap, fixed seed so the
# fitted model is reproducible.
forest_settings = dict(n_estimators=100, max_depth=None, random_state=42)
rf_model = RandomForestClassifier(**forest_settings).fit(x_train, y_train)

# Score the fitted model on the held-out split.
holdout_score = rf_model.score(x_test, y_test)
print("Random Forest Model Score:", holdout_score)

# Split sizes, followed by the label counts in each split.
for shape_label, features in (
    ("\nTraining data shape:", x_train),
    ("Testing data shape:", x_test),
):
    print(shape_label, features.shape)

for header, labels in (
    ("\nClass distribution in training set:", y_train),
    ("\nClass distribution in test set:", y_test),
):
    print(header)
    print(labels.value_counts())

# Predict the held-out split and print the per-class metrics report.
y_pred = rf_model.predict(x_test)
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion matrix, with rows and columns ordered like the model's own class list.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Random Forest")
plt.show()

# Feature importance: label the forest's importances with the feature names
# and keep the ten largest.
feat_importances = pd.Series(rf_model.feature_importances_, index=x.columns)
top_features = feat_importances.sort_values(ascending=False).iloc[:10]

# Horizontal bar chart, one bar per top feature.
importance_fig, importance_ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_features, y=top_features.index, palette="viridis", ax=importance_ax)
importance_ax.set_title("Top 10 Important Features (Random Forest)")
importance_ax.set_xlabel("Importance Score")
importance_ax.set_ylabel("Feature")
plt.show()

# Bar chart of how many rows carry each diagnosis label.
diagnosis_counts = df['diagnosis'].value_counts()
counts_ax = diagnosis_counts.plot.bar(color=['skyblue', 'salmon'])
counts_ax.set_title("Class Distribution")
counts_ax.set_xlabel("Diagnosis")
counts_ax.set_ylabel("Count")
plt.show()

# Heatmap of the pairwise correlations between all columns except the label.
plt.figure(figsize=(12, 8))
feature_corr = df.drop(columns='diagnosis').corr()
sns.heatmap(feature_corr, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# Predict a single sample: the first test row, kept as a one-row frame so it
# has the 2-D shape the model's predict() is called with elsewhere.
sample = x_test.head(1)
print("Sample Features:\n", sample)
pred_sample = rf_model.predict(sample)
predicted_label = pred_sample[0]
print("\nPredicted Class for Sample:", predicted_label)

# Pair plot of features by diagnosis.
# A pair plot draws one panel for every pair of plotted columns, so the grid
# grows with the square of the column count; plotting the whole frame is very
# slow and the panels become too small to read. Restrict the grid to the five
# features the random forest ranked highest (`top_features` is already sorted
# in descending order of importance).
pairplot_features = list(top_features.index[:5])
sns.pairplot(df, vars=pairplot_features, hue='diagnosis', palette='viridis')
plt.suptitle("Pair Plot of Features by Diagnosis", y=1.02)
plt.show()

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle("Distribution of Selected Features by Diagnosis", fontsize=16)

# One histogram with a KDE overlay per feature, split by diagnosis; the 2x3
# grid is filled row by row in the order listed here.
selected_features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'concavity_mean',
    'concave points_mean',
]
for panel, feature_name in zip(axes.flat, selected_features):
    sns.histplot(data=df, x=feature_name, hue='diagnosis', kde=True, ax=panel, palette='viridis')

# Leave headroom at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()