#### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator. The train/test split and the
# random forest below are additionally given their own random_state=42.
np.random.seed(42)


#### Loading Dataset

In [2]:
# Load the scikit-learn breast-cancer bunch and assemble one DataFrame that
# holds every feature column followed by the `target` column.
data = load_breast_cancer()
df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .assign(target=data.target)
)

In [3]:
# Preview the first rows to sanity-check column names and values.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
# (rows, columns) of the assembled frame, target column included.
df.shape


(569, 31)

In [5]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

#### Creating Priority Labels

In [7]:
# Heuristic severity proxy: the average of the mean radius and the mean area
# rescaled by 1/100.
df['severity_score'] = (df['mean radius'] + df['mean area'] / 100) / 2

# Median severity among malignant samples (target == 0). It is the cut-off
# between Medium and High priority, computed once here instead of once per
# row inside the row function.
malignant_median_severity = df.loc[df['target'] == 0, 'severity_score'].median()


def assign_priority(row, median_severity=None):
    """Map one sample to a priority code.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide the 'target' and 'severity_score' entries.
    median_severity : float, optional
        Severity cut-off separating Medium from High priority. When omitted,
        it is computed from the malignant rows of the module-level ``df``,
        which keeps plain ``assign_priority(row)`` calls working.

    Returns
    -------
    int
        0 (Low) for benign samples (target == 1); for malignant samples,
        2 (High) if the severity score is strictly above the cut-off,
        otherwise 1 (Medium).
    """
    if row['target'] == 1:  # Benign
        return 0  # Low priority

    # Malignant: fall back to the global frame only if no cut-off was given.
    if median_severity is None:
        median_severity = df.loc[df['target'] == 0, 'severity_score'].median()

    return 2 if row['severity_score'] > median_severity else 1


# Extra keyword arguments to DataFrame.apply are forwarded to the function.
df['priority'] = df.apply(
    assign_priority, axis=1, median_severity=malignant_median_severity
)
priority_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
df['priority_label'] = df['priority'].map(priority_labels)

print("✓ Priority labels created")
# .to_dict() yields plain Python ints, so the printed dict does not depend on
# how the installed NumPy version formats its scalar types.
print(f"   Distribution: {df['priority_label'].value_counts().to_dict()}")


✓ Priority labels created
   Distribution: {'Low': 357, 'High': 106, 'Medium': 106}


#### Visualizations

In [8]:
sns.set_style("whitegrid")

# Fixed display order and colours for the priority levels, shared by every
# panel so that a given level is always drawn in the same colour.
priority_order = ['Low', 'Medium', 'High']
priority_colors = ['green', 'orange', 'red']

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Priority distribution
# value_counts() orders by frequency, not by level, so reindex to the fixed
# order before pairing bars with colours; otherwise Medium and High can end
# up with each other's colour.
priority_counts = (
    df['priority_label'].value_counts().reindex(priority_order, fill_value=0)
)
axes[0, 0].bar(priority_counts.index, priority_counts.values,
               color=priority_colors)
axes[0, 0].set_title('Priority Distribution', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Priority Level')
axes[0, 0].set_ylabel('Count')

# Feature correlation heatmap
top_features = ['mean radius', 'mean texture', 'mean perimeter', 'mean area']
corr_matrix = df[top_features].corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            ax=axes[0, 1], cbar_kws={'shrink': 0.8})
axes[0, 1].set_title('Feature Correlation', fontsize=14, fontweight='bold')

# Box plot
df.boxplot(column='mean radius', by='priority_label', ax=axes[1, 0])
# A grouped pandas boxplot also writes a figure-level "Boxplot grouped by ..."
# suptitle; clear it so it does not sit above the whole 2x2 grid.
fig.suptitle('')
axes[1, 0].set_title('Mean Radius by Priority', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Priority Level')
axes[1, 0].set_ylabel('Mean Radius')

# Scatter plot
for priority, color in zip(priority_order, priority_colors):
    mask = df['priority_label'] == priority
    axes[1, 1].scatter(df.loc[mask, 'mean radius'], df.loc[mask, 'mean area'],
                       c=color, label=priority, alpha=0.6, s=50)
axes[1, 1].set_title('Mean Area vs Mean Radius', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Mean Radius')
axes[1, 1].set_ylabel('Mean Area')
axes[1, 1].legend()

plt.tight_layout()
plt.savefig('data_visualization.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Visualization saved: data_visualization.png")


✓ Visualization saved: data_visualization.png


#### Data Preprocessing

In [9]:
# Inputs are the dataset's own feature columns only; the engineered
# severity_score and the label columns are left out. The target is the
# integer priority code.
X = df[data.feature_names]
y = df['priority']

# 80/20 hold-out split, stratified on the priority code.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"✓ Data split: {X_train.shape[0]} train, {X_test.shape[0]} test samples")
print("✓ Features scaled using StandardScaler")


✓ Data split: 455 train, 114 test samples
✓ Features scaled using StandardScaler


#### Model Training

In [10]:
# Random-forest hyperparameters, gathered in one place for readability.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}

# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)
print("✓ Model trained successfully")


✓ Model trained successfully


#### Predictions

In [11]:
# Predict priority codes for both splits with the fitted forest.
y_train_pred, y_test_pred = (
    rf_model.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)
print("✓ Predictions generated")


✓ Predictions generated


In [12]:
# Inspect the predicted priority codes (0 = Low, 1 = Medium, 2 = High) for the test split.
y_test_pred

array([0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       2, 2, 1, 0, 0, 2, 1, 1, 2, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0,
       0, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0, 2, 0, 2, 2, 1, 0, 0, 0, 0, 2, 2,
       2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 1, 0], dtype=int64)

#### Model Evaluation & Cross-Validation

In [13]:
# Display names for the priority codes, in code order (0, 1, 2).
target_names = ['Low Priority', 'Medium Priority', 'High Priority']

# Accuracy on the data the model was fitted on and on the hold-out split.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Hold-out F1: two aggregate flavours plus the per-class breakdown.
test_f1_weighted, test_f1_macro = (
    f1_score(y_test, y_test_pred, average=averaging)
    for averaging in ('weighted', 'macro')
)
f1_per_class = f1_score(y_test, y_test_pred, average=None)

# 5-fold cross-validated accuracy on the training split.
cv_scores = cross_val_score(
    rf_model,
    X_train_scaled,
    y_train,
    scoring='accuracy',
    cv=5,
)


In [15]:
# Per-class F1 summary, one bullet per priority level.
for name, f1 in zip(target_names, f1_per_class):
    print(f"  • {name:20s} F1-Score: {f1:.4f}")

# Full precision / recall / F1 table for the hold-out split.
report_text = classification_report(
    y_test, y_test_pred, target_names=target_names
)
print("\nDetailed Classification Report:")
print("-"*70)
print(report_text)


# Confusion matrix, drawn on an explicit figure/axes pair and written to disk.
cm = confusion_matrix(y_test, y_test_pred)
fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    cbar_kws={'label': 'Count'},
    ax=ax_cm,
)
ax_cm.set_title('Confusion Matrix - Random Forest Classifier',
                fontsize=16, fontweight='bold', pad=20)
ax_cm.set_ylabel('True Label', fontsize=12)
ax_cm.set_xlabel('Predicted Label', fontsize=12)
fig_cm.tight_layout()
fig_cm.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.close(fig_cm)
print("✓ Confusion matrix saved: confusion_matrix.png")


  • Low Priority         F1-Score: 0.9730
  • Medium Priority      F1-Score: 0.8718
  • High Priority        F1-Score: 0.9756

Detailed Classification Report:
----------------------------------------------------------------------
                 precision    recall  f1-score   support

   Low Priority       0.95      1.00      0.97        72
Medium Priority       0.94      0.81      0.87        21
  High Priority       1.00      0.95      0.98        21

       accuracy                           0.96       114
      macro avg       0.96      0.92      0.94       114
   weighted avg       0.96      0.96      0.95       114

✓ Confusion matrix saved: confusion_matrix.png


#### Feature Importance

In [16]:
# Rank the features by the forest's importance scores, highest first.
feature_importance = pd.DataFrame(
    {
        'feature': data.feature_names,
        'importance': rf_model.feature_importances_,
    }
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the leading features, most important at the top.
top_n = 15
top_features_df = feature_importance.head(top_n)
bar_positions = range(top_n)

fig_imp, ax_imp = plt.subplots(figsize=(12, 8))
ax_imp.barh(bar_positions, top_features_df['importance'].values,
            color='steelblue')
ax_imp.set_yticks(bar_positions)
ax_imp.set_yticklabels(top_features_df['feature'].values)
ax_imp.set_xlabel('Importance', fontsize=12)
ax_imp.set_ylabel('Features', fontsize=12)
ax_imp.set_title(f'Top {top_n} Feature Importance', fontsize=16, fontweight='bold')
ax_imp.invert_yaxis()
fig_imp.tight_layout()
fig_imp.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.close(fig_imp)
print("✓ Feature importance saved: feature_importance.png")

# Performance Dashboard: a 2x2 summary figure of accuracy, per-class F1,
# class-count comparison and cross-validation scores.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Accuracy comparison
metrics = ['Training', 'Testing']
accuracies = [train_accuracy, test_accuracy]
colors = ['#2ecc71', '#3498db']
bars = axes[0, 0].bar(metrics, accuracies, color=colors, alpha=0.7)
axes[0, 0].set_ylim([0, 1.1])
axes[0, 0].set_ylabel('Accuracy', fontsize=12)
axes[0, 0].set_title('Model Accuracy', fontsize=14, fontweight='bold')
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    axes[0, 0].text(bar.get_x() + bar.get_width()/2., height + 0.02,
                    f'{acc:.4f}', ha='center', va='bottom', fontsize=10)

# Per-class F1-scores
axes[0, 1].bar(target_names, f1_per_class,
               color=['green', 'orange', 'red'], alpha=0.7)
axes[0, 1].set_ylim([0, 1.1])
axes[0, 1].set_ylabel('F1-Score', fontsize=12)
axes[0, 1].set_title('F1-Score by Priority', fontsize=14, fontweight='bold')
# The bar call already labels the ticks with the class names, so only rotate
# them; re-setting tick labels without fixing the tick positions triggers a
# matplotlib warning (hidden here by the global warning filter).
axes[0, 1].tick_params(axis='x', labelrotation=15)
for i, score in enumerate(f1_per_class):
    axes[0, 1].text(i, score + 0.02, f'{score:.4f}',
                    ha='center', va='bottom', fontsize=10)

# Prediction distribution
x = np.arange(len(target_names))
# Reindex to every class code so a class that is missing from the
# predictions (or from the test labels) is drawn as a zero-height bar
# instead of causing a shape mismatch with `x`.
pred_counts = pd.Series(y_test_pred).value_counts().reindex(x, fill_value=0)
true_counts = pd.Series(y_test).value_counts().reindex(x, fill_value=0)
width = 0.35
axes[1, 0].bar(x - width/2, true_counts.values, width,
               label='True', alpha=0.7, color='steelblue')
axes[1, 0].bar(x + width/2, pred_counts.values, width,
               label='Predicted', alpha=0.7, color='coral')
axes[1, 0].set_xlabel('Priority Class', fontsize=12)
axes[1, 0].set_ylabel('Count', fontsize=12)
axes[1, 0].set_title('True vs Predicted', fontsize=14, fontweight='bold')
axes[1, 0].set_xticks(x)
axes[1, 0].set_xticklabels(target_names, rotation=15)
axes[1, 0].legend()

# Cross-validation scores
# Derive the fold numbers from the scores themselves so the panel stays
# correct if the number of CV folds is changed upstream.
folds = np.arange(1, len(cv_scores) + 1)
axes[1, 1].plot(folds, cv_scores, marker='o', linestyle='-',
                linewidth=2, markersize=8, color='purple')
axes[1, 1].axhline(y=cv_scores.mean(), color='red', linestyle='--',
                   label=f'Mean: {cv_scores.mean():.4f}', linewidth=2)
axes[1, 1].set_xticks(folds)  # fold numbers are integers; avoid 1.5, 2.5, ...
axes[1, 1].set_xlabel('Fold', fontsize=12)
axes[1, 1].set_ylabel('Accuracy', fontsize=12)
axes[1, 1].set_title('Cross-Validation Scores', fontsize=14, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('performance_dashboard.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Performance dashboard saved: performance_dashboard.png")

✓ Feature importance saved: feature_importance.png
✓ Performance dashboard saved: performance_dashboard.png
