# Import essential libraries

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [5]:

# Location of the cancer dataset CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/cancer-prediction-dataset/The_Cancer_data_1500_V2.csv'

# Read the whole CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Data Preparation

In [6]:

# Target vector: the 'Diagnosis' column (the outputs below show labels 0 and 1).
y = df['Diagnosis']

# Feature matrix: every remaining column. `drop` returns a new frame, so
# `df` itself is left untouched.
X = df.drop(columns=['Diagnosis'])

# Train-test split

In [7]:

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Initialize and train a Random Forest classifier (a strong baseline for tabular data like this)

In [8]:

# Hand-picked hyperparameters for the baseline forest.
rf_params = {
    'n_estimators': 200,       # number of trees in the ensemble
    'max_depth': 15,           # cap on the depth of each tree
    'min_samples_split': 5,    # minimum samples required to split a node
    'min_samples_leaf': 2,     # minimum samples required in a leaf
    'random_state': 42,        # fixed seed so the fitted forest is reproducible
    'n_jobs': -1,              # use all available CPU cores
}
rf_model = RandomForestClassifier(**rf_params)

# Train on the training partition only; the fitted estimator is the cell output.
rf_model.fit(X_train, y_train)

# Predictions

In [9]:

# Hard class labels for the held-out rows.
y_pred = rf_model.predict(X_test)

# predict_proba returns one column per class; column index 1 is kept as the
# score used for ROC AUC. With the 0/1 labels seen in the reports below this
# is presumably the probability of class 1 — confirm via rf_model.classes_.
test_class_probabilities = rf_model.predict_proba(X_test)
y_pred_proba = test_class_probabilities[:, 1]

# Performance metrics

In [None]:

# Headline metrics for the baseline forest on the held-out test set.
# Accuracy uses the hard labels; ROC AUC uses the column-1 probabilities.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

banner = "=" * 50
report_lines = [
    banner,
    "RANDOM FOREST CLASSIFIER - FINAL RESULTS",
    banner,
    f"Accuracy: {accuracy:.4f}",
    f"AUC Score: {auc_score:.4f}",
]
print("\n".join(report_lines))

# Per-class precision / recall / F1 and support.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Rows are the true labels, columns the predicted labels.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

RANDOM FOREST CLASSIFIER - FINAL RESULTS
Accuracy: 0.9467
AUC Score: 0.9631

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       189
           1       0.97      0.88      0.92       111

    accuracy                           0.95       300
   macro avg       0.95      0.93      0.94       300
weighted avg       0.95      0.95      0.95       300

Confusion Matrix:
[[186   3]
 [ 13  98]]


# Feature Importance

In [None]:

# Pair each feature column with the importance score the fitted baseline
# forest assigned to it (feature_importances_ follows the column order of
# the training frame, which is the column order of X).
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})

# Rank from most to least important.
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(n=10))


Top 10 Most Important Features:
            feature  importance
7     CancerHistory    0.168932
6     AlcoholIntake    0.152074
2               BMI    0.150365
0               Age    0.131616
4       GeneticRisk    0.131022
5  PhysicalActivity    0.125990
1            Gender    0.082256
3           Smoking    0.057746


# Hyperparameter tuning: grid search with 5-fold cross-validation, scored by ROC AUC

In [None]:

banner = "=" * 50
print(f"\n{banner}")
print("PERFORMING HYPERPARAMETER TUNING...")
print(banner)

# Candidate values for each hyperparameter: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid. Each candidate is scored by ROC AUC with
# 5-fold cross-validation on the training partition only, so the test set
# plays no part in model selection. Candidates are evaluated in parallel
# on all cores.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")


PERFORMING HYPERPARAMETER TUNING...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation score: 0.9413


# Final model: best estimator found by the grid search

In [None]:

# GridSearchCV is constructed with its default refit=True, so after the
# search it has already retrained the winning configuration on the full
# X_train / y_train and exposed it as best_estimator_. Calling .fit() again
# would only repeat that training (and, with random_state=42, rebuild the
# same forest), so the estimator is used as-is.
best_rf_model = grid_search.best_estimator_

# Display the selected, already-fitted estimator as the cell output.
best_rf_model

# Final predictions with tuned model

In [14]:

# Score the held-out rows with the tuned model: hard labels plus the
# column-1 class probabilities used for ROC AUC.
y_pred_tuned = best_rf_model.predict(X_test)
tuned_class_probabilities = best_rf_model.predict_proba(X_test)
y_pred_proba_tuned = tuned_class_probabilities[:, 1]

# Same two headline metrics as for the baseline model, for comparison.
final_accuracy = accuracy_score(y_test, y_pred_tuned)
final_auc = roc_auc_score(y_test, y_pred_proba_tuned)

banner = "=" * 50
print(f"\n{banner}")
print("TUNED MODEL - FINAL PERFORMANCE")
print(banner)
print(f"Accuracy: {final_accuracy:.4f}")
print(f"AUC Score: {final_auc:.4f}")

# Per-class precision / recall / F1 for the tuned model.
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_tuned))


TUNED MODEL - FINAL PERFORMANCE
Accuracy: 0.9467
AUC Score: 0.9629

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       189
           1       0.97      0.88      0.92       111

    accuracy                           0.95       300
   macro avg       0.95      0.93      0.94       300
weighted avg       0.95      0.95      0.95       300



In [None]:
# Persist the tuned model to disk so it can be reloaded later without
# retraining. The file is written relative to the current working directory.
import joblib

model_path = 'best_cancer_classifier.pkl'
joblib.dump(best_rf_model, model_path)
print(f"\nBest model saved as '{model_path}'")



Best model saved as 'best_cancer_classifier.pkl'

EXAMPLE PREDICTIONS
Sample 1: Predicted = 0, Actual = 0, Probability = 0.1064
Sample 2: Predicted = 0, Actual = 0, Probability = 0.1387
Sample 3: Predicted = 1, Actual = 1, Probability = 0.7361
Sample 4: Predicted = 0, Actual = 1, Probability = 0.0261
Sample 5: Predicted = 1, Actual = 1, Probability = 0.9896

PERFORMANCE SUMMARY
Final Model: Random Forest Classifier
Best Accuracy: 0.9467
Best AUC Score: 0.9629
Training samples: 1200
Test samples: 300


In [None]:

# Demonstrate inference with the tuned model. The "new" rows here are simply
# the first five rows of the held-out test set, so the true labels are
# available for comparison.
banner = "=" * 50
print(f"\n{banner}")
print("EXAMPLE PREDICTIONS")
print(banner)

sample_data = X_test.iloc[:5]
sample_predictions = best_rf_model.predict(sample_data)
sample_probabilities = best_rf_model.predict_proba(sample_data)

# sample_data is a positional slice of X_test, so position k in the
# predictions lines up with position k in y_test.
for position, predicted_label in enumerate(sample_predictions):
    actual_label = y_test.iloc[position]
    class_one_probability = sample_probabilities[position][1]
    print(f"Sample {position + 1}: Predicted = {predicted_label}, Actual = {actual_label}, Probability = {class_one_probability:.4f}")

# Closing recap: tuned-model metrics and the size of each partition.
summary_lines = [
    "Final Model: Random Forest Classifier",
    f"Best Accuracy: {final_accuracy:.4f}",
    f"Best AUC Score: {final_auc:.4f}",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
]
print(f"\n{banner}")
print("PERFORMANCE SUMMARY")
print(banner)
for summary_line in summary_lines:
    print(summary_line)