In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


In [2]:
from sklearn.datasets import load_breast_cancer

# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

results = []

# Evaluate all models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

df_results = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
print(df_results)


                 Model  Accuracy  Precision    Recall  F1 Score
4          Naive Bayes  0.973684   0.959459  1.000000  0.979310
1        Random Forest  0.964912   0.958904  0.985915  0.972222
0  Logistic Regression  0.956140   0.945946  0.985915  0.965517
2                  SVM  0.947368   0.922078  1.000000  0.959459
3        Decision Tree  0.929825   0.943662  0.943662  0.943662


In [4]:
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='f1', n_jobs=-1)
grid_rf.fit(X_train, y_train)
print("Best RF Params (GridSearchCV):", grid_rf.best_params_)


Best RF Params (GridSearchCV): {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}


In [5]:
param_dist_svc = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

random_svc = RandomizedSearchCV(SVC(), param_distributions=param_dist_svc, n_iter=10, scoring='f1', cv=5, n_jobs=-1)
random_svc.fit(X_train, y_train)
print("Best SVC Params (RandomizedSearchCV):", random_svc.best_params_)


Best SVC Params (RandomizedSearchCV): {'kernel': 'linear', 'gamma': 0.01, 'C': 100}


In [6]:
# Evaluate tuned RF
best_rf = grid_rf.best_estimator_
rf_pred = best_rf.predict(X_test)

# Evaluate tuned SVC
best_svc = random_svc.best_estimator_
svc_pred = best_svc.predict(X_test)

# Compare with original results
tuned_results = [
    {
        "Model": "Tuned Random Forest",
        "Accuracy": accuracy_score(y_test, rf_pred),
        "Precision": precision_score(y_test, rf_pred),
        "Recall": recall_score(y_test, rf_pred),
        "F1 Score": f1_score(y_test, rf_pred)
    },
    {
        "Model": "Tuned SVC",
        "Accuracy": accuracy_score(y_test, svc_pred),
        "Precision": precision_score(y_test, svc_pred),
        "Recall": recall_score(y_test, svc_pred),
        "F1 Score": f1_score(y_test, svc_pred)
    }
]

df_tuned_results = pd.DataFrame(tuned_results)
print(df_tuned_results)


                 Model  Accuracy  Precision    Recall  F1 Score
0  Tuned Random Forest  0.964912   0.958904  0.985915  0.972222
1            Tuned SVC  0.956140   0.945946  0.985915  0.965517


In [7]:
final_results = pd.concat([df_results, df_tuned_results], ignore_index=True).sort_values(by="F1 Score", ascending=False)
print("📊 Final Model Comparison:")
print(final_results)


📊 Final Model Comparison:
                 Model  Accuracy  Precision    Recall  F1 Score
0          Naive Bayes  0.973684   0.959459  1.000000  0.979310
1        Random Forest  0.964912   0.958904  0.985915  0.972222
5  Tuned Random Forest  0.964912   0.958904  0.985915  0.972222
2  Logistic Regression  0.956140   0.945946  0.985915  0.965517
6            Tuned SVC  0.956140   0.945946  0.985915  0.965517
3                  SVM  0.947368   0.922078  1.000000  0.959459
4        Decision Tree  0.929825   0.943662  0.943662  0.943662


In [8]:
best_model_name = final_results.iloc[0]['Model']
print(f"✅ Best Performing Model: {best_model_name}")


✅ Best Performing Model: Naive Bayes


In [9]:
if best_model_name == "Tuned Random Forest":
    print(classification_report(y_test, rf_pred))
elif best_model_name == "Tuned SVC":
    print(classification_report(y_test, svc_pred))
else:
    print(classification_report(y_test, models[best_model_name].predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_breast_cancer

# 1. Load and prepare the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Train and evaluate base models
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

df_results = pd.DataFrame(results)

# 3. Hyperparameter tuning for Random Forest using GridSearchCV
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='f1', n_jobs=-1)
grid_rf.fit(X_train, y_train)

# 4. Hyperparameter tuning for SVM using RandomizedSearchCV
param_dist_svc = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}
random_svc = RandomizedSearchCV(SVC(), param_distributions=param_dist_svc, n_iter=10, scoring='f1', cv=5, n_jobs=-1)
random_svc.fit(X_train, y_train)

# 5. Evaluate tuned models
best_rf = grid_rf.best_estimator_
best_svc = random_svc.best_estimator_

rf_pred = best_rf.predict(X_test)
svc_pred = best_svc.predict(X_test)

tuned_results = [
    {
        "Model": "Tuned Random Forest",
        "Accuracy": accuracy_score(y_test, rf_pred),
        "Precision": precision_score(y_test, rf_pred),
        "Recall": recall_score(y_test, rf_pred),
        "F1 Score": f1_score(y_test, rf_pred)
    },
    {
        "Model": "Tuned SVC",
        "Accuracy": accuracy_score(y_test, svc_pred),
        "Precision": precision_score(y_test, svc_pred),
        "Recall": recall_score(y_test, svc_pred),
        "F1 Score": f1_score(y_test, svc_pred)
    }
]

df_tuned = pd.DataFrame(tuned_results)

# 6. Final results and best model selection
final_df = pd.concat([df_results, df_tuned], ignore_index=True)
final_df = final_df.sort_values(by="F1 Score", ascending=False).reset_index(drop=True)
print("Final Model Comparison:\n")
print(final_df)

# 7. Best model
best_model_name = final_df.iloc[0]["Model"]
print(f"\n✅ Best Performing Model: {best_model_name}")

# 8. Print classification report of best model
print("\nClassification Report of Best Model:")
if best_model_name == "Tuned Random Forest":
    print(classification_report(y_test, rf_pred))
elif best_model_name == "Tuned SVC":
    print(classification_report(y_test, svc_pred))
else:
    best_model = models[best_model_name]
    y_pred_best = best_model.predict(X_test)
    print(classification_report(y_test, y_pred_best))


Final Model Comparison:

                 Model  Accuracy  Precision    Recall  F1 Score
0          Naive Bayes  0.973684   0.959459  1.000000  0.979310
1        Random Forest  0.964912   0.958904  0.985915  0.972222
2  Logistic Regression  0.956140   0.945946  0.985915  0.965517
3            Tuned SVC  0.956140   0.945946  0.985915  0.965517
4  Tuned Random Forest  0.956140   0.958333  0.971831  0.965035
5                  SVM  0.947368   0.922078  1.000000  0.959459
6        Decision Tree  0.938596   0.944444  0.957746  0.951049

✅ Best Performing Model: Naive Bayes

Classification Report of Best Model:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

