In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from mealpy.swarm_based.PSO import OriginalPSO
import matplotlib.pyplot as plt

# Set random seed for reproducibility
# This seeds NumPy's legacy global RNG only. The scikit-learn calls in this
# notebook pass random_state=42 explicitly, so they do not rely on it.
# NOTE(review): presumably the PSO optimizer draws from this global RNG too —
# confirm against the installed mealpy version.
np.random.seed(42)



In [None]:
# 2. Load and Explore Dataset

# Read the CSV into a DataFrame
DATA_PATH = "../Datasets/ovariantotal.csv"
df = pd.read_csv(DATA_PATH)

# Quick overview: dimensions, a preview of the leading rows, label balance
print("Dataset shape:", df.shape)

print("\nFirst 5 rows:")
preview = df.head()
display(preview)

print("\nClass distribution:")
class_counts = df['TYPE'].value_counts()
print(class_counts)


In [None]:
#3. Preprocess Data

# Separate features and target
X = df.drop(columns=['TYPE'])
y = df['TYPE']

# Train-test split FIRST, so the held-out rows never contribute to the
# scaler's mean/variance (fitting the scaler on the full dataset leaks
# test-set statistics into training). Stratify on the label so both
# partitions keep the same class proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features: learn the statistics from the training rows only,
# then apply that same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


In [None]:
#4. Define Fitness Function

def feature_selection_fitness(solution):
    """Score a candidate feature subset as a cross-validated error rate.

    Parameters
    ----------
    solution : array-like of float
        One value per feature column of ``X_train``; a feature counts as
        selected when its value is strictly greater than 0.5.

    Returns
    -------
    float
        ``1 - mean accuracy`` over a 5-fold cross-validation of a random
        forest trained on the selected columns of the module-level
        ``X_train`` / ``y_train``. Returns 1.0 when no feature is selected.
    """
    keep = np.asarray(solution) > 0.5

    # An empty subset cannot be evaluated; report the worst possible error.
    if not keep.any():
        return 1.0

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    fold_accuracy = cross_val_score(
        forest, X_train[:, keep], y_train, cv=5, scoring='accuracy'
    )

    # The optimizer minimizes, so hand back error rather than accuracy.
    return 1 - np.mean(fold_accuracy)


In [None]:
#5. Setup PSO Optimization

n_features = X_train.shape[1]

# Search space: one coordinate in [0, 1] per feature column. The fitness
# function thresholds each coordinate at 0.5 to decide selection.
problem = dict(
    fit_func=feature_selection_fitness,  # Fitness function
    lb=[0] * n_features,                 # Lower bounds (0 for all features)
    ub=[1] * n_features,                 # Upper bounds (1 for all features)
    minmax="min",                        # Minimize the fitness
    log_to=None,                         # Disable logging
)

# Optimizer hyper-parameters, gathered in one place for easy tuning
pso_params = dict(
    epoch=50,       # Number of iterations
    pop_size=20,    # Population size (number of particles)
    c1=2.0,         # Cognitive factor
    c2=2.0,         # Social factor
    w_min=0.4,      # Minimum inertia weight
    w_max=0.9,      # Maximum inertia weight
)

# Initialize PSO
pso_model = OriginalPSO(**pso_params)


In [None]:
#6. Run PSO Optimization

# Run the search on the problem defined above and unpack the result into the
# best position vector and its fitness value.
# Every fitness evaluation runs a 5-fold cross-validation of a 100-tree random
# forest, so this cell is presumably the slowest one in the notebook.
# NOTE(review): unpacking a 2-tuple from solve() looks like the mealpy 2.x
# API — confirm against the installed version.
best_position, best_fitness = pso_model.solve(problem)


In [None]:
#7. Analyze Results

# Threshold the best position at 0.5 to obtain the 0/1 feature mask
selected_features = (np.asarray(best_position) > 0.5).astype(int)
selected_indices = np.flatnonzero(selected_features)
selected_feature_names = X.columns[selected_indices]
n_selected = len(selected_feature_names)

print("✅ Selected Features:", list(selected_feature_names))
print("🔍 Number of Features Selected:", n_selected)
print("📉 Best Error (1 - accuracy):", best_fitness)
print("📈 Final CV Accuracy:", 1 - best_fitness)

# Bar chart of the raw position values with the 0.5 selection cut-off marked
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(range(len(best_position)), best_position)
ax.axhline(y=0.5, color='r', linestyle='--')
ax.set_title("PSO Feature Importance Scores")
ax.set_xlabel("Feature Index")
ax.set_ylabel("Selection Score")
plt.show()

In [None]:
#8. Evaluate on Test Set

# Restrict both partitions to the columns the optimizer kept
keep_mask = selected_features == 1
X_train_selected = X_train[:, keep_mask]
X_test_selected = X_test[:, keep_mask]

# Fit the final forest on the reduced training set
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train_selected, y_train)

# Accuracy on the held-out rows
test_accuracy = final_model.score(X_test_selected, y_test)
print("🏁 Test Accuracy:", test_accuracy)

# Bar chart of the fitted forest's per-feature importances
importances = final_model.feature_importances_
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(selected_feature_names, importances)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()

In [None]:

#9. Confusion Matrix and Classification Report

# Predict once on the held-out rows and reuse the predictions for both summaries
y_pred = final_model.predict(X_test_selected)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_confusion)
print(test_report)
