In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import os

csv_path = '/content/drive/MyDrive/butterfly_features_final.csv'
if os.path.exists(csv_path):
    df_full = pd.read_csv(csv_path)
    df_full.dropna(inplace=True)
    print("File loaded successfully!")
else:
    print(f"Error: Please upload '{csv_path}' to the Colab file pane.")

File loaded successfully!


In [11]:
feature_groups = {
    "All Features": df_full.drop(columns=['filename', 'label']),
    "Color + Texture": df_full.filter(regex='dom_color|hist|glcm|lbp'),
    "Color + Shape": df_full.filter(regex='dom_color|hist|hu|hog|zernike|solidity|circularity'),
    "Texture + Shape": df_full.filter(regex='glcm|lbp|hu|hog|zernike|solidity|circularity'),
    "Color Only": df_full.filter(regex='dom_color|hist'),
    "Texture Only": df_full.filter(regex='glcm|lbp'),
    "Shape Only": df_full.filter(regex='hu|hog|zernike|solidity|circularity')
}

results = []
y = df_full['label']


In [13]:
for group_name, df_features in feature_groups.items():
    print(f"--- Training model for: {group_name} ---")

    # a. Set the feature matrix X for the current group
    X = df_features

    # b. Split the data 80:20
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # c. Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # d. Train the Random Forest model
    rf_model = RandomForestClassifier(
        n_estimators=700,        # number of trees
        max_depth=None,          # let it expand fully
        class_weight='balanced', # handle class imbalance
        random_state=52,
        n_jobs=-1                # use all CPU cores
    )
    rf_model.fit(X_train_scaled, y_train)

    # e. Make predictions and evaluate
    y_pred = rf_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Accuracy for {group_name}: {accuracy * 100:.2f}%\n")
    # f. Store the results
    results.append({
        'Feature Group': group_name,
        'Number of Features': X.shape[1],
        'Accuracy': accuracy
    })

--- Training model for: All Features ---
Accuracy for All Features: 57.09%

--- Training model for: Color + Texture ---
Accuracy for Color + Texture: 52.85%

--- Training model for: Color + Shape ---
Accuracy for Color + Shape: 39.45%

--- Training model for: Texture + Shape ---
Accuracy for Texture + Shape: 50.54%

--- Training model for: Color Only ---
Accuracy for Color Only: 31.12%

--- Training model for: Texture Only ---
Accuracy for Texture Only: 44.53%

--- Training model for: Shape Only ---
Accuracy for Shape Only: 19.80%



In [20]:
# --- Final Results ---
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='Accuracy', ascending=False)

# Format the 'Accuracy' column for display as percentage
df_results['Accuracy'] = df_results['Accuracy'].apply(lambda x: f"{x * 100:.2f}%")

print("--- Final Model Comparison ---")
print(df_results.to_string(index=False))

--- Final Model Comparison ---
  Feature Group  Number of Features Accuracy
   All Features                 109   57.09%
   All Features                 109   57.09%
Color + Texture                  75   52.85%
Color + Texture                  75   52.85%
Texture + Shape                  64   50.54%
Texture + Shape                  64   50.54%
   Texture Only                  30   44.53%
  Color + Shape                  79   39.45%
  Color + Shape                  79   39.45%
     Color Only                  45   31.12%
     Color Only                  45   31.12%
     Shape Only                  34   19.80%
