# Clustering and Classification Fixed Notebook

This notebook performs clustering and classification on the preprocessed dataset.
- Load final_dataset.csv.
- Feature selection using RF feature importance.
- Clustering with KMeans, determine optimal k using elbow and silhouette.
- Add cluster labels as feature.
- Classification with RF and XGB, evaluate metrics.
- Recommendations: Top-3 tree species per cluster.
- Save models to models/ directory.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import seaborn as sns
import os
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Read the preprocessed dataset and print a quick preview of it.
df = pd.read_csv('/Users/godishalarishi/AML-tree/final_dataset.csv')
print('Dataset shape:', df.shape)
print(df.head())

# The label column is assumed to be 'tree species' (already encoded);
# every other column is used as an input feature.
target = 'tree species'
features = df.columns[df.columns != target].tolist()
y = df[target]
X = df[features]

# Hold out 20% of the rows for evaluation, stratified on the label so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Feature Selection

Fit a Random Forest on the training split and keep the top 10 features ranked by feature importance.

In [None]:
# Rank features with a Random Forest fit on the training split only, so the
# test split does not influence which features are selected.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Keep at most 10 features. Capping at the number of available features keeps
# the bar heights and tick labels the same length on narrow datasets, where a
# hard-coded 10 would make plt.bar fail.
n_top = min(10, len(features))
top_features = [features[i] for i in indices[:n_top]]

# Plot the importances of the selected features
plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.bar(range(n_top), importances[indices[:n_top]], align='center')
plt.xticks(range(n_top), top_features, rotation=90)
plt.show()

# Take explicit copies: columns are added to these frames later, and writing
# into a slice of X / X_train / X_test triggers pandas chained-assignment
# warnings.
X_selected = X[top_features].copy()
X_train_sel = X_train[top_features].copy()
X_test_sel = X_test[top_features].copy()

print('Selected features:', top_features)

## Clustering

Cluster the selected features with KMeans, choosing the number of clusters k from the elbow (inertia) plot and the silhouette score.

In [None]:
# Cluster only on the selected feature columns. Indexing by top_features
# (instead of using X_selected as-is) keeps this cell re-runnable: after the
# first run X_selected also carries the 'cluster' column added below, which
# must not be fed back into KMeans.
X_cluster = X_selected[top_features]

# Elbow method and silhouette analysis over k = 2..10
inertias = []
silhouettes = []
k_range = range(2, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_cluster)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_cluster, kmeans.labels_))

# Plot elbow
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(k_range, inertias, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')

# Plot silhouette
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouettes, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Choose optimal k, e.g., k=4 based on plots
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_cluster)

# Key the labels by row index. train_test_split shuffles rows, so the train
# and test frames are NOT the first/last rows of the full frame; slicing
# `clusters` by position would attach labels to the wrong rows.
cluster_labels = pd.Series(clusters, index=X_cluster.index, name='cluster')

# Add cluster as feature (assign() returns new frames, so no slice is mutated)
df['cluster'] = cluster_labels
X_selected = X_selected.assign(cluster=cluster_labels)
X_train_sel = X_train_sel.assign(cluster=cluster_labels.loc[X_train_sel.index])
X_test_sel = X_test_sel.assign(cluster=cluster_labels.loc[X_test_sel.index])

# NOTE(review): KMeans is fit on all rows, including the test split. If strict
# train/test separation is required, fit on the training rows and use
# kmeans.predict for the rest.

print('Optimal k:', optimal_k)

## Classification

Train Random Forest and XGBoost classifiers on the SMOTE-resampled training split and evaluate them on the held-out test split.

In [None]:
def _print_metrics(name, y_true, y_pred, y_prob, classes):
    """Print accuracy, weighted F1 and weighted one-vs-rest ROC-AUC.

    name    -- label prefixed to each printed line (e.g. 'RF').
    y_true  -- true labels of the evaluation rows.
    y_pred  -- predicted labels for the same rows.
    y_prob  -- predicted class probabilities, one column per entry of classes.
    classes -- sorted class labels matching the columns of y_prob.
    """
    print(f'{name} Accuracy:', accuracy_score(y_true, y_pred))
    print(f'{name} F1:', f1_score(y_true, y_pred, average='weighted'))
    if len(classes) == 2:
        # For a binary target roc_auc_score takes a 1-D score of the positive
        # class; label_binarize would yield one column against two
        # probability columns.
        auc = roc_auc_score(y_true, y_prob[:, 1])
    else:
        auc = roc_auc_score(
            label_binarize(y_true, classes=classes),
            y_prob,
            multi_class='ovr',
            average='weighted',
        )
    print(f'{name} ROC-AUC:', auc)


# Check class balance
print('Class distribution:', y_train.value_counts())

# Oversample with SMOTE on the training split only; the test split keeps its
# original class distribution. SMOTE is applied unconditionally here.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_sel, y_train)

# Class labels used to binarize y_test for ROC-AUC
class_labels = np.unique(y)

# RF
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_sm, y_train_sm)
y_pred_rf = rf_clf.predict(X_test_sel)
y_prob_rf = rf_clf.predict_proba(X_test_sel)
_print_metrics('RF', y_test, y_pred_rf, y_prob_rf, class_labels)

# XGB
xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_clf.fit(X_train_sm, y_train_sm)
y_pred_xgb = xgb_clf.predict(X_test_sel)
y_prob_xgb = xgb_clf.predict_proba(X_test_sel)
_print_metrics('XGB', y_test, y_pred_xgb, y_prob_xgb, class_labels)

## Recommendations

Top-3 tree species per cluster, ranked by the Random Forest's mean predicted class probability over the rows in that cluster.

In [None]:
# For each cluster, rank species by the Random Forest's mean predicted
# probability over the rows assigned to that cluster and report the top 3.
for cluster in range(optimal_k):
    # rf_clf was trained on the selected features plus the 'cluster' column,
    # so prediction needs the same columns in the same order as the training
    # frame; dropping 'cluster' makes predict_proba fail on a feature-count
    # mismatch.
    cluster_data = X_selected.loc[X_selected['cluster'] == cluster, X_train_sel.columns]
    if cluster_data.empty:
        continue
    probs = rf_clf.predict_proba(cluster_data)
    avg_probs = probs.mean(axis=0)
    top_idx = np.argsort(avg_probs)[::-1][:3]
    # argsort yields probability-column positions; map them back to the class
    # labels the model was trained on.
    top3 = rf_clf.classes_[top_idx]
    print(f'Cluster {cluster} top 3 species: {top3}')

# Discussion: Based on clusters, recommend species for regions.

## Save Models

Save RF, XGB, KMeans.

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs('/Users/godishalarishi/AML-tree/models', exist_ok=True)
import joblib

# Destination file for each fitted estimator, written in this order.
artifacts = {
    '/Users/godishalarishi/AML-tree/models/rf_model.pkl': rf_clf,
    '/Users/godishalarishi/AML-tree/models/xgb_model.pkl': xgb_clf,
    '/Users/godishalarishi/AML-tree/models/kmeans_model.pkl': kmeans,
}
for model_path, fitted_model in artifacts.items():
    joblib.dump(fitted_model, model_path)
print('Models saved.')