In [24]:
import numpy as np
import os
import glob
import open3d as o3d
from plyfile import PlyData
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# === Load and convert PLY to Open3D PointCloud and extract label_ch if present ===
def load_ply_file(ply_path):
    try:
        plydata = PlyData.read(ply_path)
        print(f"  [INFO] Elements in PLY file {os.path.basename(ply_path)}: {[el.name for el in plydata.elements]}")

        # Detect the correct element name
        element_names = [el.name for el in plydata.elements]
        if 'vertex' in element_names:
            element = 'vertex'
        elif 'params' in element_names:
            element = 'params'
        else:
            raise ValueError(f"No recognized element in {ply_path}")

        data = plydata[element].data
        print(f"  [INFO] Fields in {element}: {data.dtype.names}")

        if len(data) == 0:
            raise ValueError("No points found in PLY file")

        points = np.vstack((data['x'], data['y'], data['z'])).T
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(points)

        if all(k in data.dtype.names for k in ('red', 'green', 'blue')):
            colors = np.vstack((data['red'], data['green'], data['blue'])).T / 255.0
            pcd.colors = o3d.utility.Vector3dVector(colors)
        else:
            colors = None

        label_ch = data['label_ch'] if 'label_ch' in data.dtype.names else None
        print(f"  [INFO] label_ch present: {label_ch is not None}")
        return pcd, label_ch

    except Exception as e:
        print(f"  [ERROR] Failed to load PLY file {ply_path}: {e}")
        return None, None

# === Preprocessing Function ===
def preprocess_point_cloud(pcd, nb_neighbors=20, std_ratio=2.0, voxel_size=0.05):
    if len(pcd.points) == 0:
        print("  [WARNING] Empty point cloud")
        return pcd

    pcd, _ = pcd.remove_statistical_outlier(nb_neighbors=nb_neighbors, std_ratio=std_ratio)
    pcd = pcd.voxel_down_sample(voxel_size=voxel_size)
    points = np.asarray(pcd.points)
    if len(points) == 0:
        return pcd
    centroid = np.mean(points, axis=0)
    points -= centroid
    max_dist = np.max(np.linalg.norm(points, axis=1))
    if max_dist > 0:
        points /= max_dist
    pcd.points = o3d.utility.Vector3dVector(points)
    return pcd

# === Main Logic ===
base_path = "/kaggle/input/urd3dcd/IEEE_Dataset_V2_Lid05_MS/1-Lidar05/Train"
all_features, all_labels = [], []

for scene_dir in glob.glob(os.path.join(base_path, "LyonN*")):
    scene_name = os.path.basename(scene_dir)
    print(f"\nProcessing Scene: {scene_name}")

    try:
        # Load and preprocess point clouds
        pcd0, _ = load_ply_file(os.path.join(scene_dir, "pointCloud0.ply"))
        pcd1, label_ch = load_ply_file(os.path.join(scene_dir, "pointCloud1.ply"))
        if pcd0 is None or pcd1 is None or label_ch is None:
            print("  [WARNING] Skipping scene due to loading issue or missing label_ch.")
            continue

        print(f"  Original points: Epoch0 = {len(pcd0.points)}, Epoch1 = {len(pcd1.points)}")
        pcd0 = preprocess_point_cloud(pcd0)
        pcd1 = preprocess_point_cloud(pcd1)

        if len(pcd0.points) == 0 or len(pcd1.points) == 0:
            print("  [WARNING] Skipping scene due to empty preprocessed point cloud.")
            continue

        # Align using ICP
        reg_p2p = o3d.pipelines.registration.registration_icp(
            pcd1, pcd0, max_correspondence_distance=0.2, init=np.identity(4))
        pcd1.transform(reg_p2p.transformation)

        # KDTree and feature extraction
        epoch0_points = np.asarray(pcd0.points)
        epoch1_points = np.asarray(pcd1.points)
        epoch0_colors = np.asarray(pcd0.colors)
        epoch1_colors = np.asarray(pcd1.colors)

        tree = o3d.geometry.KDTreeFlann(pcd0)
        distances = np.full(len(epoch0_points), np.inf)
        nn_indices = np.full(len(epoch0_points), -1)

        for i, point in enumerate(epoch1_points):
            [_, idx, dist] = tree.search_knn_vector_3d(point, 1)
            distances[idx[0]] = np.sqrt(dist[0])
            nn_indices[idx[0]] = i

        # Handle infs
        finite_distances = distances[np.isfinite(distances)]
        max_dist = np.max(finite_distances) if len(finite_distances) else 1.0
        distances[~np.isfinite(distances)] = max_dist * 100

        # Nearest neighbor avg distance
        nn_dists = []
        for pt in epoch0_points:
            [_, _, dist] = tree.search_knn_vector_3d(pt, 5)
            nn_dists.append(np.mean(np.sqrt(dist[1:])))
        nn_dists = np.array(nn_dists)

        # Color difference
        epoch1_mapped_colors = np.zeros_like(epoch0_colors)
        valid = nn_indices != -1
        epoch1_mapped_colors[valid] = epoch1_colors[nn_indices[valid]]
        color_diff = np.linalg.norm(epoch0_colors - epoch1_mapped_colors, axis=1)
        color_diff[~valid] = np.max(color_diff[valid]) * 100 if np.any(valid) else 1.0

        # Map labels
        labels = np.ones(len(epoch0_points), dtype=int)  # assume changed
        labels[valid] = label_ch[nn_indices[valid]].astype(int)

        # Stack features
        features = np.vstack((distances, nn_dists, color_diff)).T
        all_features.append(features)
        all_labels.append(labels)

    except Exception as e:
        print(f"  [ERROR] Scene {scene_name} failed: {e}")
        continue


Processing Scene: LyonN1
  [INFO] Elements in PLY file pointCloud0.ply: ['params']
  [INFO] Fields in params: ('x', 'y', 'z', 'red', 'green', 'blue', 'label_ch', 'label_mono')
  [INFO] label_ch present: True
  [INFO] Elements in PLY file pointCloud1.ply: ['params']
  [INFO] Fields in params: ('x', 'y', 'z', 'red', 'green', 'blue', 'label_ch', 'label_mono')
  [INFO] label_ch present: True
  Original points: Epoch0 = 161174, Epoch1 = 156977

Processing Scene: LyonN3
  [INFO] Elements in PLY file pointCloud0.ply: ['params']
  [INFO] Fields in params: ('x', 'y', 'z', 'red', 'green', 'blue', 'label_ch', 'label_mono')
  [INFO] label_ch present: True
  [INFO] Elements in PLY file pointCloud1.ply: ['params']
  [INFO] Fields in params: ('x', 'y', 'z', 'red', 'green', 'blue', 'label_ch', 'label_mono')
  [INFO] label_ch present: True
  Original points: Epoch0 = 165042, Epoch1 = 162555

Processing Scene: LyonN14
  [INFO] Elements in PLY file pointCloud0.ply: ['params']
  [INFO] Fields in params: 

In [25]:
# === Final dataset ===
X = np.vstack(all_features)
y = np.hstack(all_labels)
print(f"\n Total points: {len(y)}, Changed points: {np.sum(y)}")

# Optional checks
assert len(X) == len(y)
assert np.all(np.isfinite(X))

# === Additional stats before ML ===
print("\n Feature stats:")
print("  C2C distance    -> mean:", np.mean(X[:, 0]), "std:", np.std(X[:, 0]))
print("  NN distance     -> mean:", np.mean(X[:, 1]), "std:", np.std(X[:, 1]))
print("  Color difference-> mean:", np.mean(X[:, 2]), "std:", np.std(X[:, 2]))


 Total points: 1548949, Changed points: 929249

 Feature stats:
  C2C distance    -> mean: 1.480338661040918 std: 2.0111491208340215
  NN distance     -> mean: 0.0023886801348258014 std: 0.00047091540327564365
  Color difference-> mean: 53.45503916451079 std: 68.1954672511682


In [5]:
import numpy as np
import os
import time
import psutil
import pickle
from memory_profiler import memory_usage
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Check unique classes
print("Unique classes in y:", np.unique(y))

# Class weights
class_weights = {}
unique_classes = np.unique(y)
for cls in unique_classes:
    class_count = np.sum(y == cls)
    class_weights[cls] = len(y) / (len(unique_classes) * class_count) if class_count > 0 else 1.0

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models (excluding SVM)
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight=class_weights),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight=class_weights, random_state=42, multi_class='ovr'),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', verbosity=0)
}

# Evaluate all
results = {}
process = psutil.Process(os.getpid())
for name, clf in models.items():
    print(f"\nEvaluating {name}...")

    start_time = time.time()
    clf.fit(X_train, y_train)
    training_time = time.time() - start_time

    start_time = time.time()
    mem_usage = memory_usage((clf.predict, (X_test,)), interval=0.1)
    y_pred = clf.predict(X_test)
    inference_time = time.time() - start_time
    peak_memory = max(mem_usage)

    model_file = f"/kaggle/working/{name.replace(' ', '_')}_model.pkl"
    with open(model_file, 'wb') as f:
        pickle.dump(clf, f)
    model_size = os.path.getsize(model_file) / (1024 * 1024)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Inference Time (s)": inference_time,
        "Memory Usage (MB)": peak_memory,
        "Training Time (s)": training_time,
        "Model Size (MB)": model_size
    }

# Print results
print("\nMetrics:")
print("=" * 120)
print(f"{'Model':<25} {'Accuracy':>8} {'Precision':>10} {'Recall':>8} {'F1-Score':>10} "
      f"{'Inference Time (s)':>18} {'Memory Usage (MB)':>18} {'Training Time (s)':>18} "
      f"{'Model Size (MB)':>15}")
print("-" * 120)
for name, m in results.items():
    print(f"{name:<25} {m['Accuracy']:>8.2f} {m['Precision']:>10.2f} {m['Recall']:>8.2f} {m['F1-Score']:>10.2f} "
          f"{m['Inference Time (s)']:>18.2f} {m['Memory Usage (MB)']:>18.2f} {m['Training Time (s)']:>18.2f} {m['Model Size (MB)']:>15.2f}")
print("=" * 120)


Unique classes in y: [0 1 2 3 4 5 6]

Evaluating Random Forest...

Evaluating Logistic Regression...

Evaluating K-Nearest Neighbors...

Evaluating XGBoost Classifier...

Metrics:
Model                     Accuracy  Precision   Recall   F1-Score Inference Time (s)  Memory Usage (MB)  Training Time (s) Model Size (MB)
------------------------------------------------------------------------------------------------------------------------
Random Forest                 0.75       0.85     0.75       0.79               6.99            4906.14              60.91         1642.27
Logistic Regression           0.40       0.42     0.40       0.40               0.36            4945.11              19.56            0.00
K-Nearest Neighbors           0.90       0.86     0.90       0.87              52.35            4949.91               0.92           46.36
XGBoost Classifier            0.90       0.86     0.90       0.88               4.87            4933.81              25.78            2.24
