In [69]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
from scipy.spatial.distance import pdist, squareform
from scipy.stats import mode
import scipy.io
import pandas as pd
import numpy as np



In [72]:
def load_data(file_path):
    """Load the LungA dataset from .mat file."""
    matData = scipy.io.loadmat(file_path)
    X = pd.DataFrame(matData['lungA'])
    # Flatten the labels array to 1D
    y = matData['labA'].flatten()
    return X, y

def encode_labels(y):
    """Encode categorical labels to numeric values."""
    le = LabelEncoder()
    # Ensure y is 1D array
    y = np.asarray(y).ravel()
    return le.fit_transform(y)

def main():
    # Load and prepare data
    X, y = load_data('lungA.mat')
    y_encoded = encode_labels(y)
    
    # Compute MST and form clusters
    mst = compute_mst(X)
    n_clusters, cluster_labels = cut_mst(mst)
    
    # Map clusters and calculate accuracy
    mapped_clusters = map_clusters_to_labels(cluster_labels, y_encoded)
    accuracy = calculate_accuracy(y_encoded, mapped_clusters)
    
    # Print results
    print(f"Number of clusters found: {n_clusters}")
    print(f"Accuracy for two clusters: {accuracy:.2f}%")
    
    # Create LabelEncoder for cluster info printing
    le = LabelEncoder()
    le.fit(y)  # Fit on original labels
    print_cluster_info(cluster_labels, y_encoded, le)

main()

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['ndarray']

In [71]:
def main():
    X, y = load_data('lungA.mat')
    print(f"Original y shape: {y.shape}")
    
    y_encoded = encode_labels(y)
    print(f"Encoded y shape: {y_encoded.shape}")
    
    # ... rest of the code ...

In [82]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
from scipy.spatial.distance import pdist, squareform
from scipy.stats import mode
import scipy.io
import pandas as pd
import numpy as np

def load_data(file_path):
    """Load the LungA dataset from .mat file."""
    matData = scipy.io.loadmat(file_path)
    X = pd.DataFrame(matData['lungA'])
    # Extract labels and convert to list of strings
    y = np.array([str(label[0]) for label in matData['labA']])
    return X, y

def map_clusters_to_labels(cluster_labels, y):
    """Map cluster labels to original labels using mode."""
    mapped_labels = np.zeros_like(cluster_labels)
    unique_clusters = np.unique(cluster_labels)
    
    # Debug print
    print(f"Shape of cluster_labels: {cluster_labels.shape}")
    print(f"Shape of y: {y.shape}")
    
    for cluster in unique_clusters:
        mask = (cluster_labels == cluster)
        # Ensure mask and y have compatible shapes
        if len(y[mask]) > 0:
            mode_result = mode(y[mask])
            mapped_labels[mask] = mode_result.mode[0] if hasattr(mode_result, 'mode') else mode_result[0]
    return mapped_labels

def compute_mst(X):
    """Compute Minimum Spanning Tree from feature matrix."""
    distance_matrix = squareform(pdist(X, metric='euclidean'))
    return minimum_spanning_tree(distance_matrix)

def cut_mst(mst, percentile=50):
    """Cut MST to form clusters using a threshold."""
    threshold = np.percentile(mst.data[mst.data > 0], percentile)
    mst.data[mst.data > threshold] = 0
    return connected_components(csgraph=mst, directed=False)

def map_clusters_to_labels(cluster_labels, y):
    """Map cluster labels to original labels using mode."""
    mapped_labels = np.zeros_like(cluster_labels)
    for cluster in np.unique(cluster_labels):
        mask = (cluster_labels == cluster)
        mode_result = mode(y[mask])
        if hasattr(mode_result, 'mode'):
            mapped_labels[mask] = mode_result.mode[0]
        else:
            mapped_labels[mask] = mode_result[0]
    return mapped_labels

def calculate_accuracy(y_true, y_pred):
    """Calculate clustering accuracy."""
    return accuracy_score(y_true, y_pred) * 100

def print_cluster_info(cluster_labels, y, le):
    """Print information about clusters."""
    unique, counts = np.unique(cluster_labels, return_counts=True)
    print("\nCluster sizes:")
    for i, count in zip(unique, counts):
        mask = (cluster_labels == i)
        most_common_label = le.inverse_transform([mode(y[mask]).mode[0]])[0]
        print(f"Cluster {i}: {count} samples (mostly {most_common_label})")

def main():
    # Load and prepare data
    X, y = load_data('lungA.mat')
    
    # Debug print
    print("Data shapes:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    
    y_encoded = encode_labels(y)
    
    # Compute MST and form clusters
    mst = compute_mst(X)
    n_clusters, cluster_labels = cut_mst(mst)
    
    # Ensure cluster_labels has the same length as y_encoded
    if len(cluster_labels) != len(y_encoded):
        raise ValueError(f"Cluster labels length ({len(cluster_labels)}) "
                        f"doesn't match encoded labels length ({len(y_encoded)})")
    
    # Map clusters and calculate accuracy
    mapped_clusters = map_clusters_to_labels(cluster_labels, y_encoded)
    accuracy = calculate_accuracy(y_encoded, mapped_clusters)
    
    # Create LabelEncoder for cluster info printing
    le = LabelEncoder()
    le.fit(y)
    
    # Print results
    print(f"Number of clusters found: {n_clusters}")
    print(f"Accuracy for two clusters: {accuracy:.2f}%")
    print_cluster_info(cluster_labels, y_encoded, le)

if __name__ == "__main__":
    main()

Data shapes:
X shape: (3312, 203)
y shape: (1,)


ValueError: Cluster labels length (3312) doesn't match encoded labels length (1)