In [14]:
import pandas as pd
import numpy as np

'''
Some functions for loading a dataset and performing simple data preparation
'''

def load_dataset(filename, filetype='csv', header=True):

    '''
    Loads a dataset from file
    
    Parameters:
    -----------
    filename: str
        Name of data file
    filetype: str
        The type of data file (csv, tsv)
    Returns:
    --------
    DataFrame
        Dataset as pandas DataFrame
    '''

    in_file = open(filename)
    data = []
    header_row = ''

    # Read the file line by line into instance structure
    for line in in_file.readlines():

        # Skip comments
        if not line.startswith("#"):
            
            # TSV file
            if filetype == 'tsv':
                if header:
                    header_row = line.strip().split('\t')
                else:
                    raw = line.strip().split('\t')
                    
            # CSV file
            elif filetype =='csv':
                if header:
                    header_row = line.strip().split(',')
                else:
                    raw = line.strip().split(',')
            
            # Neither = problem
            else:
                print ('Invalid file type')
                exit()

            # Append to dataset appropriately
            if not header:
                data.append(raw)
            header = False
    
    # Build a new dataframe of the data instance list of lists and return
    df = pd.DataFrame(data, columns=header_row)
    return df


def to_numeric(dataset, attr_name):
    
    '''
    Performs a simple categorical to numeric attribute value transformation
    
    Parameters:
    -----------
    dataset: DataFrame
        Dataset on which to perform transformation
    attr_name: str
        Dataset attribute name to convert from nominal to numeric values
    Returns:
    --------
    DataFrame
        DataFrame of with data transformation performed
    dict
        Python dictionary of attribute name to integer mappings
    '''
    
    # Get unique entries in column
    unique_vals = dataset[attr_name].unique()
    
    # Create dict
    val_dict = {}
    for val in unique_vals:
        if not val in val_dict:
            val_dict[val] = len(val_dict)
    
    # Replace values in attr_name col as per dict
    dataset[attr_name].replace(val_dict, inplace=True)
    #print val_dict
    # Return dataset and value dictionary
    return dataset, val_dict
  

def from_str(dataset, attrs):
    
    '''
    Performs numeric values stored as strings to numeric value transformation
    
    Parameters:
    -----------
    dataset: DataFrame
        Dataset on which to perform transformation
    attr_name: str
        Dataset attribute name to convert from strings to equivalent numeric values
    Returns:
    --------
    DataFrame
        DataFrame with data transformation performed
    '''

    # Make conversions on list of attributes
    if type(attrs) == list:
        for attr_name in attrs:
            dataset[attr_name] = dataset[attr_name].astype(float)

    # Make conversion on single attribute
    else:
        data[attrs] = data[attrs].astype(float).fillna(0.0)
    
    # Return dataset after conversion
    return dataset


def to_matrix(dataset):
    
    '''
    Converts a pandas DataFrame dataset to a numpy matrix representation
    
    Parameters:
    -----------
    dataset: DataFrame
        Dataset to convert to matrix representation
    Returns:
    --------
    ndarray
        numpy ndarray representation of dataset
    '''
    
    return dataset.as_matrix()

In [18]:
from math import sqrt, floor
import numpy as np
import scipy.spatial.distance as metric

'''
k-means clustering algorithm
'''

def initialize(ds, k):

	'''
	Create random cluster centroids
	Parameters:
	-----------
	ds: ndarray
		Dataset to cluster instances from
	k: int
		The number of clusters
	Returns:
	--------
	ndarray
		centroids as numpy ndarray
	'''

	# Number of columns in dataset
	n = np.shape(ds)[1]

	# The centroids
	centroids = np.mat(np.zeros((k,n)))

	# Create random centroids
	for j in range(n):
		min_j = min(ds[:,j])
		range_j = float(max(ds[:,j]) - min_j)
		centroids[:,j] = min_j + range_j * np.random.rand(k, 1)

	# Return centroids as numpy array
	return centroids


def euclidean_dist(A, B):

	'''
	Calculate Euclidean distance between 2 n-dimension points
	Parameters:
	-----------
	A: ndarray
		vector of point coordinates to compare
	B: ndarray
		vector of point coordinates to compare
	Returns:
	--------
	float
		calculated Euclidean distance of the 2 vectors
	'''

	return metric.euclidean(A, B)


def cluster(ds, k):

	'''
	The k-means clustering algorithm
	Parameters:
	-----------
	ds: ndarray
		Dataset to cluster instances from
	k: int
		The number of clusters
	Returns:
	--------
	ndarray
		Resulting centroids after clustering
	ndarray
		Cluster assignments after clustering
	int
		Number of iterations required by clustering algorithm
	ndarray
		Original centroids
	'''

	# Number of rows in dataset
	m = np.shape(ds)[0]

	# Hold the instance cluster assignments
	cluster_assignments = np.mat(np.zeros((m, 2)))

	# Initialize centroids
	cents = initialize(ds, k)

	# Preserve original centroids
	cents_orig = cents.copy()

	changed = True
	num_iter = 0

	# Loop until no changes to cluster assignments
	while changed:

		changed = False

		# For every instance (row in dataset)
		for i in range(m):

			# Track minimum distance, and vector index of associated cluster
			min_dist = np.inf
			min_index = -1

			# Calculate distances
			for j in range(k):

				dist_ji = euclidean_dist(cents[j,:], ds[i,:])
				if dist_ji < min_dist:
					min_dist = dist_ji
					min_index = j

			# Check if cluster assignment of instance has changed
			if cluster_assignments[i, 0] != min_index: 
				changed = True

			# Assign instance to appropriate cluster
			cluster_assignments[i, :] = min_index, min_dist**2

		# Update centroid location
		for cent in range(k):
			points = ds[np.nonzero(cluster_assignments[:,0].A==cent)[0]]
			cents[cent,:] = np.mean(points, axis=0)

		# Count iterations
		num_iter += 1

	# Return important stuff when done
        return cents,cluster_assignments,num_iter,cents_orig

TabError: inconsistent use of tabs and spaces in indentation (<ipython-input-18-16e7418dc4a1>, line 133)

In [None]:
import scipy.spatial.distance as metric

def euclidean_dist(A, B):

    '''
    Calculate Euclidean distance between 2 n-dimension points
    
    Parameters:
    -----------
    A: ndarray
        vector of point coordinates to compare
    B: ndarray
        vector of point coordinates to compare
    Returns:
    --------
    float
        calculated Euclidean distance of the 2 vectors
    '''
    
    return metric.euclidean(A, B)

In [None]:
import numpy as np

def cluster(ds, k):

    '''
    The k-means clustering algorithm
    
    Parameters:
    -----------
    ds: ndarray
        Dataset to cluster instances from
    k: int
        The number of clusters
    Returns:
    --------
    ndarray
        Resulting centroids after clustering
    ndarray
        Cluster assignments after clustering
    int
        Number of iterations required by clustering algorithm
    ndarray
        Original centroids
    '''

    # Number of rows in dataset
    m = np.shape(ds)[0]

    # Hold the instance cluster assignments
    cluster_assignments = np.mat(np.zeros((m, 2)))

    # Initialize centroids
    cents = initialize(ds, k)
    
    # Preserve original centroids
    cents_orig = cents.copy()
    
    changed = True
    num_iter = 0

    # Loop until no changes to cluster assignments
    while changed:

        changed = False

        # For every instance (row in dataset)
        for i in range(m):

            # Track minimum distance, and vector index of associated cluster
            min_dist = np.inf
            min_index = -1

            # Calculate distances
            for j in range(k):

                dist_ji = euclidean_dist(cents[j,:], ds[i,:])
                if dist_ji < min_dist:
                    min_dist = dist_ji
                    min_index = j

            # Check if cluster assignment of instance has changed
            if cluster_assignments[i, 0] != min_index: 
                changed = True

            # Assign instance to appropriate cluster
            cluster_assignments[i, :] = min_index, min_dist**2

        # Update centroid location
        for cent in range(k):
            points = ds[np.nonzero(cluster_assignments[:,0].A==cent)[0]]
            cents[cent,:] = np.mean(points, axis=0)

        # Count iterations
        num_iter += 1

    # Return important stuff when done
    return cents, cluster_assignments, num_iter, cents_orig

In [None]:
import dataset
import kmeans

# Load dataset
iris_data = dataset.load_dataset('iris.csv')

# Convert class names to numeric representations
iris_data, iris_classes = dataset.to_numeric(iris_data, 'species')

# Convert dataframe strings to floats
attrs_conv = list(iris_data.axes[1][:-1])
data = dataset.from_str(iris_data, attrs_conv)

# Convert dataset to matrix representation
iris_ds = dataset.to_matrix(iris_data)

# Perform k-means clustering
centroids, cluster_assignments, iters, orig_centroids = kmeans.cluster(iris_ds, 3)

# Output results
print ('Number of iterations:', iters)
print ('\nFinal centroids:\n', centroids)
print ('\nCluster membership and error of first 10 instances:\n', cluster_assignments[:10])
print ('\nOriginal centroids:\n', orig_centroids)