In [1]:
import math
import numpy as np

def parseRDD(point):
    """ Parser para a base de dados de vertices e metricas. Recebe um ponto onde o primeiro campo
        eh o id do vertice o os 13 seguintes sao os valores para cada metrica e retorna
        uma tupla composta pelo id e pelas 13 metricas (lista de floats).
    Args:
        point (str): um ponto onde o primeiro campo
        eh o id do vertice o os 13 seguintes sao os valores para cada metrica
    Returns:
        (nodeId, nodeMetrics): uma tupla composta pelo id do vertice e uma lista das 13 metricas
    """
    data = point.split(';')
    floatMetrics = [float(m) for m in data[1:]]
    return (data[0], floatMetrics)

def notZero(parsedPoint):
    """ Retorna true se o ponto contem alguma metrica diferente de 0.
    Args:
        parsedPoint (str, list): uma tupla composta pelo id do vertice e a lista de metricas
    Returns:
        bool: True se a lista contém pelo menos um valor nao nulo
    """
    return sum(parsedPoint[1]) > 0

def normalize(parsedPoint, means, standardDeviations):
    """ Normaliza um ponto. Recebe um ponto cujas valores maximo e minimo para as metricas 
        podem ser muito amplos e retorna um ponto cujas metricas estao entre 0 e 1.0.
    Args:
        parsedPoint (str, list): uma tupla composta pelo id do vertice e a lista de metricas
        means (list): lista de medias das metricas
        standardDeviations (list): lista de desvios-padrao das metricas
    Returns:
        normalizedParsedPoint (str, list): uma tupla de id (str) e metricas normalizadas (list)
    """
    return (parsedPoint[0], [(parsedPoint[1][i] - means[i])/standardDeviations[i] for i in range(len(means))])

def euclidianDistance(parsedPointA, parsedPointB):
    """ Calcula a distancia euclidiana entre dois pontos. Recebe dois pontos e retorna a distancia
        euclidiana entre eles.
    Args:
        parsedPointA (str, list): uma tupla de id e lista de floats em que a lista contem os valores do ponto a
        parsedPointB (str, list): uma tupla de id e lista de floats em que a lista contem os valores do ponto b
    Returns:
        euclidianDistance (float): a distancia entre dois pontos
    """
    return math.sqrt(sum([math.pow(parsedPointB[1][i] - parsedPointA[1][i], 2) for i in range(len(parsedPointA[1]))]))

def generateRandomPoint(i):
    return (i, list(np.random.uniform(-1,1,13)))

def arePointsDifferent(pointA, pointB):
    for i in range(len(pointA)):
        if pointA[i] != pointB[i]:
            return True
    return False
        
# FIX: mutable number of centroids 
def kmeans(data, k, iteractions):
    centroids = []
    for i, value in enumerate(range(k)):
        centroids.append(generateRandomPoint(str(i)))
            
    for i in range(iteractions):
        print('\tkmeans interaction: %s' %(i + 1))
        clustersRDD = data.map(lambda x:(np.argmin([euclidianDistance(x, c) for c in centroids]), x[0], x[1]))
        newCentroids = sorted((clustersRDD
         .map(lambda j:(j[0], j[2]))
         .reduceByKey(lambda l,m:((np.array(l)+np.array(m))/len(l)))
         .map(lambda z:(str(z[0]), list(z[1])))
        ).collect(), key=lambda x:int(x[0]))
        
        # temporary fix
        if len(newCentroids) != len(centroids):
            diff = len(centroids) - len(newCentroids)
            for i in range(diff):
                newCentroids.append(generateRandomPoint(str(i)))
        
        centroidHasChanged = False
        
        for i in range(len(centroids)):
            centroidHasChanged = arePointsDifferent(centroids[1][i], newCentroids[1][i])
        
        if not centroidHasChanged:
            print('\t\tcentroids have not changed')
            return clustersRDD
        else:
            centroids = newCentroids
    
    return clustersRDD

# TODO: implement cluster evaluation
def bisectKmeans(data, k, iteractions):
    finalClusters = []
    clusterToSplit = data
    for i in range(k):
        print('bisect interaction: %s - clusters: %s' %(i, len(finalClusters)))
        clustersRDD = kmeans(data=clusterToSplit, k=2, iteractions=iteractions)
        finalClusters.append(clustersRDD.filter(lambda x:x[0] == 0))
        clusterToSplit = clustersRDD.filter(lambda x:x[0] != 0).map(lambda x:(x[1],x[2]))
    return finalClusters

In [2]:
fileName = os.path.join('/home/rafael/', 'metricas_t.csv')
rawRDD = sc.textFile(fileName)
metricsHeader = rawRDD.take(1)[0]
metricsRDD = (rawRDD
              .filter(lambda x: x != metricsHeader)
              .map(lambda x:parseRDD(x))
              .filter(lambda x: notZero(x))
             )

In [3]:
means = []
stdevs = []
for i in range(13):
    metricI = metricsRDD.map(lambda x: x[1][i])
    means.append(metricI.mean())
    stdevs.append(metricI.stdev())

In [4]:
normalizedMetricsRDD = metricsRDD.map(lambda x:normalize(x, means, stdevs))

In [5]:
clusters = bisectKmeans(data=normalizedMetricsRDD, iteractions=30, k=10)

bisect interaction: 0 - clusters: 0
	kmeans interaction: 1
	kmeans interaction: 2
	kmeans interaction: 3
		centroids have not changed
bisect interaction: 1 - clusters: 1
	kmeans interaction: 1
	kmeans interaction: 2
	kmeans interaction: 3
	kmeans interaction: 4
	kmeans interaction: 5
	kmeans interaction: 6
	kmeans interaction: 7
	kmeans interaction: 8
	kmeans interaction: 9
	kmeans interaction: 10
	kmeans interaction: 11
	kmeans interaction: 12
	kmeans interaction: 13
	kmeans interaction: 14
	kmeans interaction: 15
	kmeans interaction: 16
	kmeans interaction: 17
	kmeans interaction: 18
	kmeans interaction: 19
	kmeans interaction: 20
	kmeans interaction: 21
	kmeans interaction: 22
	kmeans interaction: 23
	kmeans interaction: 24
	kmeans interaction: 25
	kmeans interaction: 26
	kmeans interaction: 27
	kmeans interaction: 28
	kmeans interaction: 29
	kmeans interaction: 30
bisect interaction: 2 - clusters: 2
	kmeans interaction: 1
	kmeans interaction: 2
	kmeans interaction: 3
	kmeans intera

In [6]:
for c in clusters:
    print(c.count())

949
141
11
79
49
11
500
99
130
15
