In [838]:
import math
import numpy as np

def parseRDD(point):
    """ Parser para a base de dados de vertices e metricas. Recebe um ponto onde o primeiro campo
        eh o id do vertice o os 13 seguintes sao os valores para cada metrica e retorna
        uma tupla composta pelo id e pelas 13 metricas (lista de floats).
    Args:
        point (str): um ponto onde o primeiro campo
        eh o id do vertice o os 13 seguintes sao os valores para cada metrica
    Returns:
        (nodeId, nodeMetrics): uma tupla composta pelo id do vertice e uma lista das 13 metricas
    """
    data = point.split(';')
    floatMetrics = [float(m) for m in data[1:]]
    return (data[0], floatMetrics)

def notZero(parsedPoint):
    """ Retorna true se o ponto contem alguma metrica diferente de 0.
    Args:
        parsedPoint (str, list): uma tupla composta pelo id do vertice e a lista de metricas
    Returns:
        bool: True se a lista contém pelo menos um valor nao nulo
    """
    return sum(parsedPoint[1]) > 0

def normalize(parsedPoint, means, standardDeviations):
    """ Normaliza um ponto. Recebe um ponto cujas valores maximo e minimo para as metricas 
        podem ser muito amplos e retorna um ponto cujas metricas estao entre 0 e 1.0.
    Args:
        parsedPoint (str, list): uma tupla composta pelo id do vertice e a lista de metricas
        means (list): lista de medias das metricas
        standardDeviations (list): lista de desvios-padrao das metricas
    Returns:
        normalizedParsedPoint (str, list): uma tupla de id (str) e metricas normalizadas (list)
    """
    return (parsedPoint[0], [(parsedPoint[1][i] - means[i])/standardDeviations[i] for i in range(len(means))])

def euclidianDistance(parsedPointA, parsedPointB):
    """ Calcula a distancia euclidiana entre dois pontos. Recebe dois pontos e retorna a distancia
        euclidiana entre eles.
    Args:
        parsedPointA (str, list): uma tupla de id e lista de floats em que a lista contem os valores do ponto a
        parsedPointB (str, list): uma tupla de id e lista de floats em que a lista contem os valores do ponto b
    Returns:
        euclidianDistance (float): a distancia entre dois pontos
    """
    return math.sqrt(sum([math.pow(parsedPointB[1][i] - parsedPointA[1][i], 2) for i in range(len(parsedPointA[1]))]))

def generateRandomPoint(i):
    return (i, list(np.random.uniform(-1,1,13)))

def areEqualPoints(pointA, pointB):
    for i in range(len(pointA)):
        if pointA[i] != pointB[i]:
            return False
    return True

# FIX: mutable number of centroids 
def kmeans(data, k, iteractions):
    centroids = []
    for i, value in enumerate(range(k)):
        centroids.append(generateRandomPoint(str(i)))
            
    for i in range(iteractions):
        print('kmeans interaction: %s - centroids: %s)' %(i + 1, len(centroids)))
        clustersRDD = data.map(lambda x:(np.argmin([euclidianDistance(x, c) for c in centroids]), x[0], x[1]))
        newCentroids = sorted((clustersRDD
         .map(lambda j:(j[0], j[2]))
         .reduceByKey(lambda l,m:((np.array(l)+np.array(m))/len(l)))
         .map(lambda z:(str(z[0]), list(z[1])))
        ).collect(), key=lambda x:int(x[0]))
        
        centroidHasChanged = False
        
        for i in newCentroids:
            if i[1] not in [c[1] for c in centroids]:
                centroidHasChanged = True           
        
        if not centroidHasChanged:
            print('centroids have not changed')
            return clustersRDD
        else:
            centroids = newCentroids
    
    return clustersRDD

# TODO: implement cluster evaluation
def bisectKmeans(data, k, iteractions):
    finalClusters = []
    clusterToSplit = data
    for i in range(k):
        print('bisect interaction: %s - clusters: %s' %(i, len(finalClusters)))
        clustersRDD = kmeans(data=clusterToSplit, k=2, iteractions=iteractions)
        finalClusters.append(clustersRDD.filter(lambda x:x[0] == 0))
        clusterToSplit = clustersRDD.filter(lambda x:x[0] != 0).map(lambda x:(x[1],x[2]))
    return finalClusters

In [800]:
fileName = os.path.join('/home/rafael/', 'metricas_t.csv')
rawRDD = sc.textFile(fileName)
metricsHeader = rawRDD.take(1)[0]
metricsRDD = (rawRDD
              .filter(lambda x: x != metricsHeader)
              .map(lambda x:parseRDD(x))
              .filter(lambda x: notZero(x))
             )

In [801]:
means = []
stdevs = []
for i in range(13):
    metricI = metricsRDD.map(lambda x: x[1][i])
    means.append(metricI.mean())
    stdevs.append(metricI.stdev())

In [803]:
normalizedMetricsRDD = metricsRDD.map(lambda x:normalize(x, means, stdevs))

In [839]:
clusters = bisectKmeans(data=normalizedMetricsRDD, iteractions=100, k=10)

bisect interaction: 0 - clusters: 0
kmeans interaction: 1 - centroids: 2)
kmeans interaction: 2 - centroids: 2)
kmeans interaction: 3 - centroids: 2)
centroids have not changed
bisect interaction: 1 - clusters: 1
kmeans interaction: 1 - centroids: 2)
kmeans interaction: 2 - centroids: 2)
kmeans interaction: 3 - centroids: 2)
kmeans interaction: 4 - centroids: 2)
kmeans interaction: 5 - centroids: 2)
centroids have not changed
bisect interaction: 2 - clusters: 2
kmeans interaction: 1 - centroids: 2)
kmeans interaction: 2 - centroids: 1)
centroids have not changed
bisect interaction: 3 - clusters: 3
kmeans interaction: 1 - centroids: 2)
centroids have not changed
bisect interaction: 4 - clusters: 4
kmeans interaction: 1 - centroids: 2)
centroids have not changed
bisect interaction: 5 - clusters: 5
kmeans interaction: 1 - centroids: 2)
centroids have not changed
bisect interaction: 6 - clusters: 6
kmeans interaction: 1 - centroids: 2)
centroids have not changed
bisect interaction: 7 - clu

In [867]:
for c in clusters:
    print(c.take(5))
    print('\n')

[(0, '6', [-0.1468720813469858, 2.1157389497824437, -0.2570665808902406, 1.5276195130512888, -0.1423136133929641, 1.7124298672563634, 0.16374740861731052, -0.10806230738849276, -0.33280325827468865, 0.5337524871726095, -0.14600709871060533, 2.455439618966183, -0.1534474453512897]), (0, '7', [-0.1468720813469858, 0.6088988198040558, -0.2570665808902406, 1.5276195130512888, -0.1423136133929641, 1.7124298672563634, 3.606208550627554, -0.10806230738849276, -0.33280325827468865, -0.19206309511009, -0.14600709871060533, 0.7789044029190674, -0.1534474453512897]), (0, '10', [0.3448193288871933, 0.39363594409285746, 1.7387431172060281, 1.5276195130512888, -0.1423136133929641, 1.7124298672563634, 0.08701575195649458, -0.10806230738849276, 1.4517417806901278, -0.19206309511009, 0.3401782724002054, 0.3597705989072884, -0.1534474453512897]), (0, '14', [-0.1468720813469858, 0.17837306838165914, -0.2570665808902406, 1.5276195130512888, -0.1423136133929641, -0.1410675995393959, 1.2414784044442253, -0.

In [None]:
len(list(clusters.groupBy(lambda x:x[0]).take(10)[8][1]))