In [14]:
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
The K-means algorithm written from scratch against PySpark. In practice,
one may prefer to use the KMeans algorithm in ML, as shown in
examples/src/main/python/ml/kmeans_example.py.

This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function

import sys

import numpy as np
from datetime import datetime
from pyspark.sql import SparkSession

In [3]:
def parseVector(line):
    """Convert one comma-separated text record into a NumPy float vector.

    Every field of ``line`` is passed through ``float``; a field that is
    not a valid float literal therefore raises ``ValueError``.
    """
    fields = line.split(',')
    return np.array(list(map(float, fields)))


def closestPoint(p, centers):
    """Return the index of the entry in ``centers`` nearest to ``p``.

    Nearness is the squared Euclidean distance ``sum((p - center) ** 2)``.
    Ties keep the lowest index, and 0 is returned when no center compares
    strictly closer than +inf (for example when ``centers`` is empty).
    """
    winner = 0
    smallest = float("+inf")
    for idx, center in enumerate(centers):
        offset = p - center
        sqDist = np.sum(offset ** 2)
        # Strict comparison: an equally distant later center never replaces
        # an earlier one.
        if sqDist < smallest:
            winner, smallest = idx, sqDist
    return winner

In [60]:
# K-means driver: load the points, pick K random starting centers, then
# alternate "assign each point to its closest center" and "move each center
# to the mean of its points" until the centers stop moving.
spark = SparkSession\
    .builder\
    .appName("PythonKMeans")\
    .getOrCreate()

# try/finally so the Spark session is stopped even when a job in this cell
# fails part-way through.
try:
    # spark.read.text yields rows with one column; r[0] is the raw line.
    lines = spark.read.text('smallpointdata2018.txt').rdd.map(lambda r: r[0])
    # The earlier version called lines.takeSample(True, 10000000, 1) here and
    # threw the result away: up to ten million samples drawn with replacement
    # for nothing. That call has been removed.

    # Cached because the RDD is re-scanned on every iteration below.
    data = lines.map(parseVector).cache()
    K = 4                   # number of clusters
    convergeDist = 0.0001   # stop once total squared center movement <= this

    # The initial centers are chosen with a wall-clock seed; print it so a
    # particular run can be reproduced by passing the same value again.
    seed = int(datetime.now().timestamp())
    print("takeSample seed: " + str(seed))
    kPoints = data.takeSample(False, K, seed)
    print(kPoints)

    tempDist = 1.0  # any value above convergeDist, to enter the loop
    i = 0           # number of completed iterations
    while tempDist > convergeDist:
        # (index of closest center, (point, 1)) for every point.
        closest = data.map(lambda p: (closestPoint(p, kPoints), (p, 1)))
        # Per center: (sum of assigned points, number of assigned points).
        pointStats = closest.reduceByKey(lambda p1_c1, p2_c2:
                                         (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1]))
        # Per center: (index, mean of assigned points). A center with no
        # assigned points has no entry here and is left where it is.
        newPoints = pointStats.map(lambda st: (st[0], st[1][0] / st[1][1])).collect()
        # Total squared distance the centers moved in this iteration.
        tempDist = sum(np.sum((kPoints[iK] - p) ** 2) for (iK, p) in newPoints)
        print(tempDist)
        # Move each updated center to its new coordinates.
        for (iK, p) in newPoints:
            kPoints[iK] = p
        i += 1

    print("Final centers: " + str(kPoints))
finally:
    spark.stop()

[array([43.10497168, 41.16390415, 43.93244262, 42.41966085, 41.06097828]), array([4.9031759 , 3.97454354, 2.96748017, 4.40817375, 3.23359892]), array([61.83426467, 64.71737169, 64.7113976 , 63.60758347, 64.72639195]), array([24.90271732, 21.29497081, 24.92517731, 20.6080589 , 20.47811634])]
1758.114863489408
335.78956989415286
204.0331831532172
0.0
Final centers: [array([52.61451094, 52.45972538, 52.50091443, 52.34306416, 52.47587122]), array([3.36525373, 3.35021267, 3.42078562, 3.55184528, 3.44857118]), array([92.46972701, 92.58677669, 92.59040945, 92.44669281, 92.50228411]), array([22.48575648, 22.50197355, 22.68095344, 22.57773126, 22.47048334])]
