In [3]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [4]:
# Reuse the already-running SparkContext if there is one: a bare SparkContext()
# raises "Cannot run multiple SparkContexts at once" when this cell is re-run
# in the same kernel, whereas getOrCreate() is safe to execute repeatedly.
sc = SparkContext.getOrCreate()
# SQL entry point bound to that context; used below to build DataFrames.
sqlContext = SQLContext(sc)

In [5]:
# Small in-memory dataset: five rows with the columns "other", "lat" and "long".
df = sqlContext.createDataFrame(
    [
        [0, 33.3, -17.5],
        [1, 40.4, -20.5],
        [2, 28.0, -23.9],
        [3, 29.5, -19.0],
        [4, 32.8, -18.84],
    ],
    ["other", "lat", "long"],
)

In [6]:
# Print the DataFrame to check its rows and column names.
df.show()

+-----+----+------+
|other| lat|  long|
+-----+----+------+
|    0|33.3| -17.5|
|    1|40.4| -20.5|
|    2|28.0| -23.9|
|    3|29.5| -19.0|
|    4|32.8|-18.84|
+-----+----+------+



In [7]:
from pyspark.ml.feature import VectorAssembler  # merges a set of input columns into one feature-vector column

In [8]:
# Pack the "lat" and "long" columns into a single vector column called
# "features", then display the result.
vecAssembler = VectorAssembler(
    inputCols=["lat", "long"],
    outputCol="features",
)
new_df = vecAssembler.transform(df)
new_df.show()

+-----+----+------+-------------+
|other| lat|  long|     features|
+-----+----+------+-------------+
|    0|33.3| -17.5| [33.3,-17.5]|
|    1|40.4| -20.5| [40.4,-20.5]|
|    2|28.0| -23.9| [28.0,-23.9]|
|    3|29.5| -19.0| [29.5,-19.0]|
|    4|32.8|-18.84|[32.8,-18.84]|
+-----+----+------+-------------+



In [9]:
from pyspark.ml.clustering import KMeans

In [11]:
# Configure the k-means estimator and fit it on the assembled feature column.
kmeans = KMeans(
    k=2,     # 2 clusters here
    seed=1,  # fixed seed passed to the estimator
)
model = kmeans.fit(new_df.select('features'))

In [13]:
# Apply the fitted model to the assembled DataFrame; the result carries an
# extra "prediction" column holding each row's cluster index.
transformed = model.transform(new_df)
transformed.show()  

+-----+----+------+-------------+----------+
|other| lat|  long|     features|prediction|
+-----+----+------+-------------+----------+
|    0|33.3| -17.5| [33.3,-17.5]|         0|
|    1|40.4| -20.5| [40.4,-20.5]|         1|
|    2|28.0| -23.9| [28.0,-23.9]|         0|
|    3|29.5| -19.0| [29.5,-19.0]|         0|
|    4|32.8|-18.84|[32.8,-18.84]|         0|
+-----+----+------+-------------+----------+



In [14]:
# Bring the lat/long columns back to the driver as a list of Row objects.
df.select('lat', 'long').rdd.collect()

[Row(lat=33.3, long=-17.5),
 Row(lat=40.4, long=-20.5),
 Row(lat=28.0, long=-23.9),
 Row(lat=29.5, long=-19.0),
 Row(lat=32.8, long=-18.84)]

In [15]:
# Same columns as above, but with each Row converted to a plain 2-tuple
# before collecting.
(
    df.select('lat', 'long')
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)

[(33.3, -17.5), (40.4, -20.5), (28.0, -23.9), (29.5, -19.0), (32.8, -18.84)]

In [16]:
# Import the RDD-based (mllib) KMeans under an alias so it does not shadow the
# DataFrame-based pyspark.ml.clustering.KMeans imported earlier; without the
# alias, re-running the earlier `KMeans(k=2, seed=1)` cell would resolve to
# this class instead.
from pyspark.mllib.clustering import KMeans as MLlibKMeans, KMeansModel

# The mllib trainer takes an RDD of points, so turn each Row into a
# (lat, long) tuple.
rdd = df.select('lat', 'long').rdd.map(lambda row: (row[0], row[1]))

# Random initialization is stochastic; passing a fixed seed makes the
# resulting centers reproducible from run to run.
clusters = MLlibKMeans.train(
    rdd,
    2,
    maxIterations=10,
    initializationMode="random",
    seed=1,
)

In [17]:
# Show the centers found by the mllib model (one array per cluster).
clusters.centers

[array([ 36.85, -19.  ]), array([ 30.1 , -20.58])]