In [1]:
## Set Python - Spark environment.
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.6-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [2]:
## Create SparkContext, SparkSession
from os.path import expanduser, join, abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark import SparkContext
sc = SparkContext()

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs:///apps/hive/warehouse/'

spark = SparkSession \
    .builder \
    .appName("Spark Machine Learning Example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

In [3]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

In [4]:
data = sc.textFile("/user/jayantm/Uber/HistData/FlumeData.*")

In [5]:
data.count()

3296451

In [6]:
data.take(2)

[u'"12/1/2016 0:03:00",40.7586,-73.9706,"B02512"',
 u'"12/1/2016 0:05:00",40.7605,-73.9994,"B02512"']

In [7]:
def convertDataFloat(line):
    return array([float(line[1]),float(line[2])])

In [8]:
fea_data = data.map(lambda data:data.split(','))
parsedData = fea_data.map(lambda line : convertDataFloat(line))

In [9]:
parsedData.take(5)

[array([ 40.7586, -73.9706]),
 array([ 40.7605, -73.9994]),
 array([ 40.732 , -73.9999]),
 array([ 40.7635, -73.9793]),
 array([ 40.7204, -74.0047])]

In [10]:
clusters = KMeans.train(parsedData,8, maxIterations=10, initializationMode="random")

In [11]:
clusters.centers

[array([ 40.71589091, -74.00240766]),
 array([ 40.70872413, -73.94599527]),
 array([ 40.69929502, -74.20363784]),
 array([ 40.74283935, -73.9925041 ]),
 array([ 40.66707756, -73.75984324]),
 array([ 40.77038509, -73.96916379]),
 array([ 40.65909998, -73.97709984]),
 array([ 40.79906929, -73.8739884 ])]

In [12]:
def wsssError(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

In [13]:
WSSSE = parsedData.map(lambda point: wsssError(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

Within Set Sum of Squared Error = 66111.1047027


In [14]:
clusters.predict(array([40.6988701 , -74.20341933]))

2

In [15]:
sqrt(sum((array([40.7204,-74.0047]) - array([ 40.71743048, -74.002436  ])) ** 2))

0.0037341324334293183

In [14]:
#40.7204 - 40.71743048

In [15]:
#-74.0047 - -74.002436

In [14]:
clusters.save(sc, "/user/jayantm/Uber/kmeanModel")