In [1]:
## Set Python - Spark environment.
import os
import sys

# SPARK_HOME is the Spark 2 client install on the cluster; PYLIB is derived
# from it and is where the zipped Python libraries shipped with Spark live.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Put both bundled archives at the very front of the import path in a single
# step: pyspark.zip first, then the py4j sources right behind it.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.6-src.zip",
]

In [2]:
## Create SparkContext, SparkSession
from os.path import expanduser, join, abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark import SparkContext

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs:///apps/hive/warehouse/'

# Build the session first and let it create (or reuse) the underlying context.
# Calling SparkContext() directly before the builder starts the context without
# the application name set below, and it raises on a second run of this cell
# because only one SparkContext may be active per process. getOrCreate() is
# safe to re-run.
spark = SparkSession \
    .builder \
    .appName("Spark Machine Learning Example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

# Expose the session's context under the name the rest of the notebook uses.
sc = spark.sparkContext

In [35]:
# Read every file matching the FlumeData.* glob as one RDD of text lines.
uberdata = sc.textFile(name="/user/2052B43/Uber/HistData/FlumeData.*")

In [36]:
# Number of lines (one trip record per line) read from the matched files.
uberdata.count()

3296451

In [37]:
# Peek at the first five raw lines to see the record layout before parsing.
uberdata.take(5)

[u'"12/1/2016 0:03:00",40.7586,-73.9706,"B02512"',
 u'"12/1/2016 0:05:00",40.7605,-73.9994,"B02512"',
 u'"12/1/2016 0:06:00",40.732,-73.9999,"B02512"',
 u'"12/1/2016 0:09:00",40.7635,-73.9793,"B02512"',
 u'"12/1/2016 0:20:00",40.7204,-74.0047,"B02512"']

In [38]:
# Break each raw line into its comma-separated fields. This is a plain string
# split, not CSV parsing, so quoted fields keep their surrounding quotes.
splitdata = uberdata.map(lambda line: line.split(","))

In [39]:
# Inspect the first ten split records (each is a list of four string fields).
splitdata.take(10)

[[u'"12/1/2016 0:03:00"', u'40.7586', u'-73.9706', u'"B02512"'],
 [u'"12/1/2016 0:05:00"', u'40.7605', u'-73.9994', u'"B02512"'],
 [u'"12/1/2016 0:06:00"', u'40.732', u'-73.9999', u'"B02512"'],
 [u'"12/1/2016 0:09:00"', u'40.7635', u'-73.9793', u'"B02512"'],
 [u'"12/1/2016 0:20:00"', u'40.7204', u'-74.0047', u'"B02512"'],
 [u'"12/1/2016 0:35:00"', u'40.7487', u'-73.9869', u'"B02512"'],
 [u'"12/1/2016 0:57:00"', u'40.7444', u'-73.9961', u'"B02512"'],
 [u'"12/1/2016 0:58:00"', u'40.7132', u'-73.9492', u'"B02512"'],
 [u'"12/1/2016 1:04:00"', u'40.759', u'-73.973', u'"B02512"'],
 [u'"12/1/2016 1:08:00"', u'40.7601', u'-73.9823', u'"B02512"']]

In [40]:
# Draw a 2% sample without replacement; the fixed seed keeps it repeatable.
sampledata = splitdata.sample(withReplacement=False, fraction=0.02, seed=4567)

In [98]:
# Number of records that ended up in the sample.
sampledata.count()

65922

In [42]:
# Pair every sampled record with a running index, then keep (index, lat, long).
# zipWithIndex() yields (record, index) tuples; they are indexed explicitly
# instead of using `lambda (x, y): ...`, because tuple-parameter unpacking is a
# SyntaxError on Python 3. record[1] is the latitude field, record[2] the
# longitude field; both are still strings at this point.
df = sampledata.zipWithIndex() \
    .map(lambda pair: (pair[1], pair[0][1], pair[0][2])) \
    .toDF(["index", "lat", "long"])

In [43]:
# Check the column types of the freshly built DataFrame.
df.dtypes

[('index', 'bigint'), ('lat', 'string'), ('long', 'string')]

In [44]:
## Cast the string coordinate columns to float
from pyspark.sql.types import FloatType

# Both casts are applied in one chained expression; each withColumn call
# replaces the existing column of the same name.
df = (
    df.withColumn("lat", df["lat"].cast(FloatType()))
      .withColumn("long", df["long"].cast(FloatType()))
)

In [45]:
# Re-check the column types of df.
df.dtypes

[('index', 'bigint'), ('lat', 'float'), ('long', 'float')]

In [46]:
## Remove the helper "index" column; only the coordinates are needed from here on
df = df.drop("index")

In [47]:
# Preview the first five rows of df.
df.show(5)

+-------+--------+
|    lat|    long|
+-------+--------+
| 40.759| -73.973|
|40.7608|-73.9988|
|40.7782| -73.959|
|40.7462|-73.9835|
|40.7593|-73.9749|
+-------+--------+
only showing top 5 rows



In [49]:
## Converting the Spark DataFrame to a pandas DataFrame
# toPandas() collects every row to the driver, so this is only reasonable
# for data that fits in driver memory.
pandasDF = df.toPandas()

In [50]:
# First rows of the pandas copy.
pandasDF.head()

Unnamed: 0,lat,long
0,40.758999,-73.973
1,40.760799,-73.998802
2,40.778198,-73.959
3,40.746201,-73.983498
4,40.7593,-73.974899


In [83]:
## Assemble the numeric columns into a single vector column
## The output column is named "features", which the clustering step reads
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    outputCol="features",
    inputCols=["lat", "long"],
)
features = assembler.transform(df)

# Print the rows at full column width so the whole vector is visible.
features.show(truncate=False)

+-------+--------+---------------------------------------+
|lat    |long    |features                               |
+-------+--------+---------------------------------------+
|40.759 |-73.973 |[40.75899887084961,-73.9729995727539]  |
|40.7608|-73.9988|[40.760799407958984,-73.9988021850586] |
|40.7782|-73.959 |[40.7781982421875,-73.95899963378906]  |
|40.7462|-73.9835|[40.74620056152344,-73.9834976196289]  |
|40.7593|-73.9749|[40.759300231933594,-73.97489929199219]|
|40.714 |-74.0145|[40.7140007019043,-74.0145034790039]   |
|40.7366|-73.997 |[40.73659896850586,-73.99700164794922] |
|40.7376|-74.0067|[40.73759841918945,-74.00669860839844] |
|40.7243|-73.9999|[40.724300384521484,-73.9999008178711] |
|40.7559|-73.9728|[40.75590133666992,-73.9728012084961]  |
|40.7548|-73.9778|[40.754798889160156,-73.97779846191406]|
|40.7618|-73.9943|[40.76179885864258,-73.99430084228516] |
|40.7614|-73.9823|[40.76139831542969,-73.9822998046875]  |
|40.7386|-73.9919|[40.73860168457031,-73.99189758300781]

In [84]:
## Fit K-means on the assembled feature vectors
from pyspark.ml.clustering import KMeans

# Seven clusters; the fixed seed makes centroid initialisation repeatable.
kmeans = KMeans().setK(7).setSeed(123)
model = kmeans.fit(features)

In [85]:
# Evaluate clustering by computing the Within Set Sum of Squared Errors (WSSSE)
# of the fitted model on the same data it was trained on.
# NOTE(review): computeCost appears to be deprecated in later Spark releases —
# confirm availability before moving this notebook to a newer Spark version.
wssse = model.computeCost(features)
print("Within Set Sum of Squared Errors = " + str(wssse))

Within Set Sum of Squared Errors = 89.1375170727


In [86]:
# Print the centre of every cluster, one per line.
# Each centre is a two-element array in the order of the assembled feature
# vector, i.e. [lat, long].
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[ 40.76192691 -73.87442296]
[ 40.76395276 -73.97433094]
[ 40.68709434 -73.96435953]
[ 40.65714742 -73.78045657]
[ 40.72851382 -74.00458842]
[ 40.7707666  -73.50339872]
[ 40.88578614 -73.8975529 ]


In [92]:
# Assign each row to a cluster; the fitted model appends a "prediction" column.
tr_pred = model.transform(dataset=features)

In [93]:
# Show the Python type of the transform result.
type(tr_pred)

pyspark.sql.dataframe.DataFrame

In [94]:
# Row count of the scored DataFrame.
tr_pred.count()

65922

In [97]:
# Preview the first ten scored rows, including the assigned cluster id.
tr_pred.show(10)

+-------+--------+--------------------+----------+
|    lat|    long|            features|prediction|
+-------+--------+--------------------+----------+
| 40.759| -73.973|[40.7589988708496...|         1|
|40.7608|-73.9988|[40.7607994079589...|         1|
|40.7782| -73.959|[40.7781982421875...|         1|
|40.7462|-73.9835|[40.7462005615234...|         1|
|40.7593|-73.9749|[40.7593002319335...|         1|
| 40.714|-74.0145|[40.7140007019043...|         4|
|40.7366| -73.997|[40.7365989685058...|         4|
|40.7376|-74.0067|[40.7375984191894...|         4|
|40.7243|-73.9999|[40.7243003845214...|         4|
|40.7559|-73.9728|[40.7559013366699...|         1|
+-------+--------+--------------------+----------+
only showing top 10 rows



In [105]:
# Persist the fitted model. A plain save() fails when the target path already
# exists, which breaks re-running the notebook; going through the writer with
# overwrite() replaces any previously saved copy at that path instead.
model.write().overwrite().save("/user/2052B43/Uber/kmeanModel")