In [1]:
import findspark
findspark.init('/home/karan/spark-2.1.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
Spark = SparkSession.builder.appName('UberCSV').getOrCreate()
from pyspark.sql.types import *

In [4]:
schema = StructType([StructField('Dt', TimestampType(), True),
                     StructField('Lat', DoubleType(), True),
                     StructField('Lon', DoubleType(), True),
                     StructField('Base', StringType(), True)])

In [29]:
df = Spark.read.format("csv").option("header","true").load("/home/karan/Downloads/uber-raw-data-jul14.csv")

In [16]:
from pyspark.sql import SQLContext

In [17]:
conf = pyspark.SparkConf()
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [18]:
sqlContext = SQLContext(sc)

In [68]:
df.printSchema()

root
 |-- Dt: string (nullable = true)
 |-- Lat: string (nullable = true)
 |-- Lon: string (nullable = true)
 |-- Base: string (nullable = true)



In [61]:
from pyspark.sql.functions import unix_timestamp
cs1 =  df.withColumn('Dt' ,unix_timestamp('Dt', 'MM/dd/yyyy HH:mm:ss').cast(TimestampType()))

In [69]:
cs2 =  cs1.withColumn("Lat", df["Lat"].cast(DoubleType()))
cs =  cs2.withColumn("Lon", df["Lon"].cast(DoubleType()))

In [70]:
featureCols = ["Lat", "Lon"]

In [72]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [75]:
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [76]:
df2 = assembler.transform(cs)

In [78]:
df2.cache

<bound method DataFrame.cache of DataFrame[Dt: timestamp, Lat: double, Lon: double, Base: string, features: vector]>

In [79]:
df2.show()

+--------------------+-------+--------+------+------------------+
|                  Dt|    Lat|     Lon|  Base|          features|
+--------------------+-------+--------+------+------------------+
|2014-07-01 00:03:...|40.7586|-73.9706|B02512|[40.7586,-73.9706]|
|2014-07-01 00:05:...|40.7605|-73.9994|B02512|[40.7605,-73.9994]|
|2014-07-01 00:06:...| 40.732|-73.9999|B02512| [40.732,-73.9999]|
|2014-07-01 00:09:...|40.7635|-73.9793|B02512|[40.7635,-73.9793]|
|2014-07-01 00:20:...|40.7204|-74.0047|B02512|[40.7204,-74.0047]|
|2014-07-01 00:35:...|40.7487|-73.9869|B02512|[40.7487,-73.9869]|
|2014-07-01 00:57:...|40.7444|-73.9961|B02512|[40.7444,-73.9961]|
|2014-07-01 00:58:...|40.7132|-73.9492|B02512|[40.7132,-73.9492]|
|2014-07-01 01:04:...| 40.759| -73.973|B02512|  [40.759,-73.973]|
|2014-07-01 01:08:...|40.7601|-73.9823|B02512|[40.7601,-73.9823]|
|2014-07-01 01:12:...|40.6951|-74.1784|B02512|[40.6951,-74.1784]|
|2014-07-01 01:23:...|40.7203|-73.9992|B02512|[40.7203,-73.9992]|
|2014-07-0

In [81]:
from pyspark.ml.clustering import KMeans

In [88]:
kmeans = KMeans().setK(20).setFeaturesCol("features").setPredictionCol("cid").setSeed(1)

In [89]:
model = kmeans.fit(df2)

In [90]:
centers = model.clusterCenters()

In [92]:
print('Cluster Centers: ')
for center in centers:
    print(center)

Cluster Centers: 
[ 40.71229703 -73.94646387]
[ 40.7404019  -73.99366038]
[ 40.64592574 -73.78270484]
[ 40.71905211 -74.00141632]
[ 40.65846337 -74.41839735]
[ 40.22300484 -74.04608677]
[ 40.99583714 -73.77298975]
[ 40.75968814 -73.97900913]
[ 40.78051135 -73.9572528 ]
[ 40.85188784 -73.92665342]
[ 40.80210798 -73.07387689]
[ 40.76979444 -73.87415369]
[ 40.75550935 -73.74543929]
[ 40.62696132 -73.97442048]
[ 40.71298379 -73.83936542]
[ 40.73126006 -73.60987121]
[ 40.68102204 -73.9800034 ]
[ 40.74165933 -74.04418609]
[ 40.70316743 -74.18307116]
[ 40.78519051 -73.43029378]


In [93]:
kmeansmodelsummary = model.summary

In [94]:
dff = kmeansmodelsummary.predictions

In [96]:
dff.createOrReplaceTempView('uber')

In [106]:
sqlDF = Spark.sql('SELECT cid, COUNT(*) AS CID_Cnt FROM uber GROUP BY cid ORDER BY 2 DESC LIMIT 5')

In [110]:
sqlDF = Spark.sql('SELECT hour(uber.Dt) AS HourDT, COUNT(cid) AS Cnt FROM uber GROUP BY hour(uber.Dt) ORDER BY 2 DESC')

In [133]:
sqlDF = Spark.sql('SELECT * FROM uber WHERE CID=17')
sqlDF.show()

+--------------------+-------+--------+------+------------------+---+
|                  Dt|    Lat|     Lon|  Base|          features|cid|
+--------------------+-------+--------+------+------------------+---+
|2014-07-01 14:14:...|40.7186|-74.0342|B02512|[40.7186,-74.0342]| 17|
|2014-07-01 14:14:...|40.7186|-74.0342|B02512|[40.7186,-74.0342]| 17|
|2014-07-01 14:17:...|40.7173|-74.0354|B02512|[40.7173,-74.0354]| 17|
|2014-07-01 16:24:...|40.7191|-74.0353|B02512|[40.7191,-74.0353]| 17|
|2014-07-01 16:56:...|40.7223|-74.0402|B02512|[40.7223,-74.0402]| 17|
|2014-07-01 17:27:...|40.7164|-74.0341|B02512|[40.7164,-74.0341]| 17|
|2014-07-01 17:27:...|40.7164|-74.0341|B02512|[40.7164,-74.0341]| 17|
|2014-07-01 18:15:...|40.7249|-74.0354|B02512|[40.7249,-74.0354]| 17|
|2014-07-01 18:31:...|40.7513|-74.0275|B02512|[40.7513,-74.0275]| 17|
|2014-07-01 18:49:...|40.7554|-74.0315|B02512|[40.7554,-74.0315]| 17|
|2014-07-01 21:23:...|40.7367|-74.0312|B02512|[40.7367,-74.0312]| 17|
|2014-07-01 21:56:..