In [1]:

# Spark Hands On Training
# Databricks CE Cloud Practice
# Raul Arrabales / Conscious-Robots.com 

# See files in DBFS
# A notebook cell only renders the value of its last expression, so each
# listing is passed to display() explicitly; as bare expressions the first two
# listings would be computed and then silently discarded.
for dbfs_path in ('/', '/FileStore/tables', '/FileStore/tables/1x1xr57q1502297004187/'):
    display(dbutils.fs.ls(dbfs_path))

In [2]:
%sql
-- Preview every row of kmeans_table.
SELECT * FROM kmeans_table

C0,C1,C2
0.0,0.0,0.0
0.1,0.1,0.1
0.2,0.2,0.2
9.0,9.0,9.0
9.1,9.1,9.1
9.2,9.2,9.2


In [3]:
# Applying KMeans with the DataFrame-based pyspark.ml API (not the older RDD-based pyspark.mllib)
from pyspark.ml.clustering import KMeans

In [4]:
# Loading data:
# The older approach reads the raw text file into an RDD instead:
# dataset = sc.textFile('/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt')

# Read the header-less, space-delimited file into a DataFrame.
# No schema inference is requested here, so this first load is only a raw look
# at the file contents.
kmeans_df = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "false")
    .option("delimiter", " ")
    .load("/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt")
)


In [5]:

# Check schema: print the column names and types of the loaded DataFrame.
# NOTE(review): this load did not request schema inference, so the columns are
# presumably typed as strings here (the later re-load with "inferschema" says
# the data should be doubles) - confirm against the printed schema.
kmeans_df.printSchema()

In [6]:

# Check data: render the DataFrame as a table.
# The file was read without a header row, so the columns appear under the
# generated names _c0, _c1 and _c2.
display(kmeans_df) 

_c0,_c1,_c2
0.0,0.0,0.0
0.1,0.1,0.1
0.2,0.2,0.2
9.0,9.0,9.0
9.1,9.1,9.1
9.2,9.2,9.2


In [7]:
# Need to infer correctly the schema. Data are doubles, not string
# Re-read the same file with schema inference switched on and rebind kmeans_df
# to the typed result. The read goes through the `spark` session, the same
# entry point used for the first load, rather than the legacy `sqlContext`
# handle, so the notebook relies on a single entry point.
kmeans_df = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "false")
    .option("delimiter", " ")
    .option("inferschema", "true")
    .load("/FileStore/tables/1x1xr57q1502297004187/kmeans_data.txt")
)

In [8]:
# Prepare data for training (ML Pipelines are explained later)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [9]:
# Combine the three numeric columns into the single vector column "features",
# which is the column the KMeans estimator is configured to read.
assembler = VectorAssembler(inputCols=["_c0", "_c1", "_c2"], outputCol="features")

# Preview of the assembled vectors only: the result is deliberately not stored,
# because the Pipeline applies the assembler itself during fit/transform.
# Wrapped in display() like the other previews in this notebook, so the rows
# are rendered instead of just the DataFrame's one-line repr.
display(assembler.transform(kmeans_df))

In [10]:
# Create the KMeans estimator (it only becomes a fitted model after fit()).
# It reads the vectors from "features" and writes the cluster id to "prediction".
# k is not set here, so the library default number of clusters applies.
kmeans_estimator = KMeans(featuresCol="features", predictionCol="prediction")

In [11]:
# Pipeline stages definition: assemble the feature vector first, then cluster it.
pipeline = Pipeline().setStages([assembler, kmeans_estimator])

In [12]:
# Pipeline training: fit the pipeline (assembler, then KMeans) on kmeans_df
# and keep the fitted result as `model`.
model = pipeline.fit(kmeans_df)

In [13]:
# Get the results: apply the fitted pipeline to the same rows it was trained
# on; the output gains the "features" and "prediction" columns.
results = model.transform(kmeans_df)

In [14]:
# Check results: render each input row with its feature vector and the cluster
# id assigned in "prediction". In the recorded output the points near 0 fall in
# cluster 0 and the points near 9 in cluster 1.
display(results) 

_c0,_c1,_c2,features,prediction
0.0,0.0,0.0,"List(0, 3, List(), List())",0
0.1,0.1,0.1,"List(1, 3, List(), List(0.1, 0.1, 0.1))",0
0.2,0.2,0.2,"List(1, 3, List(), List(0.2, 0.2, 0.2))",0
9.0,9.0,9.0,"List(1, 3, List(), List(9.0, 9.0, 9.0))",1
9.1,9.1,9.1,"List(1, 3, List(), List(9.1, 9.1, 9.1))",1
9.2,9.2,9.2,"List(1, 3, List(), List(9.2, 9.2, 9.2))",1
