### DBSCAN Example

**Below is just a small working example to understand how we can use DBSCAN from scikit-learn and use Pandas UDF from Spark to compute this in a distributed manner for each group after a groupby.**

In [53]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType
from sklearn.cluster import DBSCAN
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Some Dummy data
data = [(1, 11.6133, 48.1075),
         (1, 11.6142, 48.1066),
         (1, 11.6108, 48.1061),
         (1, 11.6207, 48.1192),
         (1, 11.6221, 48.1223),
         (1, 11.5969, 48.1276),
         (2, 11.5995, 48.1258),
         (2, 11.6127, 48.1066),
         (2, 11.6430, 48.1275),
         (2, 11.6368, 48.1278),
         (2, 11.5930, 48.1156)]

# Create Spark Dataframe
df = spark.createDataFrame(data, ["id", "X", "Y"])

# Define the output schema for the UDF
output_schema = StructType(
            [
                StructField('id', StringType()),
                StructField('X', DoubleType()),
                StructField('Y', DoubleType()),
                StructField('cluster', IntegerType())
             ]
    )

In [58]:
# Register the python functions as a Pandas UDF
@pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
def dbscan(data):
    data["cluster"] = DBSCAN(eps=5, min_samples=3).fit_predict(data[["X", "Y"]])
    result = pd.DataFrame(data, columns=["id", "X", "Y", "cluster"])
    return result

In [57]:
# Apply Above UDF for each group('id' in this case)
res = df.groupby("id").apply(dbscan)

In [58]:
# Display results
res.show()

+---+-------+-------+-------+
| id|      X|      Y|cluster|
+---+-------+-------+-------+
|  |11.6133|48.1075|      0|
|  |11.6142|48.1066|      0|
|  |11.6108|48.1061|      0|
|  |11.6207|48.1192|      0|
|  |11.6221|48.1223|      0|
|  |11.5969|48.1276|      0|
|  |11.5995|48.1258|      0|
|  |11.6127|48.1066|      0|
|  | 11.643|48.1275|      0|
|  |11.6368|48.1278|      0|
|  | 11.593|48.1156|      0|
+---+-------+-------+-------+

