<a href="https://colab.research.google.com/github/princessafusanto/Cloudera/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import when, col
import matplotlib.pyplot as plt

In [None]:
# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Read the CSV (first row is the header) and let Spark infer column types.
data = (
    spark.read
    .option("inferSchema", "true")
    .csv("./Clustering.csv", header=True)
)
data.show()

# Keep only the three columns that serve as clustering features.
data = data.select(
    'Oil Concentration',
    'Trash Pollution',
    'Bacteria Level',
)
data.show()

In [None]:
# Preprocessing: discard every row that has a null in any column.
data = data.dropna()

In [None]:
# transform
def transformRep(df):
    """Replace the textual 'Trash Pollution' levels with numeric codes.

    "Low" maps to 0.0, "Medium" to 1.0 and "High" to 2.0. The chain has no
    ``otherwise`` branch, so any other value becomes null.

    Returns a new DataFrame with the 'Trash Pollution' column replaced.
    """
    level = df["Trash Pollution"]
    encoded = (
        when(level == "Low", 0.0)
        .when(level == "Medium", 1.0)
        .when(level == "High", 2.0)
    )
    return df.withColumn("Trash Pollution", encoded)

def transformCast(df):
    """Cast the three feature columns to double.

    The columns are replaced one at a time, in the order 'Trash Pollution',
    'Oil Concentration', 'Bacteria Level'.

    Returns a new DataFrame; other columns are left as they are.
    """
    for name in ('Trash Pollution', 'Oil Concentration', 'Bacteria Level'):
        df = df.withColumn(name, df[name].cast('double'))
    return df

# Encode the categorical pollution level, then force every feature to double.
data = transformRep(data)
data = transformCast(data)

# transformRep has no fallback branch, so labels other than Low/Medium/High
# become null, and a value that cannot be cast to double is expected to become
# null as well. VectorAssembler raises on null inputs by default, so drop those
# rows here, after the transformations that can introduce them.
data = data.na.drop()
data.show()

In [None]:
# Normalization: pack all columns of `data` into one vector column, then fit a
# StandardScaler on it and write the scaled vectors to 'features'.
cols = data.columns
vectorAssembler = (
    VectorAssembler()
    .setInputCols(cols)
    .setOutputCol('AssembledFeatures')
)
vector = vectorAssembler.transform(data)

standardScaler = (
    StandardScaler()
    .setInputCol('AssembledFeatures')
    .setOutputCol('features')
    .fit(vector)
)
data = standardScaler.transform(vector)

In [None]:
# Fit k-means with two clusters; the fixed seed keeps the run reproducible.
model = KMeans(k=2, seed=123).fit(data)

# Assign every row to a cluster (adds the model's prediction column).
prediction = model.transform(data)

In [None]:
# Plot the clustered rows in 3-D, one axis per feature, coloured by cluster.
predictions = prediction.toPandas()

# A 2-D plt.scatter(x, y, s, ...) would treat the third positional argument as
# the marker size, so 'Trash Pollution' would set point area instead of being
# drawn as a coordinate. Use a 3-D axes so all three features are plotted.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(predictions['Oil Concentration'],
           predictions['Bacteria Level'],
           predictions['Trash Pollution'],
           c=predictions['prediction'])
ax.set_xlabel('Oil Concentration')
ax.set_ylabel('Bacteria Level')
ax.set_zlabel('Trash Pollution')

# plt.plot() with no arguments draws nothing; show() renders the figure.
plt.show()