# Cluster Analysis - PySpark Example

## Set up environment

In [None]:
# Import modules

import pyspark
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
import utils
%matplotlib inline

print (spark.version)
print (pyspark.version)

In [None]:
# Start Spark session

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

## Read in data

In [None]:
# Load minute weather data

inputfile = <FILL-IN>>
df = spark.read.load (inputfile, format="csv", inferSchema="true", header="true")

## Explore data

In [None]:
# Examine schema



In [None]:
# Count rows



In [None]:
# Filter rows 



In [None]:
# Show summary statistics



## Prepare data

In [None]:
# Drop null data



In [None]:
# Create feature vector

# Columns combined into the single vector column used for clustering.
featuresUsed = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

# Pack the selected columns into one "features_unscaled" vector column.
assembler = VectorAssembler(
    inputCols=featuresUsed,
    outputCol="features_unscaled",
)

# NOTE(review): `workingDF` is not defined in any cell shown here; presumably
# the "Drop null data" exercise cell is expected to create it -- confirm.
assembled = assembler.transform(workingDF)

In [None]:
# Scale data

# Standardize each feature: centre on the mean and divide by the standard
# deviation, writing the result to the "features" column.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withStd=True,
    withMean=True,
)

# Fit the scaling statistics on the assembled data, then apply them to it.
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

## Perform cluster analysis

In [None]:
# Use one-third of the data for the elbow plot

# Keep only the scaled feature vector and the row identifier.
scaledData = scaledData.select("features", "rowID")

# Take every row whose rowID is divisible by 3 and keep just its features.
elbowset = (
    scaledData
    .where((scaledData["rowID"] % 3) == 0)
    .select("features")
)

# Mark the subset for caching, then show how many rows it contains.
elbowset.persist()
elbowset.count()

In [None]:
# Generate clusters for elbow plot



In [None]:
# Show elbow plot



In [None]:
# Run KMeans for k = 12

# Cluster on the scaled feature vectors only; mark them for caching.
scaledDataFeat = scaledData.select("features")
scaledDataFeat.persist()

# Twelve clusters, with a fixed seed so the run is repeatable.
kmeans = KMeans().setK(12).setSeed(1)

# Fit the model, then run the same rows back through it.
model = kmeans.fit(scaledDataFeat)
transformed = model.transform(scaledDataFeat)

In [None]:
# Compute cluster centers

# Centers of the fitted model; the model was fit on the scaled "features"
# column, so the centers are in that scaled space.
centers = model.clusterCenters()
# Build P from the feature names and centers with the course helper.
# NOTE(review): P is later indexed by feature name (P['relative_humidity']),
# so it is presumably a pandas DataFrame with one column per feature -- confirm in utils.
P = utils.pd_centers(featuresUsed, centers)
# Bare expression so the notebook displays the raw centers.
centers

## Generate plots

### Dry Days

In [None]:
# Select the rows of P whose relative_humidity value is below -0.5 and pass
# them, together with the full table P, to the course plotting helper.
# NOTE(review): the threshold is presumably in standardized units, since the
# features were scaled with withMean/withStd -- confirm against utils.pd_centers.
utils.parallel_plot(P[P['relative_humidity'] < -0.5], P)

### Humid Days

### Hot Days

### Cool Days

## Stop Spark session

In [None]:
# Exercise: replace the placeholder with the SparkSession call that shuts the session down.
spark.<<FILL-IN>>