# ML: Bioregions

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ML').getOrCreate()

In [2]:
#loading data
df = spark.read.csv('IP_data.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- a.m.t: integer (nullable = true)
 |-- m.d.r: integer (nullable = true)
 |-- iso: integer (nullable = true)
 |-- t.s: integer (nullable = true)
 |-- max.t.wm: integer (nullable = true)
 |-- min.t.cm: integer (nullable = true)
 |-- t.a.r: integer (nullable = true)
 |-- m.t.wetq: integer (nullable = true)
 |-- m.t.dryq: integer (nullable = true)
 |-- m.t.warmq: integer (nullable = true)
 |-- m.t.coldq: integer (nullable = true)
 |-- a.pre: integer (nullable = true)
 |-- pre.wetm: integer (nullable = true)
 |-- pre.drym: integer (nullable = true)
 |-- pre.s: integer (nullable = true)
 |-- pre.wtq: integer (nullable = true)
 |-- pre.dryq: integer (nullable = true)
 |-- pre.warmq: integer (nullable = true)
 |-- pre.coldq: integer (nullable = true)



In [3]:
df2 = df.withColumnRenamed("_c0","ID") \
    .withColumnRenamed("a.m.t","amt") \
    .withColumnRenamed("m.d.r", "mdr")\
    .withColumnRenamed("t.s", "ts")\
    .withColumnRenamed("max.t.wm","maxtwm")\
    .withColumnRenamed("min.t.cm","mintcm")\
    .withColumnRenamed("t.a.r","tar")\
    .withColumnRenamed("m.t.wetq","mtwetq")\
    .withColumnRenamed("m.t.dryq","mtdryq")\
    .withColumnRenamed("m.t.warmq","mtwarmq")\
    .withColumnRenamed("m.t.coldq","mtcoldq")\
    .withColumnRenamed("a.pre", "apre")\
    .withColumnRenamed("pre.wetm", "prewetm")\
    .withColumnRenamed("pre.drym","predrym")\
    .withColumnRenamed("pre.s","pres")\
    .withColumnRenamed("pre.wtq","prewtq")\
    .withColumnRenamed("pre.dryq","predryq")\
    .withColumnRenamed("pre.warmq","prewarmq")\
    .withColumnRenamed("pre.coldq", "precoldq")

df2.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- amt: integer (nullable = true)
 |-- mdr: integer (nullable = true)
 |-- iso: integer (nullable = true)
 |-- ts: integer (nullable = true)
 |-- maxtwm: integer (nullable = true)
 |-- mintcm: integer (nullable = true)
 |-- tar: integer (nullable = true)
 |-- mtwetq: integer (nullable = true)
 |-- mtdryq: integer (nullable = true)
 |-- mtwarmq: integer (nullable = true)
 |-- mtcoldq: integer (nullable = true)
 |-- apre: integer (nullable = true)
 |-- prewetm: integer (nullable = true)
 |-- predrym: integer (nullable = true)
 |-- pres: integer (nullable = true)
 |-- prewtq: integer (nullable = true)
 |-- predryq: integer (nullable = true)
 |-- prewarmq: integer (nullable = true)
 |-- precoldq: integer (nullable = true)



In [4]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

assembler = VectorAssembler(
    inputCols=['amt','mdr','iso','ts','maxtwm','mintcm','tar','mtwetq','mtdryq','mtwarmq','mtcoldq','apre','prewetm','predrym','pres','prewtq','predryq','prewarmq','precoldq'],
    outputCol= 'features')

output = assembler.transform(df2)
final_data = output.select('features','ID')

In [5]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", 
                        outputCol="scaled_features",
                        withStd=True,withMean=True)

scaler_model = scaler.fit(final_data)
scaled_data = scaler_model.transform(final_data)
scaled_data = scaled_data.drop('features')

In [6]:
final_data = scaled_data.withColumnRenamed("scaled_features","features")
final_data.head()

Row(ID=1, features=DenseVector([0.3912, -3.5081, -1.7269, -2.7506, -1.6116, 2.0428, -3.1046, 0.2157, -0.2402, -0.7881, 1.4737, 1.9372, 1.8988, 1.8999, -0.194, 1.8095, 1.7488, 1.4767, 1.4135]))

In [78]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

#HIerarchical top-down
# Trains a bisecting k-means model.
bkm = BisectingKMeans().setK(7).setSeed(1)
model = bkm.fit(final_data)

# Make predictions
predictions = model.transform(final_data)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.29886837867616367
Cluster Centers: 
[-1.80667296 -0.27886478 -0.08171068 -0.10779981 -1.55334462 -1.57566601
 -0.24242502 -0.3936084  -2.304995   -1.6600859  -1.67665317  0.98865645
  0.50633737  2.30023676 -1.39041854  0.48591644  2.28237939  2.59678504
  0.10702377]
[-1.28819956  0.00160766  1.04412151 -0.57830251 -1.23402428 -0.9193425
 -0.45051959 -1.14755729 -0.60530348 -1.39137224 -1.01184086  0.75311103
  0.59294532  0.79044194 -0.42940511  0.60098792  0.7077346   0.56463619
  0.62954066]
[-0.15237273 -1.59468261  0.18130303 -1.72883748 -1.24367147  0.76146779
 -1.76358123 -0.22383422 -0.34153248 -0.88146666  0.52591287  2.06910384
  2.02083645  1.1123319   0.08050409  2.00805771  1.20673485  1.07828019
  1.93464764]
[-0.54397109  0.23216632 -0.07590314  0.32543237 -0.20114094 -0.58688514
  0.26566456  0.0239168  -0.4866997  -0.32577053 -0.6284214  -0.45870444
 -0.57548627  0.13816155 -0.73412395 -0.57799893  0.12296249  0.13298595


In [64]:
predictions.toPandas().to_csv('predictions.csv')

In [8]:
#KNN 
from pyspark.ml.clustering import KMeans

# Trains a k-means model.
kmeans = KMeans().setK(7).setSeed(1)
model = kmeans.fit(final_data)

predictions_knn = model.transform(final_data)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
evaluator = ClusteringEvaluator()

wssse = evaluator.evaluate(predictions_knn)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Within Set Sum of Squared Errors = 0.406918201662101
Cluster Centers: 
[ 0.71225035  0.71536577 -0.20273336  0.78981915  1.0601943   0.19487043
  0.84858657 -0.09471522  0.77093125  0.96717711  0.35963608 -0.41418347
 -0.32148524 -0.69918581  0.58012025 -0.29877136 -0.71762746 -0.69942865
 -0.18744922]
[-0.05102833 -2.60846381 -0.74516779 -2.34770269 -1.61954513  1.28774098
 -2.52632128 -0.08834587 -0.42522838 -1.03721088  0.88428499  2.62066954
  2.70375849  0.82097762  0.41694986  2.71873293  1.02779774  0.88723428
  2.51628426]
[ 1.13806621 -1.22957366 -0.46349243 -0.99067415  0.12293639  1.5496653
 -1.08628144  1.1318026   0.50325422  0.5610566   1.47097633 -0.33993652
  0.01962031 -0.7509349   1.06855581 -0.01903646 -0.69266955 -0.62365247
 -0.07222336]
[-0.41225697  0.11249384  1.0251042  -0.58256745 -0.53068367 -0.21681228
 -0.3322512  -0.75698447 -0.14406255 -0.62100296 -0.19212903  0.99175502
  1.137501   -0.05071291  0.59279818  1.14192667 -0.06833789 -0.12656887
  1.2542836 

In [66]:
predictions_knn.toPandas().to_csv('predictions_knn.csv')

In [9]:
#GMM
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture().setK(7).setSeed(538009335)
model = gmm.fit(final_data)

predictions_GMM = model.transform(final_data)

print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Gaussians shown as a DataFrame: 
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [68]:
predictions_GMM.toPandas().to_csv('predictions_gmm.csv')

## Interpretable ML

In [80]:
#preparing data
df_hie = predictions.withColumnRenamed("prediction","group")
df_hie = df_hie.drop('ID')
df_hie.head()

Row(features=DenseVector([0.3912, -3.5081, -1.7269, -2.7506, -1.6116, 2.0428, -3.1046, 0.2157, -0.2402, -0.7881, 1.4737, 1.9372, 1.8988, 1.8999, -0.194, 1.8095, 1.7488, 1.4767, 1.4135]), group=2)

In [81]:
df_knn = predictions_knn.withColumnRenamed("prediction","group")
df_knn = df_hie.drop('ID')
df_knn.head()

Row(features=DenseVector([0.3912, -3.5081, -1.7269, -2.7506, -1.6116, 2.0428, -3.1046, 0.2157, -0.2402, -0.7881, 1.4737, 1.9372, 1.8988, 1.8999, -0.194, 1.8095, 1.7488, 1.4767, 1.4135]), group=2)

In [82]:
df_gmm = predictions_GMM.withColumnRenamed("prediction","group")
df_gmm = df_hie.drop('ID')
df_gmm.head()

Row(features=DenseVector([0.3912, -3.5081, -1.7269, -2.7506, -1.6116, 2.0428, -3.1046, 0.2157, -0.2402, -0.7881, 1.4737, 1.9372, 1.8988, 1.8999, -0.194, 1.8095, 1.7488, 1.4767, 1.4135]), group=2)

In [83]:
from pyspark.ml.classification import DecisionTreeClassifier
#defining classifier
dtc_hie = DecisionTreeClassifier(labelCol = 'group')
dtc_knn = DecisionTreeClassifier(labelCol = 'group')
dtc_gmm = DecisionTreeClassifier(labelCol = 'group')

#creating the model
dtc_model_hie = dtc_hie.fit(df_hie)
dtc_model_knn = dtc_knn.fit(df_knn)
dtc_model_gmm = dtc_gmm.fit(df_gmm)

In [84]:
print(dtc_model_hie.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a8f26389b162, depth=5, numNodes=53, numClasses=7, numFeatures=19
  If (feature 8 <= 0.2243042446964718)
   If (feature 11 <= 0.0740179603326357)
    If (feature 4 <= -0.8936309038353555)
     If (feature 2 <= 0.4728208964380131)
      If (feature 8 <= -2.96611718498243)
       Predict: 0.0
      Else (feature 8 > -2.96611718498243)
       Predict: 3.0
     Else (feature 2 > 0.4728208964380131)
      Predict: 1.0
    Else (feature 4 > -0.8936309038353555)
     If (feature 6 <= -1.1964819018896904)
      If (feature 13 <= 0.16966431931159073)
       Predict: 5.0
      Else (feature 13 > 0.16966431931159073)
       Predict: 3.0
     Else (feature 6 > -1.1964819018896904)
      If (feature 9 <= 0.8395438878202854)
       Predict: 3.0
      Else (feature 9 > 0.8395438878202854)
       Predict: 4.0
   Else (feature 11 > 0.0740179603326357)
    If (feature 5 <= -0.06763180712437183)
     If (feature 17 <= 1.745228097765416)
      If 

In [85]:
print(dtc_model_knn.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1c87fa5a6022, depth=5, numNodes=53, numClasses=7, numFeatures=19
  If (feature 8 <= 0.2243042446964718)
   If (feature 11 <= 0.0740179603326357)
    If (feature 4 <= -0.8936309038353555)
     If (feature 2 <= 0.4728208964380131)
      If (feature 8 <= -2.96611718498243)
       Predict: 0.0
      Else (feature 8 > -2.96611718498243)
       Predict: 3.0
     Else (feature 2 > 0.4728208964380131)
      Predict: 1.0
    Else (feature 4 > -0.8936309038353555)
     If (feature 6 <= -1.1964819018896904)
      If (feature 13 <= 0.16966431931159073)
       Predict: 5.0
      Else (feature 13 > 0.16966431931159073)
       Predict: 3.0
     Else (feature 6 > -1.1964819018896904)
      If (feature 9 <= 0.8395438878202854)
       Predict: 3.0
      Else (feature 9 > 0.8395438878202854)
       Predict: 4.0
   Else (feature 11 > 0.0740179603326357)
    If (feature 5 <= -0.06763180712437183)
     If (feature 17 <= 1.745228097765416)
      If 

In [86]:
print(dtc_model_gmm.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_612e9f5246f2, depth=5, numNodes=53, numClasses=7, numFeatures=19
  If (feature 8 <= 0.2243042446964718)
   If (feature 11 <= 0.0740179603326357)
    If (feature 4 <= -0.8936309038353555)
     If (feature 2 <= 0.4728208964380131)
      If (feature 8 <= -2.96611718498243)
       Predict: 0.0
      Else (feature 8 > -2.96611718498243)
       Predict: 3.0
     Else (feature 2 > 0.4728208964380131)
      Predict: 1.0
    Else (feature 4 > -0.8936309038353555)
     If (feature 6 <= -1.1964819018896904)
      If (feature 13 <= 0.16966431931159073)
       Predict: 5.0
      Else (feature 13 > 0.16966431931159073)
       Predict: 3.0
     Else (feature 6 > -1.1964819018896904)
      If (feature 9 <= 0.8395438878202854)
       Predict: 3.0
      Else (feature 9 > 0.8395438878202854)
       Predict: 4.0
   Else (feature 11 > 0.0740179603326357)
    If (feature 5 <= -0.06763180712437183)
     If (feature 17 <= 1.745228097765416)
      If 