In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.mllib.regression import LabeledPoint

In [2]:
# Create (or reuse) the Spark session for this notebook; the app name is
# what identifies the job in the Spark UI.
spark = (
    SparkSession.builder
    .appName("KMeans")
    .getOrCreate()
)

# Low-level SparkContext, used below for RDD-based file loading.
sc = spark.sparkContext

In [3]:
# Load the raw dataset as an RDD of text lines (one comma-separated record
# per line) and display the number of records.
DATASET_PATH = "./dataset/Qualitative_Bankruptcy.txt"
data = sc.textFile(DATASET_PATH)
data.count()

250

### Prepare data for the KMeans clustering algorithm

In [4]:
def getDoubleValue(input):
    """Encode a categorical code from the dataset as a float.

    Mapping: 'P' -> 3.0, 'A' -> 2.0, 'N' -> 1.0, 'NB' -> 1.0, 'B' -> 0.0.
    Any other value (including codes with surrounding whitespace) also
    yields 0.0, i.e. it is indistinguishable from 'B'.

    Args:
        input: the raw field value taken from one record.

    Returns:
        The float code for the field.
    """
    # 'B' needs no entry: it shares the 0.0 fallback used for unknown codes.
    code_table = (
        ('P', 3.0),
        ('A', 2.0),
        ('N', 1.0),
        ('NB', 1.0),
    )
    for code, value in code_table:
        if input == code:
            return value
    return 0.0

In [5]:
# Encode each record as a 7-tuple of floats (six features followed by the
# class). The line is split once instead of once per field; indexing the
# first seven fields explicitly still raises IndexError on a short record.
dataTuple = data.map(lambda o: o.split(",")).map(
    lambda fields: tuple(getDoubleValue(fields[i]) for i in range(7))
)

In [6]:
# Column names in the same order as the fields of each encoded tuple;
# the last column holds the class code.
column_names = [
    'Industrial Risk',
    'Management Risk',
    'Financial Flexibility',
    'Credibility',
    'Competitiveness',
    'Operating Risk',
    'Class',
]
df = dataTuple.toDF(column_names)

In [7]:
# Preview the first five encoded rows to sanity-check the conversion.
df.show(5)

+---------------+---------------+---------------------+-----------+---------------+--------------+-----+
|Industrial Risk|Management Risk|Financial Flexibility|Credibility|Competitiveness|Operating Risk|Class|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+
|            3.0|            3.0|                  2.0|        2.0|            2.0|           3.0|  1.0|
|            1.0|            1.0|                  2.0|        2.0|            2.0|           1.0|  1.0|
|            2.0|            2.0|                  2.0|        2.0|            2.0|           2.0|  1.0|
|            3.0|            3.0|                  3.0|        3.0|            3.0|           3.0|  1.0|
|            1.0|            1.0|                  3.0|        3.0|            3.0|           1.0|  1.0|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+
only showing top 5 rows



In [8]:
# Show the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- Industrial Risk: double (nullable = true)
 |-- Management Risk: double (nullable = true)
 |-- Financial Flexibility: double (nullable = true)
 |-- Credibility: double (nullable = true)
 |-- Competitiveness: double (nullable = true)
 |-- Operating Risk: double (nullable = true)
 |-- Class: double (nullable = true)



In [9]:
# Count the records per class code to see how balanced the two classes are.
df.groupby('Class').count().show()

+-----+-----+
|Class|count|
+-----+-----+
|  0.0|  107|
|  1.0|  143|
+-----+-----+



In [20]:
# Every column except the class goes into the feature vector, so the
# clustering never sees the known class.
ignore = ['Class']
feature_columns = [column for column in df.columns if column not in ignore]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Adds a 'features' vector column alongside the original columns.
assembler_df = assembler.transform(df)
assembler_df.select("features").show(truncate=False)

+-------------------------+
|features                 |
+-------------------------+
|[3.0,3.0,2.0,2.0,2.0,3.0]|
|[1.0,1.0,2.0,2.0,2.0,1.0]|
|[2.0,2.0,2.0,2.0,2.0,2.0]|
|[3.0,3.0,3.0,3.0,3.0,3.0]|
|[1.0,1.0,3.0,3.0,3.0,1.0]|
|[2.0,2.0,3.0,3.0,3.0,2.0]|
|[3.0,3.0,2.0,3.0,3.0,3.0]|
|[3.0,3.0,3.0,2.0,2.0,3.0]|
|[3.0,3.0,2.0,3.0,2.0,3.0]|
|[3.0,3.0,2.0,2.0,3.0,3.0]|
|[3.0,3.0,3.0,3.0,2.0,3.0]|
|[3.0,3.0,3.0,2.0,3.0,3.0]|
|[1.0,1.0,2.0,3.0,3.0,1.0]|
|[1.0,1.0,3.0,2.0,2.0,1.0]|
|[1.0,1.0,2.0,3.0,2.0,1.0]|
|[1.0,1.0,2.0,3.0,2.0,1.0]|
|[1.0,1.0,2.0,2.0,3.0,1.0]|
|[1.0,1.0,3.0,3.0,2.0,1.0]|
|[1.0,1.0,3.0,2.0,3.0,1.0]|
|[2.0,2.0,2.0,3.0,3.0,2.0]|
+-------------------------+
only showing top 20 rows



In [26]:
# label_df = assembler_df.withColumn("label", assembler_df["Class"])
# label_df.show()

### Standard scaler

In [45]:
# Scale the assembled feature vectors into 'scaled_features'. The scaler is
# left on its default settings; the preview in the next cell shows values
# that are rescaled but not centred around zero.
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')

# Fit the scaling statistics on the full assembled dataset, then apply them.
scaler_model = scaler.fit(assembler_df)
scaler_df = scaler_model.transform(assembler_df)

In [46]:
# Preview the first five rows with the raw and scaled feature vectors side by side.
scaler_df.show(5)

+---------------+---------------+---------------------+-----------+---------------+--------------+-----+--------------------+--------------------+
|Industrial Risk|Management Risk|Financial Flexibility|Credibility|Competitiveness|Operating Risk|Class|            features|     scaled_features|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+--------------------+--------------------+
|            3.0|            3.0|                  2.0|        2.0|            2.0|           3.0|  1.0|[3.0,3.0,2.0,2.0,...|[3.64497271330941...|
|            1.0|            1.0|                  2.0|        2.0|            2.0|           1.0|  1.0|[1.0,1.0,2.0,2.0,...|[1.21499090443647...|
|            2.0|            2.0|                  2.0|        2.0|            2.0|           2.0|  1.0|[2.0,2.0,2.0,2.0,...|[2.42998180887294...|
|            3.0|            3.0|                  3.0|        3.0|            3.0|           3.0|  1.0|[3.0,3.0,3.0,3

In [28]:
# parsed_data = scaler_df.rdd.map(lambda x: LabeledPoint(x[6], x[:6]))
# parsed_data

In [29]:
# parsed_data.collect()

In [30]:
# type(parsed_data)

### Split train & test

In [54]:
# 60/40 random split; the fixed seed keeps the partition repeatable.
split_weights = [0.6, 0.4]
train_data, test_data = scaler_df.randomSplit(split_weights, seed=11)

# Report how many rows landed in each partition.
print("Training Dataset Count:", train_data.count(), sep="")
print("Test Dataset Count:", test_data.count(), sep="")

Training Dataset Count:152
Test Dataset Count:98


### KMeans model

In [55]:
# Cluster the scaled features into two groups. The seed is fixed so that
# centroid initialisation -- and therefore the centroids and silhouette
# score reported below -- is reproducible across runs.
kmeans = KMeans(featuresCol='scaled_features', k=2, seed=11)
model = kmeans.fit(train_data)

# NOTE(review): predictions are produced for the full dataset (scaler_df),
# which includes the training rows; test_data is not used here. Confirm
# this is intended before interpreting the evaluation as held-out.
output = model.transform(scaler_df)

In [59]:
# One centre per cluster, expressed in the scaled feature space the model
# was fitted on. Print the header and then each centre on its own line.
centroids = model.clusterCenters()
print("Cluster centroids: ", *centroids, sep="\n")

Cluster centroids: 
[2.04812752 1.634819   1.33400568 1.49496302 1.26427356 1.85731548]
[2.60778536 2.59814903 2.83937144 3.02177836 2.96104096 2.42737864]


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 44574)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.8/socketserver.py", line 720, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pysp

In [56]:
# Collects the silhouette scores computed in the following cell.
silhouette_score = []

# Silhouette evaluator over the scaled features and the model's
# 'prediction' column, using squared Euclidean distance.
evaluator = ClusteringEvaluator(
    predictionCol='prediction',
    featuresCol='scaled_features',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

In [57]:
# Evaluate the clustering and record the result.
# Note: re-running this cell appends another entry to silhouette_score.
score = evaluator.evaluate(output)
silhouette_score.append(score)
print("Silhouette Score:", score)

Silhouette Score: 0.5181090713027618


__A silhouette score of about 0.52 (on a scale from -1 to 1) indicates only moderate separation: the two clusters overlap and the data is not cleanly partitioned.__