In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .getOrCreate()

sc = spark.sparkContext

In [3]:
data = sc.textFile("./dataset/Qualitative_Bankruptcy.txt")
data.count()

250

### Prepare data for the logistic regression algorithm

In [4]:
def getDoubleValue(input):
    result = 0.0
    if (input == 'P'):
        result = 3.0
    if (input == 'A'): 
        result = 2.0
    if (input == 'N'): 
        result = 1.0
    if (input == 'NB'): 
        result = 1.0        
    if (input == 'B'): 
        result = 0.0      
    return result

In [5]:
dataTuple = data.map(lambda o: (getDoubleValue(o.split(",")[0]),getDoubleValue(o.split(",")[1]),getDoubleValue(o.split(",")[2]),getDoubleValue(o.split(",")[3]),getDoubleValue(o.split(",")[4]),getDoubleValue(o.split(",")[5]),getDoubleValue(o.split(",")[6])))

In [6]:
df = dataTuple.toDF(['Industrial Risk','Management Risk','Financial Flexibility','Credibility','Competitiveness','Operating Risk','Class'])

In [7]:
df.show(5)

+---------------+---------------+---------------------+-----------+---------------+--------------+-----+
|Industrial Risk|Management Risk|Financial Flexibility|Credibility|Competitiveness|Operating Risk|Class|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+
|            3.0|            3.0|                  2.0|        2.0|            2.0|           3.0|  1.0|
|            1.0|            1.0|                  2.0|        2.0|            2.0|           1.0|  1.0|
|            2.0|            2.0|                  2.0|        2.0|            2.0|           2.0|  1.0|
|            3.0|            3.0|                  3.0|        3.0|            3.0|           3.0|  1.0|
|            1.0|            1.0|                  3.0|        3.0|            3.0|           1.0|  1.0|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+
only showing top 5 rows



In [8]:
df.printSchema()

root
 |-- Industrial Risk: double (nullable = true)
 |-- Management Risk: double (nullable = true)
 |-- Financial Flexibility: double (nullable = true)
 |-- Credibility: double (nullable = true)
 |-- Competitiveness: double (nullable = true)
 |-- Operating Risk: double (nullable = true)
 |-- Class: double (nullable = true)



In [9]:
df.groupby('Class').count().show()

+-----+-----+
|Class|count|
+-----+-----+
|  0.0|  107|
|  1.0|  143|
+-----+-----+



In [10]:
ignore = ['Class']
assembler = VectorAssembler(
    inputCols=[x for x in df.columns if x not in ignore],
    outputCol='features')

assembler_df = assembler.transform(df)
assembler_df.select("features").show(truncate=False)

+-------------------------+
|features                 |
+-------------------------+
|[3.0,3.0,2.0,2.0,2.0,3.0]|
|[1.0,1.0,2.0,2.0,2.0,1.0]|
|[2.0,2.0,2.0,2.0,2.0,2.0]|
|[3.0,3.0,3.0,3.0,3.0,3.0]|
|[1.0,1.0,3.0,3.0,3.0,1.0]|
|[2.0,2.0,3.0,3.0,3.0,2.0]|
|[3.0,3.0,2.0,3.0,3.0,3.0]|
|[3.0,3.0,3.0,2.0,2.0,3.0]|
|[3.0,3.0,2.0,3.0,2.0,3.0]|
|[3.0,3.0,2.0,2.0,3.0,3.0]|
|[3.0,3.0,3.0,3.0,2.0,3.0]|
|[3.0,3.0,3.0,2.0,3.0,3.0]|
|[1.0,1.0,2.0,3.0,3.0,1.0]|
|[1.0,1.0,3.0,2.0,2.0,1.0]|
|[1.0,1.0,2.0,3.0,2.0,1.0]|
|[1.0,1.0,2.0,3.0,2.0,1.0]|
|[1.0,1.0,2.0,2.0,3.0,1.0]|
|[1.0,1.0,3.0,3.0,2.0,1.0]|
|[1.0,1.0,3.0,2.0,3.0,1.0]|
|[2.0,2.0,2.0,3.0,3.0,2.0]|
+-------------------------+
only showing top 20 rows



### Standard scaler

In [11]:
scaler = StandardScaler().setInputCol('features').setOutputCol('scaled_features')

scaler_model = scaler.fit(assembler_df)
scaler_df = scaler_model.transform(assembler_df)

In [12]:
scaler_df.show(5)

+---------------+---------------+---------------------+-----------+---------------+--------------+-----+--------------------+--------------------+
|Industrial Risk|Management Risk|Financial Flexibility|Credibility|Competitiveness|Operating Risk|Class|            features|     scaled_features|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+--------------------+--------------------+
|            3.0|            3.0|                  2.0|        2.0|            2.0|           3.0|  1.0|[3.0,3.0,2.0,2.0,...|[3.64497271330941...|
|            1.0|            1.0|                  2.0|        2.0|            2.0|           1.0|  1.0|[1.0,1.0,2.0,2.0,...|[1.21499090443647...|
|            2.0|            2.0|                  2.0|        2.0|            2.0|           2.0|  1.0|[2.0,2.0,2.0,2.0,...|[2.42998180887294...|
|            3.0|            3.0|                  3.0|        3.0|            3.0|           3.0|  1.0|[3.0,3.0,3.0,3

### Split train & test

In [13]:
train_data, test_data = scaler_df.randomSplit([0.6, 0.4], seed=11)

print("Training Dataset Count:" + str(train_data.count()))
print("Test Dataset Count:" + str(test_data.count()))

Training Dataset Count:152
Test Dataset Count:98


### KMeans model

In [14]:
kmeans = KMeans(featuresCol='scaled_features', k=4)
model = kmeans.fit(train_data)
output = model.transform(test_data)

In [15]:
output.show(5)

+---------------+---------------+---------------------+-----------+---------------+--------------+-----+--------------------+--------------------+----------+
|Industrial Risk|Management Risk|Financial Flexibility|Credibility|Competitiveness|Operating Risk|Class|            features|     scaled_features|prediction|
+---------------+---------------+---------------------+-----------+---------------+--------------+-----+--------------------+--------------------+----------+
|            1.0|            1.0|                  2.0|        3.0|            3.0|           1.0|  1.0|[1.0,1.0,2.0,3.0,...|[1.21499090443647...|         2|
|            1.0|            1.0|                  2.0|        3.0|            3.0|           1.0|  1.0|[1.0,1.0,2.0,3.0,...|[1.21499090443647...|         2|
|            1.0|            1.0|                  3.0|        2.0|            2.0|           1.0|  1.0|[1.0,1.0,3.0,2.0,...|[1.21499090443647...|         2|
|            1.0|            1.0|                  3

In [16]:
centroids = model.clusterCenters()
print("Cluster centroids: ")
for centroid in centroids:
    print(centroid)

Cluster centroids: 
[2.84763493 2.96745736 2.84032043 2.96952066 2.96060856 2.64266819]
[1.2149909  1.53046885 1.31621894 1.64961437 1.13460448 2.07098895]
[1.59867224 1.28149306 2.81779445 3.16537845 2.92608523 1.63499127]
[3.00174223 1.75451476 1.31831149 1.30897709 1.36819952 1.65814474]


In [17]:
silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='scaled_features', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')

In [18]:
score=evaluator.evaluate(output)    
silhouette_score.append(score)
print("Silhouette Score:",score)

Silhouette Score: 0.37475611001942133


__Silhouette score of 0.5 is not a very good score, meaning the data was not clustered completely.__