<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/Clustring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below (reuses an existing
# session if one is already active, otherwise creates one).
spark = SparkSession.builder.getOrCreate()

K-Means

In [3]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [4]:
# Load the K-Means sample shipped with the Spark distribution. The file is in
# libsvm format; the path is relative to the notebook's working directory.
dataset = (
    spark.read
    .format("libsvm")
    .load("spark-3.0.1-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt")
)

In [5]:
# Print the loaded DataFrame as a text table to inspect its columns.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Configure K-Means for two clusters with a fixed seed, then fit it on the
# loaded dataset.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

In [7]:
# Apply the fitted model to the same dataset it was trained on; the result
# gains a `prediction` column (shown in the next cell's output).
predictions = model.transform(dataset)

In [8]:
# Print the rows together with the cluster assigned to each one.
predictions.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         1|
|  1.0|(3,[0,1,2],[0.1,0...|         1|
|  2.0|(3,[0,1,2],[0.2,0...|         1|
|  3.0|(3,[0,1,2],[9.0,9...|         0|
|  4.0|(3,[0,1,2],[9.1,9...|         0|
|  5.0|(3,[0,1,2],[9.2,9...|         0|
+-----+--------------------+----------+



In [9]:
# Evaluator with default settings; it is used below to score the clustering.
evaluator = ClusteringEvaluator()

In [10]:
# Score the cluster assignments and report the result.
# NOTE(review): the variable name is misspelled ("silhoutte"); it is kept
# unchanged here in case other cells read it.
silhoutte = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhoutte}")

Silhouette with squared euclidean distance = 0.9997530305375207


In [11]:
# Retrieve the fitted cluster centers and print one per line.
centers = model.clusterCenters()
print("Cluster Centers : ")
for centroid in centers:
    print(centroid)

Cluster Centers : 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


LDA (Latent Dirichlet Allocation)

In [12]:
from pyspark.ml.clustering import LDA

In [13]:
# Load the LDA sample shipped with the Spark distribution (libsvm format).
# This rebinds `dataset`, replacing the K-Means data loaded earlier.
dataset = (
    spark.read
    .format("libsvm")
    .load("spark-3.0.1-bin-hadoop2.7/data/mllib/sample_lda_libsvm_data.txt")
)

In [14]:
# Print the LDA input DataFrame as a text table to inspect its columns.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(11,[0,1,2,4,5,6,...|
|  1.0|(11,[0,1,3,4,7,10...|
|  2.0|(11,[0,1,2,5,6,8,...|
|  3.0|(11,[0,1,3,6,8,9,...|
|  4.0|(11,[0,1,2,3,4,6,...|
|  5.0|(11,[0,1,3,4,5,6,...|
|  6.0|(11,[0,1,3,6,8,9,...|
|  7.0|(11,[0,1,2,3,4,5,...|
|  8.0|(11,[0,1,3,4,5,6,...|
|  9.0|(11,[0,1,2,4,6,8,...|
| 10.0|(11,[0,1,2,3,5,6,...|
| 11.0|(11,[0,1,4,5,6,7,...|
+-----+--------------------+



In [15]:
# Fit a 10-topic LDA model with at most 10 iterations. LDA training is
# stochastic, so the seed is fixed (as in the K-Means cell) to make the
# metrics and topics below reproducible across runs. Outputs recorded before
# the seed was added will differ from a fresh run.
lda = LDA(k=10, maxIter=10, seed=1)
# This rebinds `model`, replacing the fitted K-Means model from above.
model = lda.fit(dataset)

In [16]:
# Evaluate the fitted LDA model on the same data it was trained on.
# logLikelihood gives a lower bound on the corpus log likelihood (higher is
# better); logPerplexity gives an upper bound on perplexity (lower is better).
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on the log perplexity: " + str(lp))

The lower bound on the log likelihood of the entrie corpus: -802.6921083168511
The lower bound on the log perplexity: 3.0872773396801967


In [17]:
# Summarise each topic by its three highest-weighted terms and print the
# table without truncating the weight arrays.
topics = model.describeTopics(maxTermsPerTopic=3)
print("The topics described by their top-weighted terms : ")
topics.show(truncate=False)

The topics described by their top-weighted terms : 
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[9, 6, 8]  |[0.11938133238688342, 0.09832742663233758, 0.09778055644246263]|
|1    |[9, 10, 8] |[0.10359802577383691, 0.09984505651654535, 0.09721940069830898]|
|2    |[2, 10, 8] |[0.1050083351567891, 0.10332239149433853, 0.09835974378079414] |
|3    |[3, 10, 4] |[0.22913363157196692, 0.17271020815256324, 0.11328092238132463]|
|4    |[6, 1, 9]  |[0.20707120033435528, 0.1716780929931112, 0.1436643060272085]  |
|5    |[8, 1, 7]  |[0.10980866889565394, 0.09600076116358597, 0.09507867366188923]|
|6    |[2, 10, 5] |[0.10322308690068116, 0.1014065807845403, 0.09881338995500483] |
|7    |[7, 8, 0]  |[0.10039110321104068, 0.09750239967756968, 0.09718581186940799]|
|8    |[0, 7, 4]  |[0.10

In [19]:
# Run the fitted model over the dataset; the result carries a
# `topicDistribution` column. Print it in full, without column truncation.
transformed = model.transform(dataset)
transformed.show(truncate=False)

+-----+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                       |topicDistribution                                                                                                                                                                                                    |
+-----+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.0047359403491389245,0.004736037180698078,0.004736098086692605,0.3219

In [20]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()