<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/ML_Fundamental.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the session already attached to this kernel if there is one;
# otherwise start a new one with default settings.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

In [4]:
# Four 4-dimensional feature vectors, each wrapped in a 1-tuple so that it
# becomes a single-column row. Sparse vectors are given as
# (size, indices, values); entries not listed are 0.0.
data = [
    (Vectors.sparse(4, [0, 3], [1.0, -2.0]),),
    (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
    (Vectors.sparse(4, [0, 3], [9.0, 1.0]),),
]

In [5]:
# Echo the raw Python rows to see how the sparse and dense vectors were built.
data

[(SparseVector(4, {0: 1.0, 3: -2.0}),),
 (DenseVector([4.0, 5.0, 0.0, 3.0]),),
 (DenseVector([6.0, 7.0, 0.0, 8.0]),),
 (SparseVector(4, {0: 9.0, 3: 1.0}),)]

In [6]:
# Turn the local rows into a Spark DataFrame with one vector column, "features".
df = spark.createDataFrame(data, schema=["features"])

In [7]:
# Print the DataFrame; sparse rows are rendered as (size,[indices],[values]).
df.show()

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



In [8]:
# Correlation.corr returns a one-row DataFrame whose single cell holds the
# full correlation matrix of the "features" vector column; head() fetches that row.
# Feature index 2 is 0.0 in every row, so its correlation with the other
# features is undefined and shows up as nan.
r1 = Correlation.corr(df, "features", method="pearson").head()
print(f"Pearson correlation matrix:\n{r1[0]}")

Pearson correlation matrix:
DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])


In [9]:
# View the result Row as a dict to reveal the generated column name.
r1.asDict()

{'pearson(features)': DenseMatrix(4, 4, [1.0, 0.0556, nan, 0.4005, 0.0556, 1.0, nan, 0.9136, nan, nan, 1.0, nan, 0.4005, 0.9136, nan, 1.0], False)}

In [10]:
# Raw Row repr: a single field that carries the DenseMatrix.
r1

Row(pearson(features)=DenseMatrix(4, 4, [1.0, 0.0556, nan, 0.4005, 0.0556, 1.0, nan, 0.9136, nan, nan, 1.0, nan, 0.4005, 0.9136, nan, 1.0], False))

In [11]:
# Same computation as the Pearson cell, but with Spearman rank correlation.
# The all-zero feature at index 2 again produces nan entries.
r2 = Correlation.corr(df, "features", method="spearman").head()
print(f"Spearman correlation matrix:\n{r2[0]}")

Spearman correlation matrix:
DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])


In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

In [13]:
# (label, features) rows for the chi-square independence test.
# NOTE: this rebinds `data`, replacing the correlation rows defined earlier.
data = [
    (0.0, Vectors.dense([0.5, 10.0])),
    (0.0, Vectors.dense([1.5, 20.0])),
    (1.0, Vectors.dense([1.5, 30.0])),
    (0.0, Vectors.dense([3.5, 30.0])),
    (0.0, Vectors.dense([3.5, 40.0])),
    (1.0, Vectors.dense([3.5, 40.0])),
]

In [14]:
# Build the labelled DataFrame: a numeric "label" column and a vector "features" column.
df_data = spark.createDataFrame(data, schema=["label", "features"])

In [15]:
# Test every feature against the label; head() pulls the single result row
# back to the driver.
r = ChiSquareTest.test(df_data, featuresCol="features", labelCol="label").head()

In [16]:
# Raw result Row: pValues, degreesOfFreedom and statistics, one entry per feature.
r

Row(pValues=DenseVector([0.6873, 0.6823]), degreesOfFreedom=[2, 3], statistics=DenseVector([0.75, 1.5]))

In [17]:
# Report each field of the chi-square result on its own line, at full precision.
print(f"pValues : {r.pValues}")
print(f"degreesOfFreedom : {r.degreesOfFreedom}")
print(f"statistics : {r.statistics}")

pValues : [0.6872892787909721,0.6822703303362126]
degreesOfFreedom : [2, 3]
statistics : [0.75,1.5]


In [18]:
# Shut the Spark session down; cells above need a new session before they can be re-run.
spark.stop()