<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/ChiSquard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: reuse an existing one or create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Vector Slicer

In [3]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

In [4]:
# Two rows carrying the same 3-element vector, once in sparse and once in
# dense form, to show that VectorSlicer handles both representations.
df = spark.createDataFrame(
    [
        Row(userFeatures=vector)
        for vector in (
            Vectors.sparse(3, {0: -2.0, 1: 2.3}),
            Vectors.dense([-2.0, 2.3, 0.0]),
        )
    ]
)

In [5]:
# Print the input DataFrame (sparse and dense `userFeatures` rows) as a text table.
df.show()

+--------------------+
|        userFeatures|
+--------------------+
|(3,[0,1],[-2.0,2.3])|
|      [-2.0,2.3,0.0]|
+--------------------+



In [6]:
# Configure a slicer that keeps only the element at index 1 of each
# "userFeatures" vector and writes the result to a new "features" column.
slicer = VectorSlicer(
    inputCol="userFeatures",
    outputCol="features",
    indices=[1],
)

In [7]:
# Apply the slicer to `df`; `output` is the transformed DataFrame that the
# next cell reads the "userFeatures" and "features" columns from.
output = slicer.transform(df)

In [8]:
# Show the original vectors next to their sliced counterparts.
output.select("userFeatures", "features").show()

+--------------------+-------------+
|        userFeatures|     features|
+--------------------+-------------+
|(3,[0,1],[-2.0,2.3])|(1,[0],[2.3])|
|      [-2.0,2.3,0.0]|        [2.3]|
+--------------------+-------------+



Chi-Squared Selector

In [9]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

In [10]:
# Toy dataset for feature selection: an id, a 4-element dense feature vector
# and a 0.0/1.0 "clicked" label per row.
# NOTE: this rebinds `df`, replacing the DataFrame from the VectorSlicer example.
df = spark.createDataFrame(
    [
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
    ],
    schema=["id", "features", "clicked"],
)

In [11]:
# Print the labelled DataFrame (id, features, clicked) as a text table.
df.show()

+---+------------------+-------+
| id|          features|clicked|
+---+------------------+-------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|
|  8|[0.0,1.0,12.0,0.0]|    0.0|
|  9|[1.0,0.0,15.0,0.1]|    0.0|
+---+------------------+-------+



In [12]:
# Chi-squared feature selector: keep the single top-ranked feature of
# "features" with respect to the "clicked" label and emit it as
# "selectedFeatures".
selector = ChiSqSelector(
    numTopFeatures=1,
    featuresCol="features",
    outputCol="selectedFeatures",
    labelCol="clicked",
)

In [14]:
# Fit the selector on `df`, then apply the fitted model to the same data to
# append the "selectedFeatures" column.
result = (
    selector
    .fit(df)
    .transform(df)
)

In [15]:
# Print the input columns together with the new "selectedFeatures" column.
result.show()

+---+------------------+-------+----------------+
| id|          features|clicked|selectedFeatures|
+---+------------------+-------+----------------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|          [18.0]|
|  8|[0.0,1.0,12.0,0.0]|    0.0|          [12.0]|
|  9|[1.0,0.0,15.0,0.1]|    0.0|          [15.0]|
+---+------------------+-------+----------------+



In [16]:
# Shut down the Spark session now that both examples have run.
spark.stop()