<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/PCA_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

PCA

In [3]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

In [4]:
# Three 5-dimensional sample vectors (one sparse, two dense). Each vector is
# wrapped in a 1-tuple so that it becomes a row with a single column.
data = [
    (vector,)
    for vector in (
        Vectors.sparse(5, [1, 3], [1.0, 7.0]),
        Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
        Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),
    )
]

In [6]:
# Build a one-column DataFrame ("features") from the sample vectors.
df = spark.createDataFrame(
    data=data,
    schema=["features"],
)

In [8]:
# Print the DataFrame as a text table to inspect the three feature vectors.
df.show()

+--------------------+
|            features|
+--------------------+
| (5,[1,3],[1.0,7.0])|
|[2.0,0.0,3.0,4.0,...|
|[4.0,0.0,0.0,6.0,...|
+--------------------+



In [9]:
# Configure a PCA estimator that reads the "features" vector column and writes
# a 3-component projection (k=3) to a new "pcaFeatures" column.
pca = PCA(
    k=3,
    inputCol="features",
    outputCol="pcaFeatures",
)

# Fit the estimator on the sample data to obtain the fitted PCA model.
model = pca.fit(df)

In [10]:
# Project the input vectors with the fitted model and keep only the projection.
result = (
    model
    .transform(df)
    .select("pcaFeatures")
)
# truncate=False prints the full vectors instead of shortening long cells.
result.show(truncate=False)

+-----------------------------------------------------------+
|pcaFeatures                                                |
+-----------------------------------------------------------+
|[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
|[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
|[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
+-----------------------------------------------------------+



One Hot Encoding

In [12]:
from pyspark.ml.feature import OneHotEncoder

In [13]:
# Two columns of category indices (values 0.0, 1.0 and 2.0) to be one-hot encoded.
# NOTE: this rebinds `df`, which previously held the PCA example's DataFrame.
df = spark.createDataFrame(
    data=[
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0),
    ],
    schema=["categoryIndex1", "categoryIndex2"],
)

In [14]:
# Print the category-index DataFrame to inspect the rows before encoding.
df.show()

+--------------+--------------+
|categoryIndex1|categoryIndex2|
+--------------+--------------+
|           0.0|           1.0|
|           1.0|           0.0|
|           2.0|           1.0|
|           0.0|           2.0|
|           0.0|           1.0|
|           2.0|           0.0|
+--------------+--------------+



In [15]:
# One-hot encode both index columns in a single estimator; each input column
# is paired positionally with its output vector column.
encoder = OneHotEncoder(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryVec1", "categoryVec2"],
)

In [16]:
# Fit the encoder on the category-index DataFrame to obtain the fitted model.
model_encoder = encoder.fit(dataset=df)

In [17]:
# Apply the fitted encoder; the result carries the original columns plus the
# two encoded vector columns.
encoded = model_encoder.transform(dataset=df)

In [18]:
# Print the index columns next to their encoded vectors. In the output, index
# 2.0 appears as the empty sparse vector (2,[],[]), i.e. an all-zero vector.
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



MinMaxScaler

In [19]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

In [58]:
# Three rows, each with an integer id and a 3-dimensional dense feature vector.
our_dataframe = spark.createDataFrame(
    data=[
        (0, Vectors.dense([2.0, 0.1, 1.0])),
        (1, Vectors.dense([1.0, 1.1, 1.0])),
        (2, Vectors.dense([3.0, 10.1, 3.0])),
    ],
    schema=["id", "features"],
)

In [59]:
# Print the id/features DataFrame to inspect the raw values before scaling.
our_dataframe.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0| [2.0,0.1,1.0]|
|  1| [1.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [60]:
# Configure a min-max scaler that reads the "features" vector column and
# writes the rescaled vectors to a new "scaledFeatures" column.
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [61]:
# Display the estimator's repr (its class name followed by a generated uid).
scaler

MinMaxScaler_3880cecd9474

In [62]:
# Fit the scaler on the sample data to obtain the fitted scaler model.
scaler_model = scaler.fit(dataset=our_dataframe)

In [63]:
# Apply the fitted scaler to the same DataFrame it was fitted on.
scaler_data = scaler_model.transform(dataset=our_dataframe)

In [64]:
# Show each original vector next to its rescaled counterpart.
(
    scaler_data
    .select("features", "scaledFeatures")
    .show()
)

+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
| [2.0,0.1,1.0]| [0.5,0.0,0.0]|
| [1.0,1.1,1.0]| [0.0,0.1,0.0]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+



In [65]:
# Stop the Spark session now that all examples have run. Cells that use
# `spark` cannot be re-run after this until the session is created again.
spark.stop()