<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/Feature_Extractor_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise start a new one with
# default settings. Every later cell goes through this `spark` handle.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Term Frequency-Inverse Document Frequency (TF-IDF)

In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [4]:
# Three-row toy corpus: a float label paired with a raw sentence.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [5]:
# Print the DataFrame as a text table; long strings are truncated with "...".
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [6]:
# Tokenizer reads the "sentence" column and writes a list of lower-cased
# tokens to a new "words" column.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(sentenceData)

In [7]:
# Inspect the tokenized frame, including the added "words" column.
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [9]:
# Hash each token list into a 20-dimensional term-frequency vector.
# numFeatures=20 is deliberately tiny for the demo; with so few buckets,
# distinct words can hash to the same index.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

In [10]:
# Inspect the frame with the added "rawFeatures" sparse-vector column.
featurizedData.show()

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(20,[6,8,13,16],[...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[0,2,7,13,15,...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[3,4,6,11,19]...|
+-----+--------------------+--------------------+--------------------+



In [11]:
# IDF estimator: reads the raw term-frequency vectors and will write the
# rescaled vectors to a "features" column once fitted and applied.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [12]:
# Bare expression: the notebook displays the (unfitted) estimator's repr.
idf

IDF_3d838e0136f3

In [13]:
# Fit the IDF estimator on the hashed term-frequency data to obtain the model
# that is later used to rescale those vectors.
idfModel = idf.fit(featurizedData)

In [14]:
# Bare expression: the notebook displays the fitted model's repr.
idfModel

IDFModel: uid=IDF_3d838e0136f3, numDocs=3, numFeatures=20

In [15]:
# Apply the fitted IDF model; the result carries the original columns plus the
# "features" output column configured on the estimator.
rescaledData = idfModel.transform(featurizedData)

In [16]:
# Bare expression: the notebook displays the DataFrame's repr, which lists the
# column names and types rather than the rows.
rescaledData

DataFrame[label: double, sentence: string, words: array<string>, rawFeatures: vector, features: vector]

In [17]:
# Show only the label and the final TF-IDF vector for each sentence.
(
    rescaledData
    .select("label", "features")
    .show()
)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[6,8,13,16],[...|
|  0.0|(20,[0,2,7,13,15,...|
|  1.0|(20,[3,4,6,11,19]...|
+-----+--------------------+



Word2Vec

In [18]:
from pyspark.ml.feature import Word2Vec

In [19]:
# One-column frame of pre-tokenized documents: each sentence is split on
# single spaces and wrapped in a 1-tuple so it becomes one row of "text".
documentDF = spark.createDataFrame(
    [
        (sentence.split(" "),)
        for sentence in (
            "Hi I heard about Spark",
            "I wish Java could use case classes",
            "Logistic regression models are neat",
        )
    ],
    schema=["text"],
)

In [20]:
# Print the tokenized documents as a text table (long lists are truncated).
documentDF.show()

+--------------------+
|                text|
+--------------------+
|[Hi, I, heard, ab...|
|[I, wish, Java, c...|
|[Logistic, regres...|
+--------------------+



In [22]:
# Word2Vec estimator producing 3-dimensional vectors. minCount=0 keeps every
# token in the vocabulary regardless of how rarely it occurs.
# Training is stochastic, so the seed is pinned to make the vectors repeatable
# under Restart & Run All. Vectors in any previously saved output were
# produced without this seed and will differ after re-running.
word2vec = Word2Vec(
    vectorSize=3,
    minCount=0,
    seed=42,
    inputCol="text",
    outputCol="result",
)

In [23]:
# Bare expression: the notebook displays the (unfitted) estimator's repr.
word2vec

Word2Vec_f1206f221332

In [24]:
# Train the Word2Vec model on the tokenized documents.
model = word2vec.fit(documentDF)

In [27]:
# Apply the trained model; the output keeps "text" and adds the "result"
# vector column configured on the estimator.
result = model.transform(documentDF)

# collect() brings all rows to the driver, which is fine for three documents.
# Each Row has exactly two fields (text, result), so it unpacks directly in
# the loop header.
for text, vector in result.collect():
    print("Text: [%s] => \nVector : %s \n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector : [0.016157862544059754,-0.048465465754270555,-0.06793839037418366] 

Text: [I, wish, Java, could, use, case, classes] => 
Vector : [0.033490611639406,-0.0041030315416199815,-0.046117741215441904] 

Text: [Logistic, regression, models, are, neat] => 
Vector : [-0.05778748095035553,0.025992725044488907,0.011553806625306607] 



CountVectorizer

In [28]:
from pyspark.ml.feature import CountVectorizer

In [29]:
# Two tiny pre-tokenized documents with ids 0 and 1; each string is split on
# single spaces to form the "words" column.
df = spark.createDataFrame(
    [
        (doc_id, tokens.split(" "))
        for doc_id, tokens in enumerate(["a b c", "a b b c a"])
    ],
    schema=["id", "words"],
)

In [30]:
# Print the two-document frame as a text table.
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [31]:
# CountVectorizer estimator: vocabSize caps the vocabulary at 3 terms, and
# minDF=2.0 requires a term to appear in at least 2 documents to be kept.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

In [32]:
# Bare expression: the notebook displays the (unfitted) estimator's repr.
cv

CountVectorizer_37aefd688e7d

In [33]:
# Fit the CountVectorizer on the documents to obtain the fitted model.
model_cv = cv.fit(df)

In [35]:
# Bare expression: the notebook displays the fitted model's repr.
model_cv

CountVectorizerModel: uid=CountVectorizer_37aefd688e7d, vocabularySize=3

In [39]:
# Turn each token list into a sparse count vector in the "features" column.
# NOTE: this rebinds `result`, which an earlier cell used for Word2Vec output.
result = model_cv.transform(df)
# truncate=False prints full cell contents instead of shortening them.
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [40]:
# Shut down the Spark session; cells above need a fresh session to run again.
spark.stop()