<a href="https://colab.research.google.com/github/saurater/ciencia_de_dados_pyspark/blob/main/Spark_NLP_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [4]:
# Obtain the Spark session used by every cell below (application name "NLP").
# May take a little while on a local computer.
spark = (
    SparkSession.builder
    .appName("NLP")
    .getOrCreate()
)

In [5]:
# Toy corpus: three sentences, each paired with a numeric label
# (every label is 0.0 in this example).
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi, I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (0.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [6]:
# Preview the raw input rows (long strings are truncated in the printed table).
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi, I heard about...|
|  0.0|I wish Java could...|
|  0.0|Logistic regressi...|
+-----+--------------------+



In [7]:
# Tokenizer reads the `sentence` column and writes its tokens to `words`.
# NOTE(review): the variable name is misspelled ("tokenziner"); it is kept
# unchanged here because the next cell calls `tokenziner.transform(...)`.
tokenziner = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [8]:
# Apply the tokenizer: returns a new DataFrame with the extra `words` column.
wordsData = tokenziner.transform(sentenceData)

In [9]:
# Inspect the tokens. Note in the output that words are lowercased and that
# punctuation stays attached to its word (e.g. "hi,").
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi, I heard about...|[hi,, i, heard, a...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  0.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [10]:
# Term-frequency stage: hashes the tokens in `words` into a `rawFeatures`
# vector. numFeatures=20 fixes the vector length at 20 (see the "(20,[...])"
# vectors printed at the end of the notebook).
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)

In [11]:
# Apply the hashing stage: adds the `rawFeatures` column to the tokenized data.
featurizedData = hashingTF.transform(wordsData)

In [12]:
# Indexing a DataFrame by column name yields a Column reference, not the data
# itself, so this cell only displays `Column<'rawFeatures'>`.
# NOTE(review): to view the actual vectors, a `.select("rawFeatures").show()`
# call would presumably be needed instead.
featurizedData["rawFeatures"]

Column<'rawFeatures'>

In [13]:
# IDF stage: reads the `rawFeatures` column and will write the rescaled
# vectors to a new `features` column.
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [14]:
# Fit the IDF stage on the term-frequency data; the result is a fitted model
# that is applied in the next cell.
idfModel = idf.fit(featurizedData)

In [15]:
# Apply the fitted IDF model: adds the `features` column to the data.
rescaledData = idfModel.transform(featurizedData)

In [16]:
# Show each label next to its final `features` vector.
rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[6,11,13,16],...|
|  0.0|(20,[0,2,7,13,15,...|
|  0.0|(20,[3,4,6,11,19]...|
+-----+--------------------+



In [17]:
# Shut down the Spark session now that the example is finished.
spark.stop()