# NLP - Tools

In [1]:
import os
import findspark
# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is portable across machines; fall back to the original local
# install path so the existing setup keeps working unchanged.
findspark.init(os.environ.get("SPARK_HOME") or "/home/rodolfo/spark-3.3.1-bin-hadoop3")
# Keep this import after findspark.init(), which puts Spark's Python
# libraries on sys.path.
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below (app name "nlp_tools").
spark = (
    SparkSession.builder
    .appName("nlp_tools")
    .getOrCreate()
)

23/01/02 13:53:22 WARN Utils: Your hostname, rodolfo-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 10.0.0.107 instead (on interface wlp3s0)
23/01/02 13:53:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/02 13:53:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/02 13:53:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/01/02 13:53:29 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [4]:
# Toy corpus: three rows, each a (label, sentence) pair.
sentence_df = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [5]:
# Preview the raw sentences; long cell values are truncated in the printed table.
sentence_df.show(n=20, truncate=True)

                                                                                

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [7]:
# Split each sentence into a "words" array column, then preview the result.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
words_df = tokenizer.transform(dataset=sentence_df)
words_df.show(n=20, truncate=True)

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [8]:
# Hash every token list into a sparse term-frequency vector ("rawFeatures").
hashing_tf = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)
featurized_df = hashing_tf.transform(dataset=words_df)

In [9]:
# Fit IDF on the term-frequency vectors; `idf` holds the fitted model, which
# then rescales "rawFeatures" into the "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
).fit(dataset=featurized_df)
rescaled_df = idf.transform(dataset=featurized_df)

                                                                                

In [10]:
# Show the raw term-frequency vectors side by side with the IDF-rescaled ones.
rescaled_df.show(n=20, truncate=True)

23/01/02 13:57:25 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
23/01/02 13:57:25 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[18700,19...|(262144,[18700,19...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[19036,20...|(262144,[19036,20...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[46243,58...|(262144,[46243,58...|
+-----+--------------------+--------------------+--------------------+--------------------+



In [11]:
from pyspark.ml.feature import CountVectorizer

In [12]:
# Two pre-tokenized documents for the CountVectorizer example.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

In [13]:
# Preview the tokenized documents.
df.show(n=20, truncate=True)

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [14]:
# Fit the count vectorizer on the toy documents; `cv` is the fitted model.
# vocabSize caps the vocabulary at 3 terms and minDF=2.0 sets the
# document-frequency threshold for a term to be kept.
cv = (
    CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
    .fit(dataset=df)
)

In [15]:
# Append the term-count vector ("features") to each document.
result = cv.transform(dataset=df)

In [16]:
# Print full column contents so the sparse count vectors are readable.
result.show(n=20, truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



23/01/02 19:51:02 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 20921141 ms exceeds timeout 120000 ms
23/01/02 19:51:02 WARN SparkContext: Killing executors is not supported by current scheduler.
