# NLP tools: tokenization, stop-word removal and n-grams with PySpark

In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook also runs on machines other than the author's; fall back to the
# original local install path when the variable is not set.
findspark.init(os.environ.get("SPARK_HOME", "/home/rodolfo/spark-3.3.1-bin-hadoop3"))
# Imported after findspark.init(), as in the original cell.
from pyspark.sql import SparkSession

In [3]:
# Reuse an already running session if there is one, otherwise start a new one
# under the application name "nlp_tools".
spark = (
    SparkSession.builder
    .appName("nlp_tools")
    .getOrCreate()
)

23/01/02 13:38:15 WARN Utils: Your hostname, rodolfo-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 10.0.0.107 instead (on interface wlp3s0)
23/01/02 13:38:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/02 13:38:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/02 13:38:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/01/02 13:38:20 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [5]:
# Toy input: two space-separated sentences and one comma-separated sentence,
# so the two tokenizers below can be compared on the same data.
sen_rows = [
    (0, "Hi I heard about spark"),
    (1, "I wish jave could use case classes"),
    (2, "Logistic,regression,models,are,neat"),
]
sen_df = spark.createDataFrame(sen_rows, schema=["id", "sentence"])

In [6]:
# Preview the raw sentences; show() abbreviates long strings with "..." by default.
sen_df.show()

                                                                                

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish jave could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [7]:
# Plain tokenizer: lower-cases the text and splits it on whitespace.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

# Regex tokenizer: the pattern acts as the delimiter, so "\\W" splits on every
# non-word character (commas as well as spaces).
regex_tokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

In [8]:
def _token_count(words):
    """Return the number of tokens in ``words``.

    Args:
        words: A list of tokens, or ``None`` when the column value is null.

    Returns:
        The list length as an int, or ``None`` for a null input so the result
        column stays null instead of failing the job.
    """
    # Spark hands SQL NULL to Python UDFs as None; len(None) would raise TypeError.
    if words is None:
        return None
    return len(words)


# Column-level UDF mapping an array-of-tokens column to its integer length.
count_tokens = udf(_token_count, IntegerType())

In [9]:
# Apply the whitespace tokenizer; this adds a "words" array column next to "sentence".
tokenized = tokenizer.transform(sen_df)

In [10]:
# Inspect the tokens: row 2 stays a single element because it contains no whitespace.
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish jave could...|[i, wish, jave, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [11]:
# Add a per-row token count; the comma-separated sentence counts as one token
# here because the whitespace tokenizer never splits on commas.
(
    tokenized
    .withColumn("tokens", count_tokens(col("words")))
    .show()
)

                                                                                

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish jave could...|[i, wish, jave, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [13]:
# Apply the regex tokenizer to the same input so the two results can be compared.
rg_tokenized = regex_tokenizer.transform(sen_df)

In [14]:
# Row 2 is now split at the commas as well.
rg_tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish jave could...|[i, wish, jave, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [15]:
# Same token count as before, now on the regex-tokenized frame: the
# comma-separated sentence yields five tokens instead of one.
(
    rg_tokenized
    .withColumn("tokens", count_tokens(col("words")))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish jave could...|[i, wish, jave, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [16]:
from pyspark.ml.feature import StopWordsRemover

In [19]:
# Rows are already tokenized, so they can be fed to the stop-word remover directly.
token_rows = [
    (0, ["I", "saw", "the", "green", "horse"]),
    (1, ["Mary", "had", "a", "little", "lamb"]),
]
sentence_df = spark.createDataFrame(token_rows, schema=["id", "tokens"])

In [20]:
# Preview the pre-tokenized input for the stop-word remover.
sentence_df.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [21]:
# No stopWords argument is passed, so the transformer falls back to its default stop-word list.
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")

In [22]:
# "filtered" keeps only the non-stop-word tokens; here "I", "the", "had" and "a" are dropped.
remover.transform(sentence_df).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



## N-gram

In [23]:
from pyspark.ml.feature import NGram

In [25]:
# Split each sentence on single spaces and pair it with its position as the id.
ngram_sentences = [
    "Hi I heard about spark",
    "I wish Java could use case classes",
    "Logistic regression models are neat",
]
word_rows = [
    (row_id, sentence.split(" "))
    for row_id, sentence in enumerate(ngram_sentences)
]
word_df = spark.createDataFrame(word_rows, ["id", "words"])

In [26]:
# Preview the word arrays that feed the n-gram transformer.
word_df.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|[Hi, I, heard, ab...|
|  1|[I, wish, Java, c...|
|  2|[Logistic, regres...|
+---+--------------------+



In [27]:
# n=2 yields bigrams: each output element is two consecutive words joined by a space.
ngram = NGram(n=2, inputCol="words", outputCol="grams")

In [28]:
# Show only the generated bigrams; truncate=False prints each list in full
# instead of abbreviating it.
(
    ngram
    .transform(word_df)
    .select("grams")
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

