In [None]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
 # Start Spark session
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise create one named "StopWords".
spark = (
    SparkSession.builder
    .appName("StopWords")
    .getOrCreate()
)

In [3]:
# Build a two-row demo DataFrame: an integer id plus an already-tokenized sentence.
sentenceData = spark.createDataFrame(
    [
        (0, ["Big", "data", "is", "super", "powerful"]),
        (1, ["This", "is", "going", "to", "be", "epic"]),
    ],
    ["id", "raw"],
)

# Print the rows without truncating the token arrays.
sentenceData.show(truncate=False)

+---+--------------------------------+
|id |raw                             |
+---+--------------------------------+
|0  |[Big, data, is, super, powerful]|
|1  |[This, is, going, to, be, epic] |
+---+--------------------------------+



In [None]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover

**Stop words** are very common words that carry little or no meaning on their own in NLP tasks. Removing them from the data reduces noise and can improve the accuracy of a language model, because the tokens that remain are the ones that actually distinguish one text from another.

Examples: *the*, *a*, *and*. More generally, any word can be treated as a stop word if it does not contribute to the meaning of the sentence.

In [None]:
# Configure the remover: read token arrays from "raw", write the result to "filtered".
# Nothing is processed yet; the filtering happens when transform() is called.
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

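`StopWordsRemover` is a transformer, not a plain function. It takes the name of an input column (`inputCol`) holding the token arrays to filter, and the name of an output column (`outputCol`) that will be added to the DataFrame to hold the filtered tokens.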

In [6]:
# Apply the remover (adds the "filtered" column) and print the rows untruncated.
(
    remover
    .transform(sentenceData)
    .show(truncate=False)
)

+---+--------------------------------+----------------------------+
|id |raw                             |filtered                    |
+---+--------------------------------+----------------------------+
|0  |[Big, data, is, super, powerful]|[Big, data, super, powerful]|
|1  |[This, is, going, to, be, epic] |[going, epic]               |
+---+--------------------------------+----------------------------+

