In [1]:
! pip install pyspark



In [2]:
## IMPORT NECESSARY LIBRARIES
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import col


In [3]:
## START (OR REUSE) A SPARK SESSION FOR THIS NOTEBOOK
## master("local") is passed to the builder together with the application name below
spark = (
    SparkSession.builder
    .master("local")
    .appName("TextPreprocessingTFIDF")
    .getOrCreate()
)

In [4]:
## SAMPLE CORPUS: FIVE SHORT SENTENCES ENTERED BY HAND
## enumerate() pairs each sentence with its 0-based position, giving (id, text) tuples
data = list(enumerate([
    "I love India",
    "India is the best country",
    "PySpark makes it easy to work with data",
    "I enjoy learning new technologies like Spark",
    "Python and Spark are great for data science",
]))


In [5]:
## COLUMN NAMES FOR THE DATAFRAME, IN THE SAME ORDER AS THE FIELDS OF EACH TUPLE IN data
columns = [
    "id",
    "text",
]

In [6]:
## BUILD THE SPARK DATAFRAME FROM THE TUPLES, USING columns AS THE SCHEMA (COLUMN NAMES)
df = spark.createDataFrame(data, schema=columns)

## PRINT THE ROWS AS A TEXT TABLE TO CHECK THE LOAD
df.show()


+---+--------------------+
| id|                text|
+---+--------------------+
|  0|        I love India|
|  1|India is the best...|
|  2|PySpark makes it ...|
|  3|I enjoy learning ...|
|  4|Python and Spark ...|
+---+--------------------+



In [7]:
## SPLIT EACH SENTENCE INTO A LIST OF WORD TOKENS
## Reads the "text" column and appends a "words" column; the tokens come out lower-cased (see the output below)
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
df_tokens = tokenizer.transform(df)

## INSPECT THE TOKENIZED RESULT
df_tokens.show()


+---+--------------------+--------------------+
| id|                text|               words|
+---+--------------------+--------------------+
|  0|        I love India|    [i, love, india]|
|  1|India is the best...|[india, is, the, ...|
|  2|PySpark makes it ...|[pyspark, makes, ...|
|  3|I enjoy learning ...|[i, enjoy, learni...|
|  4|Python and Spark ...|[python, and, spa...|
+---+--------------------+--------------------+



In [8]:
## DROP STOP WORDS FROM THE TOKEN LISTS
## No custom stop-word list is supplied, so the remover's default list applies;
## the result is appended as "filtered_words" (e.g. "i", "is", "the" disappear in the output below)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
df_filtered = remover.transform(df_tokens)

## INSPECT THE FILTERED RESULT
df_filtered.show()


+---+--------------------+--------------------+--------------------+
| id|                text|               words|      filtered_words|
+---+--------------------+--------------------+--------------------+
|  0|        I love India|    [i, love, india]|       [love, india]|
|  1|India is the best...|[india, is, the, ...|[india, best, cou...|
|  2|PySpark makes it ...|[pyspark, makes, ...|[pyspark, makes, ...|
|  3|I enjoy learning ...|[i, enjoy, learni...|[enjoy, learning,...|
|  4|Python and Spark ...|[python, and, spa...|[python, spark, g...|
+---+--------------------+--------------------+--------------------+



In [9]:
## TERM FREQUENCY (TF): HASH EACH FILTERED WORD INTO A FIXED-LENGTH SPARSE COUNT VECTOR
## numFeatures=20 sets the vector length, i.e. the number of hash buckets.
## NOTE(review): with only 20 buckets, unrelated words collide (in the output below, document 2 shares
## indices 0 and 1 with document 0 although they have no word in common); Spark's documentation also
## advises a power of two for numFeatures. Kept at 20 here so the recorded output stays valid.
## Despite its name, df_tfidf holds raw term frequencies only; the IDF weighting is applied in the next step.
hashing_tf = HashingTF(
    inputCol="filtered_words",
    outputCol="raw_features",
    numFeatures=20,
)
df_tfidf = hashing_tf.transform(df_filtered)

## INSPECT THE RAW TERM-FREQUENCY VECTORS
df_tfidf.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|                text|               words|      filtered_words|        raw_features|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|        I love India|    [i, love, india]|       [love, india]|(20,[0,1],[1.0,1.0])|
|  1|India is the best...|[india, is, the, ...|[india, best, cou...|(20,[1,3,10],[1.0...|
|  2|PySpark makes it ...|[pyspark, makes, ...|[pyspark, makes, ...|(20,[0,1,7,13,15]...|
|  3|I enjoy learning ...|[i, enjoy, learni...|[enjoy, learning,...|(20,[3,5,6,10,12]...|
|  4|Python and Spark ...|[python, and, spa...|[python, spark, g...|(20,[3,6,9,10,15]...|
+---+--------------------+--------------------+--------------------+--------------------+



In [10]:
## INVERSE DOCUMENT FREQUENCY (IDF): RE-WEIGHT THE RAW TERM FREQUENCIES INTO TF-IDF SCORES
## IDF is an estimator: fit() learns the document frequencies from the corpus,
## and the fitted model's transform() then writes the weighted vectors to "features"
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)
idf_model = idf.fit(df_tfidf)
df_tfidf_final = idf_model.transform(df_tfidf)

## SHOW ONLY THE ID, THE ORIGINAL TEXT AND THE FINAL TF-IDF VECTOR
(
    df_tfidf_final
    .select("id", "text", "features")
    .show()
)

+---+--------------------+--------------------+
| id|                text|            features|
+---+--------------------+--------------------+
|  0|        I love India|(20,[0,1],[0.6931...|
|  1|India is the best...|(20,[1,3,10],[0.4...|
|  2|PySpark makes it ...|(20,[0,1,7,13,15]...|
|  3|I enjoy learning ...|(20,[3,5,6,10,12]...|
|  4|Python and Spark ...|(20,[3,6,9,10,15]...|
+---+--------------------+--------------------+



In [11]:
## STOPPING THE SPARK SESSION
## Last step of the notebook: shut the session down once all DataFrame work above is done.
## After this call, `spark` (and the DataFrames built from it) cannot be used again without re-running the session cell.
spark.stop()