In [1]:
import os, sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, split, explode, count, lower

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.master('local[*]').getOrCreate()

class WordCount:
    def createData(self):
        line = """In a world full of #technology, understanding data is key to success. Data analytics, 'AI', and machine learning are transforming industries. Companies are racing to harness the power of data-driven insights; But,  data is messy and comes in various formats - structured, unstructured, and semi-structured. The challenge is to clean, process, and  analyze this data effectively. There's a growing demand for data scientists, analysts, and engineers who can unlock the value hidden within the data."""

        dataDf = spark.createDataFrame([(line,)], ['text'])
        return dataDf

    def wCount(self, inputDf):
        inputDf.createOrReplaceTempView('table')
        query = """with cte as (
                select regexp_replace(text, "[';.,#*-]", ' ') as text
                from table),
                M1 as (select 
                explode(split(lower(text), ' ')) as words 
                from cte)
                select 
                words, count(1)
                from M1 
                where words != '' 
                group by words 
                order by words"""
        return spark.sql(query)

    def wordCount(self, inputDf):
        pattern = "[';.,#*-_]"
        replacement = ' '
        regexDf = inputDf.select(regexp_replace(col('text'), pattern, replacement).alias('words'))
        splitDf = regexDf.select(explode(split(lower('words'), ' ')).alias('words'))
        countDf = splitDf.groupBy('words').agg(count('*').alias('counts'))\
                        .filter(col('words') != '').orderBy('words')
        return countDf


ob = WordCount()
inputDf = ob.createData()
print(inputDf.head())

resultDf = ob.wCount(inputDf)
resultDf.show(100, False)

#resultDf = ob.wordCount(inputDf)
#resultDf.show(100, False)

Row(text="In a world full of #technology, understanding data is key to success. Data analytics, 'AI', and machine learning are transforming industries. Companies are racing to harness the power of data-driven insights; But,  data is messy and comes in various formats - structured, unstructured, and semi-structured. The challenge is to clean, process, and  analyze this data effectively. There's a growing demand for data scientists, analysts, and engineers who can unlock the value hidden within the data.")
+-------------+--------+
|words        |count(1)|
+-------------+--------+
|a            |2       |
|ai           |1       |
|analysts     |1       |
|analytics    |1       |
|analyze      |1       |
|and          |5       |
|are          |2       |
|but          |1       |
|can          |1       |
|challenge    |1       |
|clean        |1       |
|comes        |1       |
|companies    |1       |
|data         |7       |
|demand       |1       |
|driven       |1       |
|effectively  |

In [2]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower, split, explode

os.environ['PYSPARK'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.master('local[*]').getOrCreate()
        
class WordCount:
    
    def createData(self):
        line = """In a world full of #technology, understanding data is key to success. Data analytics, 'AI', and machine learning are transforming industries. Companies are racing to harness the power of data-driven insights; But,  data is messy and comes in various formats - structured, unstructured, and semi-structured. The challenge is to clean, process, and  analyze this data effectively. There's a growing demand for data scientists, analysts, and engineers who can unlock the value hidden within the data."""
        df = spark.createDataFrame([(line,)], ['text'])
        return df
    
    def wordcountSQL(self, inputdf):
        inputDf.createOrReplaceTempView('table')
        query = """
        with cte as (
                select regexp_replace(text, "[';.,#*-]", ' ') as text
                from table),
                
                M1 as (select 
                explode(split(lower(text), ' ')) as words 
                from cte)
                
                select 
                words, count(1)
                from M1 
                where words != '' 
                group by words 
                order by words    
        """
        return spark.sql(query)
    
    def wordCountPyspark(self, inputdf):
        pattern = "[';.,#*-]"
        replacement = ' '
        regexDf = inputdf.select(regexp_replace(col('text'), pattern, replacement).alias('words'))
        splitDf = regexDf.select(explode(split(lower('words'), ' ')).alias('words'))
        countDf = splitDf.groupBy('words').agg(count('*').alias('counts')).filter(col('words')!='').orderBy('words')
        print(countDf.show())
        return countDf
         
    
w1 = WordCount()
inputdf = w1.createData()
#print(inputdf.head())
#inputdf.show()
#print("**********")
#resultdf = w1.wordcountSQL(inputdf)
#resultdf.show(200)
redultDf = w1.wordCountPyspark(inputdf)
redultDf.show()


+-----------+------+
|      words|counts|
+-----------+------+
|          a|     2|
|         ai|     1|
|   analysts|     1|
|  analytics|     1|
|    analyze|     1|
|        and|     5|
|        are|     2|
|        but|     1|
|        can|     1|
|  challenge|     1|
|      clean|     1|
|      comes|     1|
|  companies|     1|
|       data|     7|
|     demand|     1|
|     driven|     1|
|effectively|     1|
|  engineers|     1|
|        for|     1|
|    formats|     1|
+-----------+------+
only showing top 20 rows

None
+-----------+------+
|      words|counts|
+-----------+------+
|          a|     2|
|         ai|     1|
|   analysts|     1|
|  analytics|     1|
|    analyze|     1|
|        and|     5|
|        are|     2|
|        but|     1|
|        can|     1|
|  challenge|     1|
|      clean|     1|
|      comes|     1|
|  companies|     1|
|       data|     7|
|     demand|     1|
|     driven|     1|
|effectively|     1|
|  engineers|     1|
|        for|     1|
|  