In [1]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = SparkSession.builder.appName("word-count-sql").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [2]:
df = spark.read.text("book.txt")

In [3]:
df.createOrReplaceTempView("lines")

In [7]:
spark.sql("""
select split(value, "\\W+")
  from lines
""").show()

+--------------------+
|split(value, W+, -1)|
+--------------------+
|[Self-Employment:...|
|[Achieving Financ...|
|     [By Frank Kane]|
|                  []|
|                  []|
|                  []|
|[Copyright � 2015...|
|[All rights reser...|
|                  []|
|                  []|
|          [CONTENTS]|
|        [Disclaimer]|
|           [Preface]|
|[Part I: Making t...|
|[Overcoming Inertia]|
|   [Fear of Failure]|
|[Career Indoctrin...|
|[The Carrot on a ...|
|    [Ego Protection]|
|[Your Employer as...|
+--------------------+
only showing top 20 rows



In [8]:
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [9]:
spark.sql("""
select split(value, "\\W+") as array
  from lines
""")

array
[Self-Employment:...
[Achieving Financ...
[By Frank Kane]
[]
[]
[]
[Copyright � 2015...
[All rights reser...
[]
[]


In [16]:
spark.sql("""
select explode(array(10, 20)) as word
""")

word
10
20


In [49]:
spark.conf.get('spark.sql.parser.escapedStringLiterals')

'false'

In [48]:
spark.sql("""
SELECT split('ala 5ma 4kora', '\\\\W+')
""")

"split(ala 5ma 4kora, \W+, -1)"
"[ala, 5ma, 4kora]"


In [53]:
spark.sql("""
with words as (
  select explode(split(lower(value), '\\\\W+')) as word
    from lines
), 

trimmed as (
  select trim(word) as word
    from words
   where trim(word) != ''
)

select word, count(*) as count
  from trimmed
 group by word
 order by count(*) desc
""")

word,count
you,1878
to,1828
your,1420
the,1292
a,1191
of,970
and,934
that,747
it,649
in,616
