In [1]:
# findspark locates the local Spark installation and makes its bundled
# pyspark package importable, so init() has to run before any pyspark import.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Reuse the active SparkSession if there is one; otherwise start a new
# session registered under this application name.
spark = (
    SparkSession.builder
    .appName("Analyzing the vocabulary of Pride and Prejudice.")
    .getOrCreate()
)

In [4]:
# Set the log level to WARN so that only messages at WARN level or above are emitted.
spark.sparkContext.setLogLevel("WARN")

In [5]:
# IPython help syntax: displays the type, string form, source file and docstring of `spark`.
spark?

Type:        SparkSession
String form: <pyspark.sql.session.SparkSession object at 0x00000221454D9B40>
File:        c:\pyspark\spark-3.2.3-bin-hadoop3.2\python\pyspark\sql\session.py
Docstring:
The entry point to programming Spark with the Dataset and DataFrame API.

A SparkSession can be used create :class:`DataFrame`, register :class:`DataFrame` as
tables, execute SQL over tables, cache tables, and read parquet files.
To create a :class:`SparkSession`, use the following builder pattern:

.. autoattribute:: builder
   :annotation:

Examples
--------
>>> spark = SparkSession.builder \
...     .master("local") \
...     .appName("Word Count") \
...     .config("spark.some.config.option", "some-value") \
...     .getOrCreate()

>>> from datetime import datetime
>>> from pyspark.sql import Row
>>> spark = SparkSession(sc)
>>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1)

In [6]:
# Load the book as a DataFrame: the text reader puts each line of the file
# into one row of a single string column named "value".
# The path is relative, so the file is looked up from the current working directory.
book = spark.read.text("./1342-0.txt")

In [7]:
# column_1 = book.select(book["value"])
# column_2 = book.select(book.value)

In [8]:
# column_1.show(10, truncate = 70)
# column_2.show(10, truncate = 70)

In [9]:
# book.show(15, truncate=100)

In [10]:
from pyspark.sql.functions import col, split, explode, lower, regexp_extract

In [11]:
# import pyspark.sql.functions as F

# used functions:
#     1. col
#     2. explode
#     3. lower
#     4. split
#     5. regexp_extract

In [12]:
# Tokenise: split every line of text on single spaces, which yields one
# array of tokens per input row in a column called "line".
lines = book.select(
    split(col("value"), " ").alias("line")
)

In [13]:
#lines = book.select(split(book.value, " ").alias("line"))

In [14]:
#lines = book.select(split(book.value, " ").alias("line"))

In [15]:
# Preview the first 10 rows; long cell values are cut off in the printed table.
lines.show(10)

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
+--------------------+
only showing top 10 rows



In [16]:
# Flatten: explode() emits one output row per array element, so every token
# ends up in its own row. The resulting column is named "book"; the
# lower-casing step further down reads it under that name.
words = lines.select(
    explode(col("line")).alias("book")
)

In [17]:
# Preview the first 5 exploded tokens.
words.show(5)

+---------+
|     book|
+---------+
|      The|
|  Project|
|Gutenberg|
|    EBook|
|       of|
+---------+
only showing top 5 rows



In [18]:
# Normalise case so that variants such as "The" and "the" become the same token.
lower_words = words.select(
    lower(col("book")).alias("lower_word")
)

In [19]:
# Preview the first 15 lower-cased tokens (punctuation and empty tokens are still present).
lower_words.show(15)

+----------+
|lower_word|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
+----------+
only showing top 15 rows



In [20]:
# Keep only the first run of the letters a-z found in each token (group 0 is
# the whole match). This strips punctuation such as the comma in "prejudice,";
# a token containing no letters at all yields the empty string.
clean_words = lower_words.select(
    regexp_extract(col("lower_word"), "[a-z]+", 0).alias("clean_word")
)

# Drop the empty strings produced by blank or letter-free tokens.
remove_spaces = clean_words.where(col("clean_word") != "")

In [21]:
# Preview the first 15 cleaned, non-empty words.
remove_spaces.show(15)

+----------+
|clean_word|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
| prejudice|
|        by|
|      jane|
|    austen|
|      this|
|     ebook|
|        is|
|       for|
+----------+
only showing top 15 rows



In [22]:
# Same non-empty filter over clean_words as the one bound to `remove_spaces`
# earlier, stored here under a second name.
removed_spaces = clean_words.where(
    col("clean_word") != ""
)

In [23]:
# Preview the first 15 rows of removed_spaces.
removed_spaces.show(15)

+----------+
|clean_word|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
| prejudice|
|        by|
|      jane|
|    austen|
|      this|
|     ebook|
|        is|
|       for|
+----------+
only showing top 15 rows

