In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("Frankenstein").getOrCreate()

In [2]:
spark

In [3]:
!pwd

/home/jovyan


In [4]:
!ls

 frankensteinNotebook.ipynb  'Spark Intro.ipynb'   work
 frankenstein.txt	      studibier.csv


In [8]:
frankenstein = spark.read.text("frankenstein.txt")

In [9]:
frankenstein.count()

7834

In [10]:
frankenstein

DataFrame[value: string]

In [12]:
frankenstein.printSchema()

root
 |-- value: string (nullable = true)



In [13]:
print(frankenstein.dtypes)

[('value', 'string')]


In [15]:
print(f"ist das Element ein string: {frankenstein.dtypes[0][1] == 'string'}")

ist das Element ein string: True


In [17]:
frankenstein.show(truncate=False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|                                                                         |
|Project Gutenberg's Frankenstein, by Mary Wollstonecraft (Godwin) Shelley|
|                                                                         |
|This eBook is for the use of anyone anywhere at no cost and with         |
|almost no restrictions whatsoever.  You may copy it, give it away or     |
|re-use it under the terms of the Project Gutenberg License included      |
|with this eBook or online at www.gutenberg.net                           |
|                                                                         |
|                                                                         |
|Title: Frankenstein                                                      |
|       or T

In [19]:
from pyspark.sql.functions import split

lines = frankenstein.select(split(frankenstein.value, " "))
lines.show(10, truncate=100)

+----------------------------------------------------------------------------------+
|                                                               split(value,  , -1)|
+----------------------------------------------------------------------------------+
|                                                                                []|
|[Project, Gutenberg's, Frankenstein,, by, Mary, Wollstonecraft, (Godwin), Shelley]|
|                                                                                []|
|   [This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]|
|[almost, no, restrictions, whatsoever., , You, may, copy, it,, give, it, away, or]|
|   [re-use, it, under, the, terms, of, the, Project, Gutenberg, License, included]|
|                            [with, this, eBook, or, online, at, www.gutenberg.net]|
|                                                                                []|
|                                                                

In [20]:
lines = frankenstein.select(split(frankenstein.value, " ").alias("Zeile"))
lines.show(10, truncate=100)

+----------------------------------------------------------------------------------+
|                                                                             Zeile|
+----------------------------------------------------------------------------------+
|                                                                                []|
|[Project, Gutenberg's, Frankenstein,, by, Mary, Wollstonecraft, (Godwin), Shelley]|
|                                                                                []|
|   [This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]|
|[almost, no, restrictions, whatsoever., , You, may, copy, it,, give, it, away, or]|
|   [re-use, it, under, the, terms, of, the, Project, Gutenberg, License, included]|
|                            [with, this, eBook, or, online, at, www.gutenberg.net]|
|                                                                                []|
|                                                                

In [21]:
lines.printSchema()

root
 |-- Zeile: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [22]:
linesWCR = lines.withColumnRenamed("Zeile", "Renamed")
linesWCR.show()

+--------------------+
|             Renamed|
+--------------------+
|                  []|
|[Project, Gutenbe...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Frankens...|
|[, , , , , , , or...|
|                  []|
|[Author:, Mary, W...|
|                  []|
|[Release, Date:, ...|
|[Last, updated:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Character, set, ...|
+--------------------+
only showing top 20 rows



In [23]:
frankenstein.show(100,truncate=False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|                                                                         |
|Project Gutenberg's Frankenstein, by Mary Wollstonecraft (Godwin) Shelley|
|                                                                         |
|This eBook is for the use of anyone anywhere at no cost and with         |
|almost no restrictions whatsoever.  You may copy it, give it away or     |
|re-use it under the terms of the Project Gutenberg License included      |
|with this eBook or online at www.gutenberg.net                           |
|                                                                         |
|                                                                         |
|Title: Frankenstein                                                      |
|       or T

In [24]:
lines = frankenstein.select(split(frankenstein.value, "[^a-zA-Z]").alias("Zeile"))
lines.show(100, truncate=False)

+--------------------------------------------------------------------------------------+
|Zeile                                                                                 |
+--------------------------------------------------------------------------------------+
|[]                                                                                    |
|[Project, Gutenberg, s, Frankenstein, , by, Mary, Wollstonecraft, , Godwin, , Shelley]|
|[]                                                                                    |
|[This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]       |
|[almost, no, restrictions, whatsoever, , , You, may, copy, it, , give, it, away, or]  |
|[re, use, it, under, the, terms, of, the, Project, Gutenberg, License, included]      |
|[with, this, eBook, or, online, at, www, gutenberg, net]                              |
|[]                                                                                    |
|[]                  

In [26]:
# Spalte selektieren geht auf verschiedene weisen
lines.select(lines.Zeile).show()
lines.select("Zeile").show()

from pyspark.sql.functions import col
lines.select(col("Zeile")).show()
# lines["Zeile"] ist nur eine spalte und keine Dataframe also geht lines["Zeile"].show() nicht, aber folgendes:
lines.select(lines["Zeile"]).show()

+--------------------+
|               Zeile|
+--------------------+
|                  []|
|[Project, Gutenbe...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re, use, it, und...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title, , Franken...|
|[, , , , , , , or...|
|                  []|
|[Author, , Mary, ...|
|                  []|
|[Release, Date, ,...|
|[Last, updated, ,...|
|                  []|
|[Language, , Engl...|
|                  []|
|[Character, set, ...|
+--------------------+
only showing top 20 rows

+--------------------+
|               Zeile|
+--------------------+
|                  []|
|[Project, Gutenbe...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re, use, it, und...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title, , Franken...|
|[, , , , , , , or...|
|                  []|
|[Author, , Mary, ...|
|                  []|
|[Release, Date, ,...|
|[Last, 

# Exploding list of words into ROWS (nicht COLS)

In [27]:
from pyspark.sql.functions import explode, col

In [28]:
words = lines.select(explode(col("zeile")).alias("word"))

In [29]:
words.show()

+--------------+
|          word|
+--------------+
|              |
|       Project|
|     Gutenberg|
|             s|
|  Frankenstein|
|              |
|            by|
|          Mary|
|Wollstonecraft|
|              |
|        Godwin|
|              |
|       Shelley|
|              |
|          This|
|         eBook|
|            is|
|           for|
|           the|
|           use|
+--------------+
only showing top 20 rows



In [30]:
from pyspark.sql.functions import lower
words_lower = words.select(lower(col("word")).alias("word_lower"))
words_lower.show(truncate=False)

+--------------+
|word_lower    |
+--------------+
|              |
|project       |
|gutenberg     |
|s             |
|frankenstein  |
|              |
|by            |
|mary          |
|wollstonecraft|
|              |
|godwin        |
|              |
|shelley       |
|              |
|this          |
|ebook         |
|is            |
|for           |
|the           |
|use           |
+--------------+
only showing top 20 rows



In [32]:
from pyspark.sql.functions import regexp_extract
words_real = words_lower.select(regexp_extract(col("word_lower"), "[a-z]{2,}|a|i", 0).alias("real_word"))
words_real.show()

+--------------+
|     real_word|
+--------------+
|              |
|       project|
|     gutenberg|
|              |
|  frankenstein|
|              |
|            by|
|          mary|
|wollstonecraft|
|              |
|        godwin|
|              |
|       shelley|
|              |
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
+--------------+
only showing top 20 rows



In [33]:
propper_words = words_real.filter(col("real_word") != "")

In [34]:
propper_words.show()

+--------------+
|     real_word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



In [35]:
propper_wordsWHERE = words_real.where(col("real_word") != "")
propper_wordsWHERE.show()

+--------------+
|     real_word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



In [38]:
from pyspark.sql.functions import length
min3Zeichen = words_real.where(length(col("real_word")) > 3)
min3Zeichen.show()

+--------------+
|     real_word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|        anyone|
|      anywhere|
|          cost|
|          with|
|        almost|
|  restrictions|
|    whatsoever|
|          copy|
|          give|
|          away|
|         under|
+--------------+
only showing top 20 rows



In [39]:
propper_words_any = propper_words.filter(col("real_word") != "any*")
propper_words_any.show()

+--------------+
|     real_word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



# Aufgabe 1: erstelle einen Block "sauberen" Code, der Schritt für Schritt alle Einzelschritte ausführt

In [40]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("Frankenstein2").getOrCreate()

from pyspark.sql.functions import col, split, explode, lower, regexp_extract

frank = spark.read.text("frankenstein.txt")

lines = frank.select(split(frank.value, " ").alias("line"))

words = lines.select(explode(col("line")).alias("word"))

words_lower = words.select(lower(col("word")).alias("word_lower"))

words_clean = words_lower.select(regexp_extract(col("word_lower"), "[a-z]{2+}|a|i", 0).alias("word"))

words_nonull = words_clean.where(col("word") != "")

In [89]:
#1b: ändere den Code von eben so, dass:
#    ->a) das wort is aus dem gesamten Text entfernt wird

words_nois = words_nonull.where(col("word") != "is")
words_nois.show()
#    ->b) nur wörter mit der mindestlänge von 3 Zeichen beibehalten werden
from pyspark.sql.functions import length
words_min3 = words_lower.where(length(col("word")) >= 3)
words_min3.show()

words_min3_alt = words_lower.select(regexp_extract(col("word_lower"), "[a-z]{3,}|a|i", 0).alias("word"))

+--------------+
|          word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
|           and|
+--------------+
only showing top 20 rows

+--------------+
|    word_lower|
+--------------+
|       project|
|   gutenberg's|
| frankenstein,|
|          mary|
|wollstonecraft|
|      (godwin)|
|       shelley|
|          this|
|         ebook|
|           for|
|           the|
|           use|
|        anyone|
|      anywhere|
|          cost|
|           and|
|          with|
|        almost|
|  restrictions|
|   whatsoever.|
+--------------+
only showing top 20 rows



# Aufgabe 2: finde programmatisch heraus, wie viele Spalten KEINE Strings sind:

In [46]:
datenA2 = spark.createDataFrame([["test", "noch ein test", 10_000_000_000]], ["1","2","3"])
datenA2.printSchema()

root
 |-- 1: string (nullable = true)
 |-- 2: string (nullable = true)
 |-- 3: long (nullable = true)



In [47]:
datenA2.show()

+----+-------------+-----------+
|   1|            2|          3|
+----+-------------+-----------+
|test|noch ein test|10000000000|
+----+-------------+-----------+



In [48]:
cnt = 0
for x,y in datenA2.dtypes:
    if y != 'string':
        cnt+=1
print(f'cnt = {cnt}')

cnt = 1


# Aufgabe 3: mache den Code lesbar
datenA3 = spark.read.text("frankenstein.txt").select(length(col("value"))).withColumnRenamed("length(value)", "numChar")


In [49]:
datenA3 = spark.read.text("frankenstein.txt").select(length(col("value"))).withColumnRenamed("length(value)", "numChar")
datenA3.show()

+-------+
|numChar|
+-------+
|      0|
|     73|
|      0|
|     64|
|     68|
|     67|
|     46|
|      0|
|      0|
|     19|
|     31|
|      0|
|     44|
|      0|
|     39|
|     30|
|      0|
|     17|
|      0|
|     29|
+-------+
only showing top 20 rows



In [51]:
datenA3_lösung = spark.read.text("frankenstein.txt").select(length(col("value")).alias("numChar"))
datenA3_lösung.show()

+-------+
|numChar|
+-------+
|      0|
|     73|
|      0|
|     64|
|     68|
|     67|
|     46|
|      0|
|      0|
|     19|
|     31|
|      0|
|     44|
|      0|
|     39|
|     30|
|      0|
|     17|
|      0|
|     29|
+-------+
only showing top 20 rows



# Aufgabe 4: Problem im Code reparieren


In [52]:
# siehe Musterlösung
datenA4 = spark.createDataFrame([["key", 20_000_000, 10_000_000_000]], ["key", "value1", "value2"])
datenA4.printSchema()
datenA4.show()

root
 |-- key: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: long (nullable = true)

+---+--------+-----------+
|key|  value1|     value2|
+---+--------+-----------+
|key|20000000|10000000000|
+---+--------+-----------+



In [57]:
from pyspark.sql.functions import greatest
from pyspark.sql.utils import AnalysisException

# pipeline erzeugt neuen dataframe und spalten aus dem alten sind nicht mitgegeben -> key gibt es nicht mehr
try:
    datenA4M = datenA4.select(greatest(col("value1"), col("value2")).alias("maxVal")).select("key", "maxVal")
except AnalysisException as err:
    print(f'das war nicht gut {err}')
datenA4M.show()

das war nicht gut Column 'key' does not exist. Did you mean one of the following? [maxVal];
'Project ['key, maxVal#273L]
+- Project [greatest(value1#245L, value2#246L) AS maxVal#273L]
   +- LogicalRDD [key#244, value1#245L, value2#246L], false

+-----------+
|     maxVal|
+-----------+
|10000000000|
+-----------+



In [59]:
# problemlösung
from pyspark.sql.functions import greatest
from pyspark.sql.utils import AnalysisException

# pipeline erzeugt neuen dataframe und spalten aus dem alten sind nicht mitgegeben -> key gibt es nicht mehr
try:
    datenA4M = datenA4.select(col("key"),greatest(col("value1"), col("value2")).alias("maxVal"))
except AnalysisException as err:
    print(f'das war nicht gut {err}')
datenA4M.show()

+---+-----------+
|key|     maxVal|
+---+-----------+
|key|10000000000|
+---+-----------+



# Aufgabe 5: filtere einen ganzen Haufen Wörter raus
mit Hilfe der Funktion isin: filtere die Wörter is, not, if, the aus dem Text

In [73]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("Frankenstein2").getOrCreate()

from pyspark.sql.functions import col, split, explode, lower, regexp_extract

frank = spark.read.text("frankenstein.txt")

lines = frank.select(split(frank.value, " ").alias("line"))

words = lines.select(explode(col("line")).alias("word"))

words_lower = words.select(lower(col("word")).alias("word_lower"))

words_clean = words_lower.select(regexp_extract(col("word_lower"), "[a-z]{2,}|a|i", 0).alias("word"))

words_nonull = words_clean.where(col("word") != "")
words_nonull.show()

+--------------+
|          word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



In [74]:
a5 = words_nonull.where(~col("word").isin(["is","not","if","the"]))
a5.show()

+--------------+
|          word|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|           for|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
|           and|
|          with|
+--------------+
only showing top 20 rows



# Aufgabe 6: Debugging
finde den Fehler im Code und repariere ihn so, dass der Code wie wohl erwartet funktioniert

In [80]:
from pyspark.sql.functions import col, split
from pyspark.sql.utils import AnalysisException

try:
    book = spark.read.text("frankenstein.txt")
    shema = book.printSchema()
    lines= book.select(split(book.value, " ").alias("line"))
    words = lines.select(explode(col("line")).alias("word"))

except AnalysisException as err:
    print(err)

words.show()

root
 |-- value: string (nullable = true)

+--------------+
|          word|
+--------------+
|              |
|       Project|
|   Gutenberg's|
| Frankenstein,|
|            by|
|          Mary|
|Wollstonecraft|
|      (Godwin)|
|       Shelley|
|              |
|          This|
|         eBook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
+--------------+
only showing top 20 rows



# Gruppieren
Ziel: zählen, wie oft jedes Wort vorkommt. geht count() alleine?

In [81]:
# nur count geht nicht
words_nonull.count()

77907

In [82]:
groups = words_nonull.groupby(col("word"))
print(groups)

<pyspark.sql.group.GroupedData object at 0x7fbb5d5cfeb0>


In [84]:
wordCount = groups.count()
wordCount.show()

+-------------+-----+
|         word|count|
+-------------+-----+
|       online|    4|
|        those|   92|
|         some|  148|
|       voyage|   15|
|       harder|    1|
|        still|   68|
|     painters|    1|
|   lieutenant|    2|
|          few|   62|
|          fog|    1|
|       travel|    4|
|          art|    7|
|    arguments|    7|
|         hope|   51|
|gratification|    3|
| imperatively|    1|
|        inner|    1|
|apprehensions|    2|
|    connected|    7|
|   circulates|    1|
+-------------+-----+
only showing top 20 rows



In [88]:
wordCount.sort("count",ascending = False).show()

+-----+-----+
| word|count|
+-----+-----+
|  the| 4364|
|  and| 3040|
|    i| 2843|
|   of| 2757|
|   to| 2172|
|   my| 1773|
|    a| 1442|
|   in| 1184|
| that| 1030|
|  was| 1022|
|   me|  868|
| with|  713|
|  but|  690|
|  had|  686|
|  you|  642|
|   he|  607|
|which|  565|
|   it|  562|
|   as|  537|
|  his|  533|
+-----+-----+
only showing top 20 rows



# Aufgabe: finde die Anzahl der Worte per Anzahl Buchstaben (also: wie viele Worte mit 1, 2, 3, ... Buchstaben)

In [92]:
from pyspark.sql.functions import length
words_nonull.select(length(col("word")).alias("length")).groupBy("length").count().orderBy(col("length").asc()).show()

+------+-----+
|length|count|
+------+-----+
|     1| 4285|
|     2|14235|
|     3|16731|
|     4|12081|
|     5| 7935|
|     6| 6549|
|     7| 5570|
|     8| 3785|
|     9| 3302|
|    10| 1751|
|    11|  985|
|    12|  458|
|    13|  157|
|    14|   58|
|    15|   19|
|    16|    5|
|    18|    1|
+------+-----+



In [93]:
words_nonull.write.csv("dings.csv") # -> kann unter umständen in mehrere Dateien gespeichert werden, wenn zu viele Daten, hier gibt es aber eine hilfsfunktion (siehe seine Lösung) 