In [6]:
print("Hello world")

Hello world


In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
  .master("local[*]")\
  .appName("Jupyter")\
  .config('spark.driver.port', '20002')\
  .getOrCreate()
spark

# Writing basic DF to HDFS and reading it back

In [4]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [11]:
out_path = "hdfs://namenode:8020/mycsv.csv"
df.repartition(1).write.mode("overwrite").option("header", "true").csv(out_path)

In [12]:
df2 = spark.read.csv(out_path)
df2.show()

+---+---+-------+----------+--------------------+
|_c0|_c1|    _c2|       _c3|                 _c4|
+---+---+-------+----------+--------------------+
|  a|  b|      c|         d|                   e|
|  1|2.0|string1|2000-01-01|2000-01-01T12:00:...|
|  2|3.0|string2|2000-02-01|2000-01-02T12:00:...|
|  4|5.0|string3|2000-03-01|2000-01-03T12:00:...|
+---+---+-------+----------+--------------------+



# Reading data locally and writing to HDFS

In [13]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

df3 = spark.read.json("work/data/people.json")
df3.show()

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 15| 12|Michael|
| 30| 15|   Andy|
| 19| 20| Justin|
| 12| 15|   Andy|
| 19| 20|    Jim|
| 12| 10|   Andy|
+---+---+-------+



In [14]:
out_path = "hdfs://namenode:8020/people.json"
df3.write.mode("overwrite").json(out_path)

In [15]:
df4 = spark.read.json(out_path)
df4.show()

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 15| 12|Michael|
| 30| 15|   Andy|
| 19| 20| Justin|
| 12| 15|   Andy|
| 19| 20|    Jim|
| 12| 10|   Andy|
+---+---+-------+

