# DataFrameWriter

http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter

    # General writer pattern. The save mode is set with .mode(), not with
    # .option("mode", ...): writer options are passed to the data source and
    # do not change the save mode.
    (df.write.format("csv")
     .mode("overwrite")  # append, overwrite, ignore, error or errorifexists (default)
     .option("dateFormat", "yyyy-MM-dd")
     .option("path", "path/to/file(s)")
     .save())

## Spark Session

In [1]:
from pyspark.sql import SparkSession
# Get or create a SparkSession named "Spark session" on a local master.
spark = SparkSession.builder.master("local").appName("Spark session").getOrCreate()

## Prepare Data for Writing

In [2]:
# Load the sample parquet file into a DataFrame, then preview its rows.
df = spark.read.parquet(
    'data/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet'
)

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

## Write CSV
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.csv

In [15]:
# Write the DataFrame as bzip2-compressed CSV with a header row,
# replacing whatever already exists at the target path.
csv_options = {
    'path': 'write/out.csv',
    'mode': 'overwrite',
    'compression': 'bzip2',
    'sep': ',',
    'header': 'true',
    'dateFormat': 'yyyy-MM-dd',
    'encoding': 'UTF-8',
}
df.write.csv(**csv_options)

## Write JSON
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.json

In [9]:
# Write the DataFrame as gzip-compressed, newline-delimited JSON.
json_options = {
    'path': 'write/out.json',
    # append, overwrite, ignore, error or errorifexists (default)
    'mode': 'overwrite',
    'compression': 'gzip',
    'dateFormat': 'yyyy-MM-dd',
    'encoding': 'UTF-8',
    'lineSep': '\n',
}
df.write.json(**json_options)

## Write ORC
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.orc

In [12]:
# Write snappy-compressed ORC, one sub-directory per DEST_COUNTRY_NAME value.
(df.write
 .mode('overwrite')
 .partitionBy('DEST_COUNTRY_NAME')
 .orc('write/out.orc', compression='snappy'))

## Write Parquet
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.parquet

In [19]:
# Write snappy-compressed Parquet, partitioned by destination country.
parquet_options = {
    'path': 'write/out.parquet',
    'mode': 'overwrite',
    'partitionBy': 'DEST_COUNTRY_NAME',
    'compression': 'snappy',
}
df.write.parquet(**parquet_options)

## Write Text
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.text

In [74]:
from pyspark.sql.types import StringType
# The text writer accepts only a single column of string type. Here "count"
# is cast to string and used as the partition column, which leaves
# DEST_COUNTRY_NAME as the one column written to the files.
text_ready = df.selectExpr("DEST_COUNTRY_NAME", "cast(count as string) count")
text_ready.write.partitionBy("count").mode('overwrite').text(path='write/out.txt')

## Write Table
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.saveAsTable

In [124]:
# Save the DataFrame as the parquet-backed table "test_table",
# overwriting the table if it already exists.
df.write.format('parquet').mode('overwrite').saveAsTable('test_table')

# for example in HIVE

In [125]:
! ls spark-warehouse/test_table/

_SUCCESS
part-00000-b4945019-12d4-4561-8a49-a1d3bfc6d11c-c000.snappy.parquet


In [119]:
# Save the DataFrame as the CSV-backed table "my_table", overwriting it if present.
df.write.saveAsTable('my_table', format='csv', mode='overwrite')

In [123]:
! ls spark-warehouse/my_table/

_SUCCESS
part-00000-fe18ec64-e00a-4bbe-a880-6058cc532ae9-c000.csv


## Write a specific Format

In [102]:
%%writefile pyfiles/avro_write.py

from pyspark.sql import SparkSession

# Stand-alone job: read the sample parquet file and rewrite it as Avro.
spark = (SparkSession.builder
         .master("local")
         .appName("Spark session")
         .getOrCreate())

try:
    # prepare data
    df = (spark
          .read
          .parquet('data/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet'))

    # write data; the "avro" format comes from the external spark-avro
    # package, which must be supplied at submit time (spark-submit --packages)
    (df.write
     .format('avro')
     .mode('overwrite')
     .save('write/out.avro'))
finally:
    # always release the session, even when the read or the write fails
    spark.stop()



Overwriting pyfiles/avro_write.py


In [106]:
! spark-submit \
--packages org.apache.spark:spark-avro_2.11:2.4.3 \
pyfiles/avro_write.py

Ivy Default Cache set to: /Users/esn/.ivy2/cache
The jars for the packages stored in: /Users/esn/.ivy2/jars
:: loading settings :: url = jar:file:/Users/esn/anaconda3/lib/python3.7/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-avro_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-af76dc26-83a1-407b-9322-cd0480c9fbc9;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.11;2.4.3 in central
	found org.spark-project.spark#unused;1.0.0 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.11/2.4.3/spark-avro_2.11-2.4.3.jar ...
	[SUCCESSFUL ] org.apache.spark#spark-avro_2.11;2.4.3!spark-avro_2.11.jar (159ms)
:: resolution report :: resolve 1970ms :: artifacts dl 164ms
	:: modules in use:
	org.apache.spark#spark-avro_2.11;2.4.3 from central in [default]
	org.spark-project.spark#unused;1.0.0 from central in [default]
	----------------------------------

## Write N Separate Files

In [110]:
# Repartition into 5 partitions first so the writer emits 5 part files.
df.repartition(5).write.save('write/repartitioned.csv', format='csv', mode='overwrite')

In [111]:
! ls write/repartitioned.csv

_SUCCESS
part-00000-9d4a5a81-94ca-479c-a0db-d15b72baa2d8-c000.csv
part-00001-9d4a5a81-94ca-479c-a0db-d15b72baa2d8-c000.csv
part-00002-9d4a5a81-94ca-479c-a0db-d15b72baa2d8-c000.csv
part-00003-9d4a5a81-94ca-479c-a0db-d15b72baa2d8-c000.csv
part-00004-9d4a5a81-94ca-479c-a0db-d15b72baa2d8-c000.csv


## Partitioning

Partitioning lets the data be read in parallel. Drawback: it can produce too many small files.

In [112]:
# Write CSV with one sub-directory per distinct DEST_COUNTRY_NAME value.
df.write.save('write/partitioned.csv',
              format='csv',
              mode='overwrite',
              partitionBy='DEST_COUNTRY_NAME')

In [126]:
! ls write/partitioned.csv | head -10

DEST_COUNTRY_NAME=Afghanistan
DEST_COUNTRY_NAME=Angola
DEST_COUNTRY_NAME=Anguilla
DEST_COUNTRY_NAME=Antigua and Barbuda
DEST_COUNTRY_NAME=Argentina
DEST_COUNTRY_NAME=Aruba
DEST_COUNTRY_NAME=Australia
DEST_COUNTRY_NAME=Austria
DEST_COUNTRY_NAME=Azerbaijan
DEST_COUNTRY_NAME=Bahrain


## Bucketing
Bucketing can help avoid shuffles. You can choose the number of buckets so that each file is large enough for HDFS.

In [117]:
# bucketBy can only be used together with saveAsTable
num_buckets = 10
(df.write
 .bucketBy(num_buckets, 'DEST_COUNTRY_NAME')
 .saveAsTable('bucketedcsv', format='csv', mode='overwrite'))

In [118]:
! ls spark-warehouse/bucketedcsv

_SUCCESS
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00000.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00001.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00002.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00003.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00004.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00005.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00006.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00007.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00008.c000.csv
part-00000-1ce6aae1-501c-4eb1-b7e1-afa618512f68_00009.c000.csv


# Close Session

In [None]:
# Stop the session created at the top of the notebook.
spark.stop()