In [1]:
from pyspark.sql import SparkSession

# Configure a builder for a session named "myApp" on the "local[*]" master,
# then obtain the session (an already-running one is reused by getOrCreate).
session_builder = SparkSession.builder.master("local[*]").appName("myApp")
spark = session_builder.getOrCreate()

In [2]:
# Bare expression: the notebook displays the session object as the cell output.
spark

In [3]:
# Ask the session's underlying SparkContext for its default parallelism.
spark_context = spark.sparkContext
spark_context.defaultParallelism

2

In [4]:
# Reach the JVM-side SparkContext through the private `_jsc` handle and print
# the keys of its executor-memory-status map.
jvm_context = spark._jsc.sc()
memory_status = jvm_context.getExecutorMemoryStatus()
print(memory_status.keys())

Set(cb1defdb05d8:34039)


In [5]:
# Load the rivers CSV: the first row supplies the column names and Spark is
# asked to infer the column types. Then preview the first three rows.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("ItalianRivers.csv")
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     true|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     true|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [6]:
# Partition count of the freshly loaded DataFrame, read from its underlying RDD.
loaded_rdd = df.rdd
loaded_rdd.getNumPartitions()

1

In [7]:
# Persist the DataFrame as Parquet under "ir-1.parquet", replacing any
# previous output at that path.
df.write.save("ir-1.parquet", format="parquet", mode="overwrite")

In [8]:
# Rebind `df` to a copy redistributed over four partitions; the cells that
# follow operate on this repartitioned frame.
df = df.repartition(numPartitions=4)

In [9]:
# Confirm the partition count after the repartition call.
repartitioned_rdd = df.rdd
repartitioned_rdd.getNumPartitions()

4

In [10]:
# Write the repartitioned DataFrame as Parquet under "ir-4.parquet",
# replacing any previous output at that path.
df.write.parquet("ir-4.parquet", mode="overwrite")

In [11]:
from pyspark.sql.functions import spark_partition_id

# Add a "partition_id" column holding spark_partition_id() for each row,
# then list the rows ordered by river name.
df_with_partition = df.withColumn("partition_id", spark_partition_id())
df_with_partition.orderBy("Name").show()

+--------------+------+---------+--------------------+--------------+---------------+------------+
|          Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|partition_id|
+--------------+------+---------+--------------------+--------------+---------------+------------+
|          Adda|   313|    false|        Lecco, Milan|      Po River|             63|           3|
|         Adige|   410|     true|      Verona, Trento|  Adriatic Sea|            105|           0|
|          Agri|   136|     true|     Policoro, Tursi|    Ionian Sea|             14|           3|
|         Amato|    59|     true|Lamezia Terme, Ni...|Tyrrhenian Sea|              8|           3|
|        Aniene|    99|     true|     Subiaco, Tivoli|   Tiber River|             13|           2|
|          Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|           0|
|Aterno-Pescara|   152|     true|     Pescara, Chieti|  Adriatic Sea|             20|           0|
|  Bacchig

In [12]:
# Write the repartitioned DataFrame as CSV (with a header row) under
# "ir-4.csv", replacing any previous output at that path.
df.write.csv("ir-4.csv", mode="overwrite", header=True)

In [13]:
# Collapse to a single partition first, then write CSV with a header row to
# "ir-1.csv", replacing any previous output at that path.
single_partition_df = df.coalesce(1)
single_partition_df.write.csv("ir-1.csv", mode="overwrite", header=True)

In [14]:
import pandas as pd

# Convert the Spark DataFrame to a pandas DataFrame, then let pandas write it
# to a CSV file without the index column.
df_pandas = df.toPandas()
df_pandas.to_csv(path_or_buf="ir-pandas.csv", index=False)

In [15]:
# Write CSV with a header row to "ir-sea.csv", partitioning the output by the
# "Sea" column and replacing any previous output at that path.
sea_partitioned_writer = df.write.partitionBy("Sea")
sea_partitioned_writer.csv("ir-sea.csv", mode="overwrite", header=True)

In [16]:
# Overwrite "ir-1.csv" again with a single-partition copy of the data, so the
# path holds exactly one copy before anything is appended to it.
overwrite_writer = df.coalesce(1).write.option("header", True).mode("overwrite")
overwrite_writer.csv("ir-1.csv")

In [17]:
# Write the same single-partition data to "ir-1.csv" once more, this time in
# "append" mode instead of "overwrite".
appended_copy = df.coalesce(1)
appended_copy.write.csv("ir-1.csv", mode="append", header=True)

In [18]:
# Row count of the in-memory DataFrame, for comparison with the file on disk.
source_row_count = df.count()
source_row_count

72

In [19]:
# Read "ir-1.csv" back (header row, inferred types) and count its rows; the
# recorded result is twice df.count(), following the overwrite + append above.
readback_reader = spark.read.option("header", True).option("inferSchema", True)
dfn = readback_reader.csv("ir-1.csv")
dfn.count()

144

In [20]:
# Same write as the neighbouring cells, but with save mode "error".
# NOTE(review): presumably left disabled because "ir-1.csv" already exists at
# this point and Spark documents "error" mode as raising in that case; enable
# it only to demonstrate that failure.
# df.coalesce(1).write.format("csv").option("header", True).mode("error").save("ir-1.csv")

In [21]:
# Same single-partition CSV write to "ir-1.csv", now with save mode "ignore"
# (per the Spark docs, the write is skipped when the target already exists).
ignored_copy = df.coalesce(1)
ignored_copy.write.save("ir-1.csv", format="csv", mode="ignore", header=True)

In [22]:
# Load the Parquet output written from the four-partition DataFrame and
# preview its first three rows.
parquet_reader = spark.read
dfp = parquet_reader.parquet("ir-4.parquet")
dfp.show(n=3)

+--------+------+---------+--------------------+--------------+---------------+
|    Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+--------+------+---------+--------------------+--------------+---------------+
|    Agri|   136|     true|     Policoro, Tursi|    Ionian Sea|             14|
|Volturno|   175|     true|Capua, Castel Vol...|Tyrrhenian Sea|             30|
|    Adda|   313|    false|        Lecco, Milan|      Po River|             63|
+--------+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [23]:
# Partition count of the DataFrame read back from Parquet (it need not match
# the count used when the files were written).
parquet_rdd = dfp.rdd
parquet_rdd.getNumPartitions()

2

Reference: Spark SQL guide, Parquet files: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html
