<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/Spark_IO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [3]:
import pyspark 
from pyspark.sql import SparkSession
sc = pyspark.SparkContext(appName="SQL_Operations")

spark = SparkSession.builder.getOrCreate()

In [4]:
df4 = spark.read.load('/content/spark-3.0.1-bin-hadoop2.7/examples/src/main/resources/people.json', format="json")

df4.select("name","age").write.save("namesAndAges.parquet", format="parquet")

In [5]:
df5 = spark.read.load('/content/spark-3.0.1-bin-hadoop2.7/examples/src/main/resources/people.csv',format="csv", sep=",", inferSchema="true", header="true")

In [6]:
df5.show()

+------------------+
|      name;age;job|
+------------------+
|Jorge;30;Developer|
|  Bob;32;Developer|
+------------------+



In [7]:
df6 = spark.read.load('/content/spark-3.0.1-bin-hadoop2.7/examples/src/main/resources/people.csv',format="csv", sep=";", inferSchema="true", header="true")

In [8]:
df6.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [11]:
# Run SQL onfiles directly
df6 = spark.sql("SELECT * FROM parquet.`namesAndAges.parquet`")

In [12]:
df6.show()

+-------+----+
|   name| age|
+-------+----+
|Michael|null|
|   Andy|  30|
| Justin|  19|
+-------+----+



In [13]:
peopleDF = spark.read.json('/content/spark-3.0.1-bin-hadoop2.7/examples/src/main/resources/people.json')

In [14]:
peopleDF.write.parquet("people.parquet")

In [15]:
parquetFile = spark.read.parquet("people.parquet")

In [16]:
parquetFile.createOrReplaceTempView("parquetFile")

In [17]:
teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")

In [18]:
teenagers.show()

+------+
|  name|
+------+
|Justin|
+------+



In [19]:
from pyspark.sql import Row

squaresDF = spark.createDataFrame(sc.parallelize(range(1, 6))
  .map(lambda i : Row(single=i, double = i ** 2)))

In [20]:
squaresDF.show()

+------+------+
|single|double|
+------+------+
|     1|     1|
|     2|     4|
|     3|     9|
|     4|    16|
|     5|    25|
+------+------+



In [21]:
squaresDF.write.parquet("data/test_table/key=1")

In [22]:
cubesDF = spark.createDataFrame(sc.parallelize(range(6, 11))
  .map(lambda i : Row(single=i, double = i ** 3)))

In [23]:
cubesDF.show()

+------+------+
|single|double|
+------+------+
|     6|   216|
|     7|   343|
|     8|   512|
|     9|   729|
|    10|  1000|
+------+------+



In [24]:
cubesDF.write.parquet("data/test_table/key=2")

In [25]:
mergedDf = spark.read.option("mergeSchema","true").parquet("data/test_table")

In [26]:
mergedDf.printSchema()

root
 |-- single: long (nullable = true)
 |-- double: long (nullable = true)
 |-- key: integer (nullable = true)



In [27]:
mergedDf.show()

+------+------+---+
|single|double|key|
+------+------+---+
|     8|   512|  2|
|     9|   729|  2|
|    10|  1000|  2|
|     3|     9|  1|
|     4|    16|  1|
|     5|    25|  1|
|     6|   216|  2|
|     7|   343|  2|
|     1|     1|  1|
|     2|     4|  1|
+------+------+---+



In [28]:
spark.stop()
sc.stop()