In [1]:
# Install a headless Java 8 runtime for Spark (output silenced).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Fetch Spark 3.1.1 over HTTPS; -nc skips the download when the archive is
# already present, so re-running this cell does not create duplicate files.
!wget -q -nc https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# %pip installs into the environment of the running kernel.
%pip install -q findspark

In [2]:
import os
# Tell Spark which Java runtime to use and where the unpacked Spark
# distribution lives.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [3]:
# List the working directory to confirm the Spark archive was downloaded and unpacked.
!ls

sample_data  spark-3.1.1-bin-hadoop3.2	spark-3.1.1-bin-hadoop3.2.tgz


In [4]:
import findspark

# Make the Spark installation referenced by SPARK_HOME importable as pyspark.
findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a local session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Eager evaluation lets the notebook render a DataFrame as a table when it is
# the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

spark

In [6]:
# Read the JSON file without a user-defined schema; Spark infers one from the data.
# "header" is a CSV-only option that the JSON reader ignores, so it is not set here.
df = spark.read.json("/content/people.json")

In [7]:
df.show()
# No schema was supplied, so the columns come from schema inference and are
# listed in alphabetical order: `age` is shown before `name`.

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
df.printSchema()
# Print the inferred schema (column names, types and nullability).

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [23]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType,DecimalType,DateType,TimestampType,ArrayType
# Import the Spark SQL data types used to build custom (user-defined) schemas.

In [10]:
# Explicit schema for people.json: `name` first, then `age` as an integer.
# Both fields keep the default nullable=True.
dfjson_schema = (
    StructType()
    .add("name", StringType())
    .add("age", IntegerType())
)

In [11]:
# Read the same file again, this time applying the user-defined schema
# instead of relying on inference.
# The "header" option is CSV-only and ignored by the JSON reader, so it is omitted.
df_json = spark.read.schema(dfjson_schema).json("/content/people.json")

In [12]:
df_json.show()
# A schema was specified, so the columns follow the order of the schema
# definition (`name` first, then `age`) rather than alphabetical order.

+-------+----+
|   name| age|
+-------+----+
|Michael|null|
|   Andy|  30|
| Justin|  19|
+-------+----+



In [13]:
df_json.printSchema()
# The printed schema matches the user-defined one: `name` as string, `age` as integer.

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [17]:
# Schema with a nested struct: `address` holds the fields `r.no` and `city`.
# NOTE: the field name `r.no` contains a dot, so it has to be quoted with
# backticks when it is referenced in a column expression.
df_jsonschema2 = (
    StructType()
    .add("name", StringType())
    .add("age", IntegerType())
    .add(
        "address",
        StructType()
        .add("r.no", IntegerType())
        .add("city", StringType()),
    )
)

In [20]:
# Read people1.json with the nested schema applied.
# The "header" option is CSV-only and ignored by the JSON reader, so it is omitted.
jsonextrafield_df = (
    spark.read
    .schema(df_jsonschema2)
    .json("/content/people1.json")
)

In [21]:
jsonextrafield_df.printSchema()
# Print the schema; `address` is a struct nested inside the top-level struct.

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- r.no: integer (nullable = true)
 |    |-- city: string (nullable = true)



In [22]:
jsonextrafield_df.show()
# `address` is displayed as a single struct column holding its two fields
# (`r.no` and `city`); rows whose address is null print `null`.

+-------+----+----------+
|   name| age|   address|
+-------+----+----------+
|Michael|null|      null|
|   Andy|  30|      null|
| Justin|  19|      null|
|  Candy|  20|{12, pune}|
+-------+----+----------+



In [29]:
# Schema that adds `hobbies`, an array of strings, next to the nested
# `address` struct (`r.no`, `city`).
jsonschema_arraytype = (
    StructType()
    .add("name", StringType())
    .add("age", IntegerType())
    .add("hobbies", ArrayType(StringType()))
    .add(
        "address",
        StructType()
        .add("r.no", IntegerType())
        .add("city", StringType()),
    )
)

In [31]:
# Read people2.json with the schema that contains the array column.
# The "header" option is CSV-only and ignored by the JSON reader, so it is omitted.
jsonarraytype_df = (
    spark.read
    .schema(jsonschema_arraytype)
    .json("/content/people2.json")
)

In [32]:
jsonarraytype_df.printSchema()
# Print the schema; `hobbies` is an array column whose elements are strings.

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- address: struct (nullable = true)
 |    |-- r.no: integer (nullable = true)
 |    |-- city: string (nullable = true)



In [34]:
jsonarraytype_df.show(truncate=False)
# truncate=False prints the full content of each cell, including the whole hobbies list.
# An ArrayType column has a single element type, so one list cannot mix data types.

+-------+----+-----------------------------+----------+
|name   |age |hobbies                      |address   |
+-------+----+-----------------------------+----------+
|Michael|null|null                         |null      |
|Andy   |30  |null                         |null      |
|Justin |19  |[cricket, newspaper, walking]|null      |
|Candy  |20  |null                         |{12, pune}|
+-------+----+-----------------------------+----------+

