In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one machine; fall back to the original local install path
# when SPARK_HOME is unset or empty.
# findspark.init() has to run before the pyspark imports below.
findspark.init(os.environ.get('SPARK_HOME') or '/home/shashank/spark-2.3.2-bin-hadoop2.7')

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Get the active SparkSession, or create one named 'Basics' if none exists yet.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [3]:
# Load the JSON file into a DataFrame. No schema is passed here, so Spark infers
# the column types from the data.
# Author's note: Spark doesn't like spaces in file paths - use "_" instead.
df = spark.read.json('Spark_DataFrames/people.json')

In [4]:
# Print the first 3 rows as a formatted text table.
df.show(3)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Return the first 3 rows as a list of Row objects (instead of printing a table).
df.head(3)

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [6]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [7]:
# `columns` is an attribute, not a method - no parentheses. It holds the list of column names.
df.columns

['age', 'name']

In [8]:
# describe() returns a new DataFrame of summary statistics; on its own the cell only
# shows that DataFrame's repr. Call .show() on the result to see the values.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [9]:
# Display the summary statistics (count, mean, stddev, min, max) for each column.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



# Schema

In [10]:
#Type tools
from pyspark.sql.types import (StructField, StringType, 
                               IntegerType, StructType)

In [11]:
# Declare the column types explicitly: 'age' was inferred as long (see the
# printSchema() output above) and is declared as integer here.
# The third StructField argument (True) marks the column as nullable, i.e. it is
# okay for some values to be null.
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

In [12]:
# Wrap the list of StructFields in a StructType - the schema object handed to the reader.
final_struct = StructType(data_schema)

In [13]:
# Re-read the same file, this time with the explicit schema instead of letting
# Spark infer one. This rebinds `df`, replacing the DataFrame loaded earlier.
df = spark.read.json('Spark_DataFrames/people.json', schema=final_struct)

In [14]:
# Check the schema again to confirm that 'age' is now an integer column.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



The documentation does not do a great job of explaining this process. Always check the inferred schema using <br>
    **df.printSchema()** <br>
If the schema seems off, declare the correct column types as a list of **StructField** objects and wrap that list in a **StructType** <br>
<br>
Then pass that schema object via the `schema` argument when reading the file. 