In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the already-running Spark session if there is one; otherwise start it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the zip-code records into a DataFrame through the generic reader API.
df = spark.read.format("json").load("simple_zipcodes.json")

In [4]:
df.show() # show() prints up to 20 rows by default; this DataFrame has only 10 rows, so all of them appear

+-------------------+-----+-----------+-------+
|               City|State|ZipCodeType|Zipcode|
+-------------------+-----+-----------+-------+
|        PARC PARQUE|   PR|   STANDARD|    704|
|PASEO COSTA DEL SUR|   PR|   STANDARD|    704|
|       BDA SAN LUIS|   PR|   STANDARD|    709|
|  CINGULAR WIRELESS|   TX|     UNIQUE|  76166|
|         FORT WORTH|   TX|   STANDARD|  76177|
|           FT WORTH|   TX|   STANDARD|  76177|
|    URB EUGENE RICE|   PR|   STANDARD|    704|
|               MESA|   AZ|   STANDARD|  85209|
|               MESA|   AZ|   STANDARD|  85210|
|           HILLIARD|   FL|   STANDARD|  32046|
+-------------------+-----+-----------+-------+



In [5]:
# NOTE(review): collect() hands back every row at once -- fine for this 10-row
# frame, but prefer take(n) or show() when the DataFrame may be large.
df.collect() # collect() returns all records as a list of Row objects

[Row(City='PARC PARQUE', State='PR', ZipCodeType='STANDARD', Zipcode=704),
 Row(City='PASEO COSTA DEL SUR', State='PR', ZipCodeType='STANDARD', Zipcode=704),
 Row(City='BDA SAN LUIS', State='PR', ZipCodeType='STANDARD', Zipcode=709),
 Row(City='CINGULAR WIRELESS', State='TX', ZipCodeType='UNIQUE', Zipcode=76166),
 Row(City='FORT WORTH', State='TX', ZipCodeType='STANDARD', Zipcode=76177),
 Row(City='FT WORTH', State='TX', ZipCodeType='STANDARD', Zipcode=76177),
 Row(City='URB EUGENE RICE', State='PR', ZipCodeType='STANDARD', Zipcode=704),
 Row(City='MESA', State='AZ', ZipCodeType='STANDARD', Zipcode=85209),
 Row(City='MESA', State='AZ', ZipCodeType='STANDARD', Zipcode=85210),
 Row(City='HILLIARD', State='FL', ZipCodeType='STANDARD', Zipcode=32046)]

In [6]:
df.take(5) # take(n) returns the first n rows as a list of Row objects; the row count must be given explicitly

[Row(City='PARC PARQUE', State='PR', ZipCodeType='STANDARD', Zipcode=704),
 Row(City='PASEO COSTA DEL SUR', State='PR', ZipCodeType='STANDARD', Zipcode=704),
 Row(City='BDA SAN LUIS', State='PR', ZipCodeType='STANDARD', Zipcode=709),
 Row(City='CINGULAR WIRELESS', State='TX', ZipCodeType='UNIQUE', Zipcode=76166),
 Row(City='FORT WORTH', State='TX', ZipCodeType='STANDARD', Zipcode=76177)]

In [7]:
# Print each column's name, data type, and nullability as a tree.
df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- Zipcode: long (nullable = true)



In [8]:
df.count() # returns the total number of rows in the DataFrame

10

In [9]:
# select() projects the requested column; show() prints the result.
df.select(df["City"]).show()

+-------------------+
|               City|
+-------------------+
|        PARC PARQUE|
|PASEO COSTA DEL SUR|
|       BDA SAN LUIS|
|  CINGULAR WIRELESS|
|         FORT WORTH|
|           FT WORTH|
|    URB EUGENE RICE|
|               MESA|
|               MESA|
|           HILLIARD|
+-------------------+



In [10]:
# Several columns can be projected at once; here they are passed as one list.
df.select(["City", "State"]).show()

+-------------------+-----+
|               City|State|
+-------------------+-----+
|        PARC PARQUE|   PR|
|PASEO COSTA DEL SUR|   PR|
|       BDA SAN LUIS|   PR|
|  CINGULAR WIRELESS|   TX|
|         FORT WORTH|   TX|
|           FT WORTH|   TX|
|    URB EUGENE RICE|   PR|
|               MESA|   AZ|
|               MESA|   AZ|
|           HILLIARD|   FL|
+-------------------+-----+



In [11]:
# Keep only the rows whose Zipcode is strictly greater than 704.
df.where(df["Zipcode"] > 704).show()

+-----------------+-----+-----------+-------+
|             City|State|ZipCodeType|Zipcode|
+-----------------+-----+-----------+-------+
|     BDA SAN LUIS|   PR|   STANDARD|    709|
|CINGULAR WIRELESS|   TX|     UNIQUE|  76166|
|       FORT WORTH|   TX|   STANDARD|  76177|
|         FT WORTH|   TX|   STANDARD|  76177|
|             MESA|   AZ|   STANDARD|  85209|
|             MESA|   AZ|   STANDARD|  85210|
|         HILLIARD|   FL|   STANDARD|  32046|
+-----------------+-----+-----------+-------+



In [12]:
# Project the Zipcode column, then keep the rows matching the SQL predicate.
(
    df.select("Zipcode")
    .where("Zipcode = 704")
    .show()
)

+-------+
|Zipcode|
+-------+
|    704|
|    704|
|    704|
+-------+



In [13]:
# Project the City column, then keep the cities whose name starts with "M".
(
    df.select("City")
    .where("City like 'M%' ")
    .show()
)

+----+
|City|
+----+
|MESA|
|MESA|
+----+



In [14]:
# Order the rows by State (ascending) before printing them.
df.orderBy("State").show()

+-------------------+-----+-----------+-------+
|               City|State|ZipCodeType|Zipcode|
+-------------------+-----+-----------+-------+
|               MESA|   AZ|   STANDARD|  85209|
|               MESA|   AZ|   STANDARD|  85210|
|           HILLIARD|   FL|   STANDARD|  32046|
|       BDA SAN LUIS|   PR|   STANDARD|    709|
|        PARC PARQUE|   PR|   STANDARD|    704|
|    URB EUGENE RICE|   PR|   STANDARD|    704|
|PASEO COSTA DEL SUR|   PR|   STANDARD|    704|
|  CINGULAR WIRELESS|   TX|     UNIQUE|  76166|
|         FORT WORTH|   TX|   STANDARD|  76177|
|           FT WORTH|   TX|   STANDARD|  76177|
+-------------------+-----+-----------+-------+



In [15]:
# Summary statistics (count, mean, stddev, min, max) for the Zipcode column.
df.select("Zipcode").describe().show()

+-------+-----------------+
|summary|          Zipcode|
+-------+-----------------+
|  count|               10|
|   mean|          43380.6|
| stddev|39635.30844027274|
|    min|              704|
|    max|            85210|
+-------+-----------------+



In [16]:
# A DataFrame has no .min() method (calling it raises AttributeError:
# 'DataFrame' object has no attribute 'min'). Column aggregates go through
# agg(); this prints a one-row frame holding the smallest Zipcode, which
# should agree with the `min` row reported by describe("Zipcode").
df.agg({"Zipcode": "min"}).show()

In [17]:
# Column names, as a plain Python list of strings.
df.columns

['City', 'State', 'ZipCodeType', 'Zipcode']

In [18]:
# (column name, type name) pairs. Note that Zipcode is reported as 'bigint'
# here, whereas printSchema() labels the same column 'long'.
df.dtypes

[('City', 'string'),
 ('State', 'string'),
 ('ZipCodeType', 'string'),
 ('Zipcode', 'bigint')]

In [19]:
# describe() accepts only column names -- it has no `deciles` keyword (passing
# one raises TypeError). summary() takes the statistics to compute as labels,
# including percentiles written as "NN%", so the deciles are requested
# explicitly alongside the usual statistics. Percentiles computed by summary()
# are approximate.
df.select("Zipcode").summary(
    "count", "mean", "stddev", "min",
    "10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%",
    "max",
).show()