# First PYSpark DataFrame Creation

In this file we 
* Create a SparkSession object
* Download a csv file from the web
* Read the csv as a PySpark DataFrame
* View the top 20 rows of the PySpark DataFrame
* Write the PySpark DF as a parquet file to a folder zones (with default partition 1)

In [1]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [22]:
from decouple import config, AutoConfig
import os
import requests
from datetime import datetime, timedelta

In [14]:
os.getcwd()

'/home/sanyashireen/sf_eviction'

In [19]:
config = AutoConfig(search_path='.env') # <-- .env file located next to manage.py
API_TOKEN = config("API_TOKEN")
API_KEY_ID = config("API_KEY_ID")
API_KEY_SECRET = config("API_KEY_SECRET")

In [20]:
# download the json by supplying the api token in the header
def get_json(endpoint, headers):
    """Calls API, requests all created & updated records >/= 180 days."""
    headers['Accept'] = 'application/json'
    pull_date = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%dT%H:%M:%S")
    combined = []
    offset, counter = 0, 1
    error = False
    while True:
        params = f"""$query=SELECT:*,* WHERE :created_at >= '{pull_date}' OR :updated_at >= '{pull_date}' 
                 ORDER BY :id LIMIT 1500"""
        response = requests.get(endpoint, headers=headers, params=params)
        if response.status_code != 200:
            error = f'api_request-endpoint|{endpoint}|params|{params}|'
            break
        captured = response.json()
        if len(captured) == 0:
            break
        combined.extend(captured)
        offset = 10000 * counter
        counter += 1
    if error:
        log_exit(filename=error, api_error=response.status_code)
        return -1, -1

    # params = f"""$query=SELECT:*,* WHERE :created_at >= '{pull_date}' OR :updated_at >= '{pull_date}'
    #                     ORDER BY :id LIMIT 1500"""
    # response = requests.get(endpoint, headers=headers, params=params)
    # captured = response.json()
    # combined.extend(captured)

    metadata = parse_metadata(response.headers)
    print('get_json complete')
    return metadata, combined

In [None]:
SODA_url = 'https://data.sfgov.org/resource/5cei-gny5'
SODA_headers = {
    'keyId': API_KEY_ID,
    'keySecret': API_KEY_SECRET
}
head, content = get_json(SODA_url, SODA_headers)

In [4]:
# How we connect to spark locally with all available resources
# Create PySpark SparkSession
spark = SparkSession.builder.master("local[*]").appName('test').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/08 01:58:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
data_filename = '5cei-gny5.json'

In [12]:
# Read the csv as a PySpark DF object
df = spark.read.json(data_filename)

In [13]:
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)



In [7]:
# View the top 20 rows of the PySpark DF
df.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [8]:
# Use Spark and write the PySpark DF to the folder 'zones' as a parquet file where it will be written as partitons
# if number of partitions is not defined the default paritions in 1
df.write.parquet('zones')

                                                                                

In [11]:
# We can see the folder zones was created to write the parquet file into
!ls -lh

total 28K
-rw-rw-r-- 1 sanyashireen sanyashireen 6.8K Feb 22 01:52 Untitled.ipynb
-rw-rw-r-- 1 sanyashireen sanyashireen  13K Aug 17  2016 taxi+_zone_lookup.csv
drwxr-xr-x 2 sanyashireen sanyashireen 4.0K Feb 22 01:54 zones
