## Downloading data as json

In [10]:
from decouple import config, AutoConfig
from datetime import datetime, timedelta
import os
import requests
from pathlib import Path

In [11]:
config = AutoConfig(search_path='.env') # <-- .env file located next to manage.py
API_TOKEN = config("API_TOKEN")
API_KEY_ID = config("API_KEY_ID")
API_KEY_SECRET = config("API_KEY_SECRET")

In [12]:
# download the json by supplying the api token in the header
def get_json(endpoint, headers):
    """Calls API, requests all created & updated records >/= 180 days."""
    headers['Accept'] = 'application/json' # csv?
    # pull_date = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%dT%H:%M:%S") # year, month, day, hour, minute, seconds, microseconds
    combined = []
    offset, counter = 0, 1
    error = False
    params = f"""$query=SELECT:*,* ORDER BY :id LIMIT 200000"""
    # response has two parts .json() and .headers https://www.w3schools.com/python/ref_requests_response.asp
    response = requests.get(endpoint, headers=headers, params=params)
    captured = response.json()
    combined.extend(captured)
    print('get_json complete')
    return combined

In [13]:
# Sodu API Credentials
API_TOKEN = config("API_TOKEN")
API_KEY_ID = config("API_KEY_ID")
API_KEY_SECRET = config("API_KEY_SECRET")

source_path_json = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/25/api_raw_eviction_2023-03-25.json'
data_dir = Path('/home/sanyashireen/sf_eviction/data_eviction/2023/3/25')
data_dir.mkdir(parents=True, exist_ok=True)

SODA_url = 'https://data.sfgov.org/resource/5cei-gny5'
SODA_headers = {
    'keyId': API_KEY_ID,
    'keySecret': API_KEY_SECRET
}
content = get_json(SODA_url, SODA_headers)


get_json complete


In [14]:
type(content)

list

In [15]:
print(type(content[1]))

<class 'dict'>


In [11]:
# Extracting all column names as the first row might not contain all the columns
keys = set().union(*(d.keys() for d in content))

In [36]:
print(keys)

{':@computed_region_p5aj_wyqh', ':@computed_region_bh8s_q3mv', ':@computed_region_h4ep_8xdi', 'client_location', 'neighborhood', ':@computed_region_9jxd_iqea', 'shape', 'non_payment', 'nuisance', 'breach', 'city', ':@computed_region_qgnn_b9vv', 'address', 'demolition', ':@computed_region_jwn9_ihcz', 'illegal_use', 'access_denial', 'other_cause', 'eviction_id', 'state', 'substantial_rehab', 'development', 'constraints_date', ':@computed_region_rxqg_mtj9', ':@computed_region_26cr_cadq', 'roommate_same_unit', ':@computed_region_6qbp_sg9q', ':@computed_region_pigm_ib2e', ':@computed_region_6ezc_tdp2', 'file_date', ':id', ':@computed_region_ajp5_b2md', 'ellis_act_withdrawal', 'good_samaritan_ends', 'lead_remediation', 'unapproved_subtenant', 'capital_improvement', 'condo_conversion', 'late_payments', ':created_at', ':version', ':updated_at', 'zip', 'supervisor_district', 'owner_move_in', ':@computed_region_6pnf_4xz7', ':@computed_region_fyvs_ahh9', ':@computed_region_yftq_j783', 'failure_to

In [14]:
# Writing list of dict as csv to local
# Ref: https://stackoverflow.com/questions/3086973/how-do-i-convert-this-list-of-dictionaries-to-a-csv-file
import csv
with open(source_path_csv, 'w', encoding='utf8', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(content)

In [37]:
with open(source_path_csv, 'w', encoding='utf8', newline='') as output_file:
    fc = csv.DictWriter(output_file, 
                        fieldnames=keys,

                       )
    fc.writeheader()
    fc.writerows(content)

## csv is corrupting the data so trying to write the data to pyspark directly

In [18]:
import json
source_path_json = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/25/api_raw_eviction_2023-03-25.json'
s = json.dumps(content, indent=4)
open(source_path_json,"w", encoding='utf8').write(s)

354456664

## Testing for csv data from api

In [19]:
#spark.stop()

In [20]:
# Creating and testing script to clean and transform csv data using pyspark
# spark related packages
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()
#spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

In [None]:
from pyspark.sql import types
schema = types.StructType([
    types.StructField('eviction_id', types.StringType(), True), 
    types.StructField('address', types.StringType(), True), 
    types.StructField('city', types.StringType(), True), 
    types.StructField('state', types.StringType(), True), 
    types.StructField('zip', types.IntegerType(), True), 
    types.StructField('file_date', types.DateType(), True), 
    types.StructField('non_payment', types.BooleanType(), True), 
    types.StructField('breach', types.BooleanType(), True), 
    types.StructField('nuisance', types.BooleanType(), True), 
    types.StructField('illegal_use', types.BooleanType(), True), 
    types.StructField('failure_to_sign_renewal', types.BooleanType(), True), 
    types.StructField('access_denial', types.BooleanType(), True), 
    types.StructField('unapproved_subtenant', types.BooleanType(), True), 
    types.StructField('owner_move_in', types.BooleanType(), True), 
    types.StructField('demolition', types.BooleanType(), True), 
    types.StructField('capital_improvement', types.BooleanType(), True), 
    types.StructField('substantial_rehab', types.BooleanType(), True), 
    types.StructField('ellis_act_withdrawal', types.BooleanType(), True), 
    types.StructField('condo_conversion', types.BooleanType(), True), 
    types.StructField('roommate_same_unit', types.BooleanType(), True), 
    types.StructField('other_cause', types.BooleanType(), True), 
    types.StructField('late_payments', types.BooleanType(), True), 
    types.StructField('lead_remediation', types.BooleanType(), True), 
    types.StructField('development', types.BooleanType(), True), 
    types.StructField('good_samaritan_ends', types.BooleanType(), True), 
    types.StructField('constraints_date', types.StringType(), True), 
    types.StructField('supervisor_district', types.IntegerType(), True), 
    types.StructField('neighborhood', types.StringType(), True), 
    types.StructField('client_location', types.StringType(), True)])

In [27]:
df = spark.read.option("multiline","true").json(str(source_path_json))

                                                                                

In [17]:
df.rdd.getNumPartitions

<bound method RDD.getNumPartitions of MapPartitionsRDD[14] at javaToPython at NativeMethodAccessorImpl.java:0>

In [28]:
df.count()

                                                                                

177578

In [29]:
df.printSchema()

root
 |-- :@computed_region_26cr_cadq: string (nullable = true)
 |-- :@computed_region_6ezc_tdp2: string (nullable = true)
 |-- :@computed_region_6pnf_4xz7: string (nullable = true)
 |-- :@computed_region_6qbp_sg9q: string (nullable = true)
 |-- :@computed_region_9jxd_iqea: string (nullable = true)
 |-- :@computed_region_ajp5_b2md: string (nullable = true)
 |-- :@computed_region_bh8s_q3mv: string (nullable = true)
 |-- :@computed_region_fyvs_ahh9: string (nullable = true)
 |-- :@computed_region_h4ep_8xdi: string (nullable = true)
 |-- :@computed_region_jwn9_ihcz: string (nullable = true)
 |-- :@computed_region_p5aj_wyqh: string (nullable = true)
 |-- :@computed_region_pigm_ib2e: string (nullable = true)
 |-- :@computed_region_qgnn_b9vv: string (nullable = true)
 |-- :@computed_region_rxqg_mtj9: string (nullable = true)
 |-- :@computed_region_yftq_j783: string (nullable = true)
 |-- :created_at: string (nullable = true)
 |-- :id: string (nullable = true)
 |-- :updated_at: string (nullab

TypeError: 'StructType' object is not callable

In [30]:
df.head(3)

23/03/25 20:39:14 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

[Row(:@computed_region_26cr_cadq='2', :@computed_region_6ezc_tdp2=None, :@computed_region_6pnf_4xz7='2', :@computed_region_6qbp_sg9q='53', :@computed_region_9jxd_iqea=None, :@computed_region_ajp5_b2md='20', :@computed_region_bh8s_q3mv='28859', :@computed_region_fyvs_ahh9='19', :@computed_region_h4ep_8xdi=None, :@computed_region_jwn9_ihcz='53', :@computed_region_p5aj_wyqh='4', :@computed_region_pigm_ib2e=None, :@computed_region_qgnn_b9vv='3', :@computed_region_rxqg_mtj9='7', :@computed_region_yftq_j783='2', :created_at='2023-01-23T23:46:16.858Z', :id='row-ufzj_22gk~drek', :updated_at='2023-01-23T23:46:33.925Z', :version='rv-wryr~aezj~xte7', access_denial=False, address='2500 Block Of Folsom  Street', breach=False, capital_improvement=False, city='San Francisco', client_location=Row(human_address='{"address": "", "city": "", "state": "", "zip": ""}', latitude='37.75649855484188', longitude='-122.41446453496935'), condo_conversion=False, constraints_date=None, demolition=False, developmen

In [31]:
df.columns

[':@computed_region_26cr_cadq',
 ':@computed_region_6ezc_tdp2',
 ':@computed_region_6pnf_4xz7',
 ':@computed_region_6qbp_sg9q',
 ':@computed_region_9jxd_iqea',
 ':@computed_region_ajp5_b2md',
 ':@computed_region_bh8s_q3mv',
 ':@computed_region_fyvs_ahh9',
 ':@computed_region_h4ep_8xdi',
 ':@computed_region_jwn9_ihcz',
 ':@computed_region_p5aj_wyqh',
 ':@computed_region_pigm_ib2e',
 ':@computed_region_qgnn_b9vv',
 ':@computed_region_rxqg_mtj9',
 ':@computed_region_yftq_j783',
 ':created_at',
 ':id',
 ':updated_at',
 ':version',
 'access_denial',
 'address',
 'breach',
 'capital_improvement',
 'city',
 'client_location',
 'condo_conversion',
 'constraints_date',
 'demolition',
 'development',
 'ellis_act_withdrawal',
 'eviction_id',
 'failure_to_sign_renewal',
 'file_date',
 'good_samaritan_ends',
 'illegal_use',
 'late_payments',
 'lead_remediation',
 'neighborhood',
 'non_payment',
 'nuisance',
 'other_cause',
 'owner_move_in',
 'roommate_same_unit',
 'shape',
 'state',
 'substantial_r

In [32]:
len(df.columns)

49

In [33]:
print(f'The pyspark df is a list of records where each record is of type {type(df.head(1)[0])} \n\n {df.head(1)[0]}')

[Stage 14:>                                                         (0 + 1) / 1]

The pyspark df is a list of records where each record is of type <class 'pyspark.sql.types.Row'> 

 Row(:@computed_region_26cr_cadq='2', :@computed_region_6ezc_tdp2=None, :@computed_region_6pnf_4xz7='2', :@computed_region_6qbp_sg9q='53', :@computed_region_9jxd_iqea=None, :@computed_region_ajp5_b2md='20', :@computed_region_bh8s_q3mv='28859', :@computed_region_fyvs_ahh9='19', :@computed_region_h4ep_8xdi=None, :@computed_region_jwn9_ihcz='53', :@computed_region_p5aj_wyqh='4', :@computed_region_pigm_ib2e=None, :@computed_region_qgnn_b9vv='3', :@computed_region_rxqg_mtj9='7', :@computed_region_yftq_j783='2', :created_at='2023-01-23T23:46:16.858Z', :id='row-ufzj_22gk~drek', :updated_at='2023-01-23T23:46:33.925Z', :version='rv-wryr~aezj~xte7', access_denial=False, address='2500 Block Of Folsom  Street', breach=False, capital_improvement=False, city='San Francisco', client_location=Row(human_address='{"address": "", "city": "", "state": "", "zip": ""}', latitude='37.75649855484188', longitude=

                                                                                

In [7]:
# from web data
# Extracting column names to decide which ones are of interest
# Convert type row to dictionary
#row_as_dict = df.head(1)[0].asDict()

# Column Names
#print(list(row_as_dict.keys()))

# Sample Column and Value
#for key, value in row_as_dict.items():
    #print(f'{key}={value}')


['eviction_id', 'address', 'city', 'state', 'zip', 'file_date', 'non_payment', 'breach', 'nuisance', 'illegal_use', 'failure_to_sign_renewal', 'access_denial', 'unapproved_subtenant', 'owner_move_in', 'demolition', 'capital_improvement', 'substantial_rehab', 'ellis_act_withdrawal', 'condo_conversion', 'roommate_same_unit', 'other_cause', 'late_payments', 'lead_remediation', 'development', 'good_samaritan_ends', 'constraints_date', 'supervisor_district', 'neighborhood', 'client_location', 'shape', ':@computed_region_6qbp_sg9q', ':@computed_region_qgnn_b9vv', ':@computed_region_26cr_cadq', ':@computed_region_ajp5_b2md', ':@computed_region_fyvs_ahh9', ':@computed_region_p5aj_wyqh', ':@computed_region_rxqg_mtj9', ':@computed_region_yftq_j783', ':@computed_region_bh8s_q3mv', ':@computed_region_9jxd_iqea', ':@computed_region_6ezc_tdp2', ':@computed_region_6pnf_4xz7', ':@computed_region_h4ep_8xdi', ':@computed_region_pigm_ib2e', ':@computed_region_jwn9_ihcz']
eviction_id=M222138
address=400 B

In [34]:
# Extracting column names to decide which ones are of interest
# Convert type row to dictionary
row_as_dict = df.head(1)[0].asDict()

# Column Names
print(list(row_as_dict.keys()))

# Sample Column and Value
for key, value in row_as_dict.items():
    print(f'{key}={value}')
    
# new cols updated_at, id, created_at, version  missing when extracting from the URL

[Stage 15:>                                                         (0 + 1) / 1]

[':@computed_region_26cr_cadq', ':@computed_region_6ezc_tdp2', ':@computed_region_6pnf_4xz7', ':@computed_region_6qbp_sg9q', ':@computed_region_9jxd_iqea', ':@computed_region_ajp5_b2md', ':@computed_region_bh8s_q3mv', ':@computed_region_fyvs_ahh9', ':@computed_region_h4ep_8xdi', ':@computed_region_jwn9_ihcz', ':@computed_region_p5aj_wyqh', ':@computed_region_pigm_ib2e', ':@computed_region_qgnn_b9vv', ':@computed_region_rxqg_mtj9', ':@computed_region_yftq_j783', ':created_at', ':id', ':updated_at', ':version', 'access_denial', 'address', 'breach', 'capital_improvement', 'city', 'client_location', 'condo_conversion', 'constraints_date', 'demolition', 'development', 'ellis_act_withdrawal', 'eviction_id', 'failure_to_sign_renewal', 'file_date', 'good_samaritan_ends', 'illegal_use', 'late_payments', 'lead_remediation', 'neighborhood', 'non_payment', 'nuisance', 'other_cause', 'owner_move_in', 'roommate_same_unit', 'shape', 'state', 'substantial_rehab', 'supervisor_district', 'unapproved_sub

                                                                                

In [42]:
# Extracting column names to decide which ones are of interest
# Convert type row to dictionary
row_as_dict = df.head(1)[0].asDict()

# Column Names
print(list(row_as_dict.keys()))

# Sample Column and Value
for key, value in row_as_dict.items():
    print(f'{key}={value}')
    
# new cols updated_at, id, created_at, version  missing when extracting from the URL

[':@computed_region_p5aj_wyqh', ':@computed_region_bh8s_q3mv', ':@computed_region_h4ep_8xdi', 'client_location', 'neighborhood', ':@computed_region_9jxd_iqea', 'shape', 'non_payment', 'nuisance', 'breach', 'city', ':@computed_region_qgnn_b9vv', 'address', 'demolition', ':@computed_region_jwn9_ihcz', 'illegal_use', 'access_denial', 'other_cause', 'eviction_id', 'state', 'substantial_rehab', 'development', 'constraints_date', ':@computed_region_rxqg_mtj9', ':@computed_region_26cr_cadq', 'roommate_same_unit', ':@computed_region_6qbp_sg9q', ':@computed_region_pigm_ib2e', ':@computed_region_6ezc_tdp2', 'file_date', ':id', ':@computed_region_ajp5_b2md', 'ellis_act_withdrawal', 'good_samaritan_ends', 'lead_remediation', 'unapproved_subtenant', 'capital_improvement', 'condo_conversion', 'late_payments', ':created_at', ':version', ':updated_at', 'zip', 'supervisor_district', 'owner_move_in', ':@computed_region_6pnf_4xz7', ':@computed_region_fyvs_ahh9', ':@computed_region_yftq_j783', 'failure_to

## Analysis
* As we can see the columns of interest are not all columns.
* The last columns name `@computed_region*` is not of interest so we will ignore those columns.
* And will extract only those columns from the raw dataset
* Column `client_location` and `shape` hold the same data 

In [24]:
imp_cols = ['eviction_id', 'address', 'city', 'state', 'zip', 'file_date', 'non_payment', 'breach', 'nuisance', 'illegal_use', 'failure_to_sign_renewal', 'access_denial', 'unapproved_subtenant', 'owner_move_in', 'demolition', 'capital_improvement', 'substantial_rehab', 'ellis_act_withdrawal', 'condo_conversion', 'roommate_same_unit', 'other_cause', 'late_payments', 'lead_remediation', 'development', 'good_samaritan_ends', 'constraints_date', 'supervisor_district', 'neighborhood', 'client_location']


In [25]:
# new df with only selected columns 
df_clean = df.select(imp_cols)
df_clean.show(5)

+-----------+-------+-----+-----+--------------------+---------+---------------+--------------------+--------+--------------------+-----------------------+-------------+--------------------+--------------------+-------------+-------------------+-----------------+--------------------+----------------+------------------+-----------+-------------+----------------+-----------+-------------------+----------------+-------------------+---------------+--------------------+
|eviction_id|address| city|state|                 zip|file_date|    non_payment|              breach|nuisance|         illegal_use|failure_to_sign_renewal|access_denial|unapproved_subtenant|       owner_move_in|   demolition|capital_improvement|substantial_rehab|ellis_act_withdrawal|condo_conversion|roommate_same_unit|other_cause|late_payments|lead_remediation|development|good_samaritan_ends|constraints_date|supervisor_district|   neighborhood|     client_location|
+-----------+-------+-----+-----+--------------------+------

In [26]:
df_clean.printSchema()

root
 |-- eviction_id: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- file_date: string (nullable = true)
 |-- non_payment: string (nullable = true)
 |-- breach: string (nullable = true)
 |-- nuisance: string (nullable = true)
 |-- illegal_use: string (nullable = true)
 |-- failure_to_sign_renewal: string (nullable = true)
 |-- access_denial: string (nullable = true)
 |-- unapproved_subtenant: string (nullable = true)
 |-- owner_move_in: string (nullable = true)
 |-- demolition: string (nullable = true)
 |-- capital_improvement: string (nullable = true)
 |-- substantial_rehab: string (nullable = true)
 |-- ellis_act_withdrawal: string (nullable = true)
 |-- condo_conversion: string (nullable = true)
 |-- roommate_same_unit: string (nullable = true)
 |-- other_cause: string (nullable = true)
 |-- late_payments: string (nullable = true)
 |-- lead_remediation: 

## Convert few rows to pandas df and extract the schema

In [28]:
import pandas as p
pandas_df = p.read_csv(source_path_csv, nrows=100000, usecols=imp_cols)

In [12]:
# pandas for json data
import pandas as p
pandas_df = p.read_json(source_path_json)

In [14]:
pandas_df.shape

(177578, 49)

In [15]:
pandas_df.head(10)

Unnamed: 0,:id,:created_at,:updated_at,:version,:@computed_region_6qbp_sg9q,:@computed_region_qgnn_b9vv,:@computed_region_26cr_cadq,:@computed_region_ajp5_b2md,:@computed_region_fyvs_ahh9,:@computed_region_p5aj_wyqh,...,good_samaritan_ends,supervisor_district,neighborhood,client_location,shape,:@computed_region_9jxd_iqea,:@computed_region_6ezc_tdp2,:@computed_region_h4ep_8xdi,:@computed_region_pigm_ib2e,constraints_date
0,row-ufzj_22gk~drek,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-wryr~aezj~xte7,53.0,3.0,2.0,20.0,19.0,4.0,...,False,9.0,Mission,"{'latitude': '37.75649855484188', 'longitude':...","{'type': 'Point', 'coordinates': [-122.41447, ...",,,,,
1,row-k4ar_jqfk~qf9p,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-jrf5.m8q7.bb2j,55.0,2.0,9.0,26.0,29.0,3.0,...,False,10.0,Potrero Hill,"{'latitude': '37.7636861675663', 'longitude': ...","{'type': 'Point', 'coordinates': [-122.38873, ...",14.0,,,,
2,row-g7er.wz7c~98am,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-kub4-qfcm-vv5s,20.0,6.0,10.0,36.0,36.0,1.0,...,False,3.0,Tenderloin,"{'latitude': '37.78620563104688', 'longitude':...","{'type': 'Point', 'coordinates': [-122.41727, ...",,,,,
3,row-jvdz~8ne5_agq5,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-t6vv~sqbg.fcxv,55.0,2.0,9.0,26.0,29.0,3.0,...,False,10.0,Potrero Hill,"{'latitude': '37.7636861675663', 'longitude': ...","{'type': 'Point', 'coordinates': [-122.38873, ...",14.0,,,,
4,row-er7z~7fk9~55mx,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-8f6e~9dpn.id6z,39.0,10.0,7.0,35.0,35.0,8.0,...,False,4.0,Sunset/Parkside,"{'latitude': '37.748396041406835', 'longitude'...","{'type': 'Point', 'coordinates': [-122.50294, ...",,,,,
5,row-gr32-umab-fxqu,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-d76w-xwwa~2fpx,91.0,2.0,2.0,25.0,28.0,3.0,...,False,9.0,Portola,"{'latitude': '37.730376135010886', 'longitude'...","{'type': 'Point', 'coordinates': [-122.40584, ...",,,,,
6,row-5jy6_s62c~m2g4,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-pbei~25pq-sufa,44.0,10.0,8.0,41.0,40.0,8.0,...,False,7.0,West of Twin Peaks,"{'latitude': '37.74321534793589', 'longitude':...","{'type': 'Point', 'coordinates': [-122.47192, ...",,,,,
7,row-vzi9_7svc~98jq,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-md38~jqdr~y94e,20.0,5.0,10.0,36.0,36.0,10.0,...,False,5.0,Tenderloin,"{'latitude': '37.78333200817521', 'longitude':...","{'type': 'Point', 'coordinates': [-122.40984, ...",6.0,1.0,1.0,18.0,
8,row-hin8.uhzw.rh2d,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-r87s.ubap-ypha,20.0,5.0,10.0,36.0,36.0,10.0,...,False,5.0,Tenderloin,"{'latitude': '37.78361971423206', 'longitude':...","{'type': 'Point', 'coordinates': [-122.41506, ...",6.0,1.0,1.0,18.0,
9,row-c8et.y7q5~zf5a,2023-01-23 23:46:16.858000+00:00,2023-01-23 23:46:33.925000+00:00,rv-hhns.jcpx_zh4i,20.0,5.0,10.0,36.0,36.0,10.0,...,False,5.0,Tenderloin,"{'latitude': '37.78361971423206', 'longitude':...","{'type': 'Point', 'coordinates': [-122.41506, ...",6.0,1.0,1.0,18.0,


In [29]:
# Removing rows with null data so we can avoid the error when convering to spark df for schema purpose
"""The command pandas_df[pandas_df.notnull().all(1)] is used in Python's pandas library to filter rows from a pandas DataFrame based on whether they contain only non-null values in all columns.

Here's how the command works:

pandas_df refers to a pandas DataFrame.
pandas_df.notnull() returns a DataFrame of the same shape as pandas_df but with boolean values indicating whether each element is null or not.
pandas_df.notnull().all(1) returns a Series of boolean values indicating whether all the values in each row of the DataFrame are non-null. The 1 argument specifies that we want to check for non-null values across each row.
Finally, pandas_df[pandas_df.notnull().all(1)] returns a new DataFrame that contains only the rows where all the values are non-null. This is achieved by using the boolean Series to select only the rows where all the values are True.
In summary, this command can be used to filter out rows with missing values from a pandas DataFrame."""
pandas_df= pandas_df[pandas_df.notnull().all(1)]
pandas_df.shape

(9678, 29)

In [30]:
# Extract the schema of the pandas df by converting it to pyspark df
spark.createDataFrame(pandas_df).schema
# Modify and align the schema in VSCode

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


StructType([StructField('client_location', StringType(), True), StructField('neighborhood', StringType(), True), StructField('non_payment', BooleanType(), True), StructField('nuisance', BooleanType(), True), StructField('breach', BooleanType(), True), StructField('city', StringType(), True), StructField('address', StringType(), True), StructField('demolition', BooleanType(), True), StructField('illegal_use', BooleanType(), True), StructField('access_denial', BooleanType(), True), StructField('other_cause', BooleanType(), True), StructField('eviction_id', StringType(), True), StructField('state', StringType(), True), StructField('substantial_rehab', BooleanType(), True), StructField('development', BooleanType(), True), StructField('constraints_date', StringType(), True), StructField('roommate_same_unit', BooleanType(), True), StructField('file_date', StringType(), True), StructField('ellis_act_withdrawal', BooleanType(), True), StructField('good_samaritan_ends', BooleanType(), True), St

In [186]:
pandas_df[pandas_df['illegal_use']==False].shape

(9678, 29)

In [189]:
pandas_df['illegal_use']

47       False
54       False
74       False
94       False
95       False
         ...  
99971    False
99982    False
99984    False
99991    False
99992    False
Name: illegal_use, Length: 9678, dtype: bool

In [35]:
pandas_df[pandas_df['development']=='M221207']

Unnamed: 0,client_location,neighborhood,non_payment,nuisance,breach,city,address,demolition,illegal_use,access_denial,...,good_samaritan_ends,lead_remediation,unapproved_subtenant,capital_improvement,condo_conversion,late_payments,zip,supervisor_district,owner_move_in,failure_to_sign_renewal


In [190]:
pandas_df['constraints_date']

47       2028-01-16T00:00:00.000
54       2028-01-31T00:00:00.000
74       2027-12-31T00:00:00.000
94       2028-01-03T00:00:00.000
95       2028-01-03T00:00:00.000
                  ...           
99971    2018-04-22T00:00:00.000
99982    2018-04-20T00:00:00.000
99984    2018-04-21T00:00:00.000
99991    2018-04-13T00:00:00.000
99992    2018-04-20T00:00:00.000
Name: constraints_date, Length: 9678, dtype: object

## Creating PySpark Dataframe by passing the schema
* Refer to the dtypes [here](https://data.sfgov.org/Housing-and-Buildings/Eviction-Notices/5cei-gny5)

In [193]:
from pyspark.sql import types
schema = types.StructType([
    types.StructField('eviction_id', types.StringType(), True), 
    types.StructField('address', types.StringType(), True), 
    types.StructField('city', types.StringType(), True), 
    types.StructField('state', types.StringType(), True), 
    types.StructField('zip', types.IntegerType(), True), 
    types.StructField('file_date', types.DateType(), True), 
    types.StructField('non_payment', types.BooleanType(), True), 
    types.StructField('breach', types.BooleanType(), True), 
    types.StructField('nuisance', types.BooleanType(), True), 
    types.StructField('illegal_use', types.BooleanType(), True), 
    types.StructField('failure_to_sign_renewal', types.BooleanType(), True), 
    types.StructField('access_denial', types.BooleanType(), True), 
    types.StructField('unapproved_subtenant', types.BooleanType(), True), 
    types.StructField('owner_move_in', types.BooleanType(), True), 
    types.StructField('demolition', types.BooleanType(), True), 
    types.StructField('capital_improvement', types.BooleanType(), True), 
    types.StructField('substantial_rehab', types.BooleanType(), True), 
    types.StructField('ellis_act_withdrawal', types.BooleanType(), True), 
    types.StructField('condo_conversion', types.BooleanType(), True), 
    types.StructField('roommate_same_unit', types.BooleanType(), True), 
    types.StructField('other_cause', types.BooleanType(), True), 
    types.StructField('late_payments', types.BooleanType(), True), 
    types.StructField('lead_remediation', types.BooleanType(), True), 
    types.StructField('development', types.BooleanType(), True), 
    types.StructField('good_samaritan_ends', types.BooleanType(), True), 
    types.StructField('constraints_date', types.StringType(), True), 
    types.StructField('supervisor_district', types.IntegerType(), True), 
    types.StructField('neighborhood', types.StringType(), True), 
    types.StructField('client_location', types.StringType(), True)])

In [194]:
raw_pyspark_df = spark.read \
                       .option("header", "true") \
                       .schema(schema) \
                       .csv(source_path)

In [195]:
raw_pyspark_df.count()

177578

In [196]:
pyspark_df = raw_pyspark_df.select(imp_cols)

In [197]:
pyspark_df.printSchema()

root
 |-- eviction_id: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- file_date: date (nullable = true)
 |-- non_payment: boolean (nullable = true)
 |-- breach: boolean (nullable = true)
 |-- nuisance: boolean (nullable = true)
 |-- illegal_use: boolean (nullable = true)
 |-- failure_to_sign_renewal: boolean (nullable = true)
 |-- access_denial: boolean (nullable = true)
 |-- unapproved_subtenant: boolean (nullable = true)
 |-- owner_move_in: boolean (nullable = true)
 |-- demolition: boolean (nullable = true)
 |-- capital_improvement: boolean (nullable = true)
 |-- substantial_rehab: boolean (nullable = true)
 |-- ellis_act_withdrawal: boolean (nullable = true)
 |-- condo_conversion: boolean (nullable = true)
 |-- roommate_same_unit: boolean (nullable = true)
 |-- other_cause: boolean (nullable = true)
 |-- late_payments: boolean (nullable = true)
 |-- lea

## Option 1: Write partitioned data to local
* And then use the prefect blocks to write the data to GCS and BQ

In [36]:
# folder to write the partition on the local system
data_partition_dir = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/22/clean_eviction_2023-03-22_partitioned'
pyspark_df = pyspark_df.repartition(100)
pyspark_df.write.parquet(data_partition_dir, mode='overwrite')

23/03/23 18:48:08 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 45, schema size: 29
CSV file: file:///home/sanyashireen/sf_eviction/data_eviction/2023/3/22/gcs_raw_eviction_2023-03-22.csv


                                                                                

## Option 2: Write partitioned data directly to gcs and bq (as an external table)
* Will need to use Dataproc for that.

In [12]:
spark.stop()

In [None]:

target_dir = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/25/clean_eviction_2023-03-25_partitioned'
spark = SparkSession.builder \
                    .master("local[*]") \
                    .appName('test') \
                    .getOrCreate()
imp_cols = ['eviction_id', 'address', 'city', 'state', 'zip', 'file_date', 'non_payment', 'breach', 'nuisance', 'illegal_use', 'failure_to_sign_renewal', 'access_denial', 'unapproved_subtenant', 'owner_move_in', 'demolition', 'capital_improvement', 'substantial_rehab', 'ellis_act_withdrawal', 'condo_conversion', 'roommate_same_unit', 'other_cause', 'late_payments', 'lead_remediation', 'development', 'good_samaritan_ends', 'constraints_date', 'supervisor_district', 'neighborhood', 'client_location']
schema = types.StructType([
                            types.StructField('eviction_id', types.StringType(), True), 
                            types.StructField('address', types.StringType(), True), 
                            types.StructField('city', types.StringType(), True), 
                            types.StructField('state', types.StringType(), True), 
                            types.StructField('zip', types.IntegerType(), True), 
                            types.StructField('file_date', types.DateType(), True), 
                            types.StructField('non_payment', types.BooleanType(), True), 
                            types.StructField('breach', types.BooleanType(), True), 
                            types.StructField('nuisance', types.BooleanType(), True), 
                            types.StructField('illegal_use', types.BooleanType(), True), 
                            types.StructField('failure_to_sign_renewal', types.BooleanType(), True), 
                            types.StructField('access_denial', types.BooleanType(), True), 
                            types.StructField('unapproved_subtenant', types.BooleanType(), True), 
                            types.StructField('owner_move_in', types.BooleanType(), True), 
                            types.StructField('demolition', types.BooleanType(), True), 
                            types.StructField('capital_improvement', types.BooleanType(), True), 
                            types.StructField('substantial_rehab', types.BooleanType(), True), 
                            types.StructField('ellis_act_withdrawal', types.BooleanType(), True), 
                            types.StructField('condo_conversion', types.BooleanType(), True), 
                            types.StructField('roommate_same_unit', types.BooleanType(), True), 
                            types.StructField('other_cause', types.BooleanType(), True), 
                            types.StructField('late_payments', types.BooleanType(), True), 
                            types.StructField('lead_remediation', types.BooleanType(), True), 
                            types.StructField('development', types.BooleanType(), True), 
                            types.StructField('good_samaritan_ends', types.BooleanType(), True), 
                            types.StructField('constraints_date', types.StringType(), True), 
                            types.StructField('supervisor_district', types.IntegerType(), True), 
                            types.StructField('neighborhood', types.StringType(), True), 
                            types.StructField('client_location', types.StringType(), True)])

df = spark.read \
          .option("header", "true") \
          .schema(schema) \
          .json(source_path) \
          .select(imp_cols)
    
print(f'Total rows read:' {df.shape})
df = df.repartition(100)
df.write.parquet(target_dir, mode='overwrite')
spark.stop()