In [1]:
# Creating and testing script to clean and transform csv data using pyspark
# spark related packages
import pyspark
from pyspark.sql import SparkSession
# Local Spark session named 'test'; `local[*]` runs Spark in-process and
# `getOrCreate()` reuses an already-running session when there is one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/23 17:16:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Location of the raw eviction CSV on the local filesystem
raw_data_filepath = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/22/gcs_raw_eviction_2023-03-22.csv'
# First line of the file is treated as the header; no schema is supplied at this stage
df = spark.read.csv(raw_data_filepath, header=True)

In [3]:
# Preview the first 20 rows of the raw DataFrame
df.show(n=20)

23/03/23 17:16:20 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-----------+--------------------+-------------+-----+-----+--------------------+-----------+------+--------+-----------+-----------------------+-------------+--------------------+-------------+----------+-------------------+-----------------+--------------------+----------------+------------------+-----------+-------------+----------------+-----------+-------------------+----------------+-------------------+--------------------+---------------+--------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+--------------

In [6]:
# Fetch the first record once and reuse it: every `df.head(1)` call is its own
# Spark action, so evaluating it twice inside the f-string ran the same job twice.
first_record = df.head(1)[0]
print(f'The pyspark df is a list of records where each record is of type {type(first_record)} \n\n {first_record}')

The pyspark df is a list of records where each record is of type <class 'pyspark.sql.types.Row'> 

 Row(eviction_id='M222138', address='400 Block Of Eddy  Street', city='San Francisco', state='CA', zip='94102', file_date='2022-12-12T00:00:00.000', non_payment='false', breach='false', nuisance='true', illegal_use='false', failure_to_sign_renewal='false', access_denial='false', unapproved_subtenant='false', owner_move_in='false', demolition='false', capital_improvement='false', substantial_rehab='false', ellis_act_withdrawal='false', condo_conversion='false', roommate_same_unit='false', other_cause='false', late_payments='false', lead_remediation='false', development='false', good_samaritan_ends='false', constraints_date=None, supervisor_district='5', neighborhood='Tenderloin', client_location=None, shape='POINT (-122.41508 37.783627)', :@computed_region_6qbp_sg9q=None, :@computed_region_qgnn_b9vv=None, :@computed_region_26cr_cadq=None, :@computed_region_ajp5_b2md=None, :@computed_region

In [7]:
# Inspect the first record as a plain dict to decide which columns are of interest
row_as_dict = df.head(1)[0].asDict()

# All column names
print(list(row_as_dict))

# One sample value per column, printed as name=value
for col_name, sample_value in row_as_dict.items():
    print(f'{col_name}={sample_value}')


['eviction_id', 'address', 'city', 'state', 'zip', 'file_date', 'non_payment', 'breach', 'nuisance', 'illegal_use', 'failure_to_sign_renewal', 'access_denial', 'unapproved_subtenant', 'owner_move_in', 'demolition', 'capital_improvement', 'substantial_rehab', 'ellis_act_withdrawal', 'condo_conversion', 'roommate_same_unit', 'other_cause', 'late_payments', 'lead_remediation', 'development', 'good_samaritan_ends', 'constraints_date', 'supervisor_district', 'neighborhood', 'client_location', 'shape', ':@computed_region_6qbp_sg9q', ':@computed_region_qgnn_b9vv', ':@computed_region_26cr_cadq', ':@computed_region_ajp5_b2md', ':@computed_region_fyvs_ahh9', ':@computed_region_p5aj_wyqh', ':@computed_region_rxqg_mtj9', ':@computed_region_yftq_j783', ':@computed_region_bh8s_q3mv', ':@computed_region_9jxd_iqea', ':@computed_region_6ezc_tdp2', ':@computed_region_6pnf_4xz7', ':@computed_region_h4ep_8xdi', ':@computed_region_pigm_ib2e', ':@computed_region_jwn9_ihcz']
eviction_id=M222138
address=400 B

## Analysis
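* Not every column in the raw dataset is of interest.
* The trailing `:@computed_region_*` columns are not of interest, so we will ignore them.
* We will extract only the columns of interest from the raw dataset.
* Columns `client_location` and `shape` appear to hold the same data. (Note: in the sample row above `client_location` is null while `shape` is populated, so this should be verified before dropping either one.)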

In [20]:
# Columns of interest, listed in the same order as they appear in the raw CSV header
imp_cols = [
    # identification and address
    'eviction_id',
    'address',
    'city',
    'state',
    'zip',
    'file_date',
    # eviction-cause flags
    'non_payment',
    'breach',
    'nuisance',
    'illegal_use',
    'failure_to_sign_renewal',
    'access_denial',
    'unapproved_subtenant',
    'owner_move_in',
    'demolition',
    'capital_improvement',
    'substantial_rehab',
    'ellis_act_withdrawal',
    'condo_conversion',
    'roommate_same_unit',
    'other_cause',
    'late_payments',
    'lead_remediation',
    'development',
    'good_samaritan_ends',
    # remaining attributes
    'constraints_date',
    'supervisor_district',
    'neighborhood',
    'client_location',
]


In [9]:
# Build a new DataFrame restricted to the columns of interest and preview five rows
df_clean = df.select(*imp_cols)
df_clean.show(n=5)

+-----------+--------------------+-------------+-----+-----+--------------------+-----------+------+--------+-----------+-----------------------+-------------+--------------------+-------------+----------+-------------------+-----------------+--------------------+----------------+------------------+-----------+-------------+----------------+-----------+-------------------+----------------+-------------------+--------------------+---------------+--------------------+
|eviction_id|             address|         city|state|  zip|           file_date|non_payment|breach|nuisance|illegal_use|failure_to_sign_renewal|access_denial|unapproved_subtenant|owner_move_in|demolition|capital_improvement|substantial_rehab|ellis_act_withdrawal|condo_conversion|roommate_same_unit|other_cause|late_payments|lead_remediation|development|good_samaritan_ends|constraints_date|supervisor_district|        neighborhood|client_location|               shape|
+-----------+--------------------+-------------+-----+----

In [10]:
# Print column names and types of the trimmed DataFrame
# (no schema was supplied when reading, so the columns show up as strings)
df_clean.printSchema()

root
 |-- eviction_id: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- file_date: string (nullable = true)
 |-- non_payment: string (nullable = true)
 |-- breach: string (nullable = true)
 |-- nuisance: string (nullable = true)
 |-- illegal_use: string (nullable = true)
 |-- failure_to_sign_renewal: string (nullable = true)
 |-- access_denial: string (nullable = true)
 |-- unapproved_subtenant: string (nullable = true)
 |-- owner_move_in: string (nullable = true)
 |-- demolition: string (nullable = true)
 |-- capital_improvement: string (nullable = true)
 |-- substantial_rehab: string (nullable = true)
 |-- ellis_act_withdrawal: string (nullable = true)
 |-- condo_conversion: string (nullable = true)
 |-- roommate_same_unit: string (nullable = true)
 |-- other_cause: string (nullable = true)
 |-- late_payments: string (nullable = true)
 |-- lead_remediation: 

## Convert a few rows to a pandas DataFrame and extract the schema

In [14]:
import pandas as pd  # conventional alias (previously the single-letter `p`)

# Load only the first 1000 rows and only the columns of interest; this sample is
# used purely to let pandas infer column dtypes for the schema draft.
pandas_df = pd.read_csv(raw_data_filepath, nrows=1000, usecols=imp_cols)

In [15]:
# Remove rows with null data so we can avoid the error when converting to a Spark
# DataFrame for schema purposes.
# `dropna()` (defaults: axis=0, how='any') drops every row that contains at least one
# missing value; it is equivalent to pandas_df[pandas_df.notnull().all(axis=1)] and
# keeps the original index labels of the surviving rows.
pandas_df = pandas_df.dropna()
pandas_df.shape

(77, 30)

In [16]:
# Extract the schema of the pandas df by converting it to a pyspark df.
# The expression is the last one in the cell, so the inferred StructType is shown as the cell output.
spark.createDataFrame(pandas_df).schema
# The displayed schema is then modified and aligned by hand (in VSCode) into the explicit schema used below

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


StructType([StructField('eviction_id', StringType(), True), StructField('address', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), True), StructField('zip', DoubleType(), True), StructField('file_date', StringType(), True), StructField('non_payment', BooleanType(), True), StructField('breach', BooleanType(), True), StructField('nuisance', BooleanType(), True), StructField('illegal_use', BooleanType(), True), StructField('failure_to_sign_renewal', BooleanType(), True), StructField('access_denial', BooleanType(), True), StructField('unapproved_subtenant', BooleanType(), True), StructField('owner_move_in', BooleanType(), True), StructField('demolition', BooleanType(), True), StructField('capital_improvement', BooleanType(), True), StructField('substantial_rehab', BooleanType(), True), StructField('ellis_act_withdrawal', BooleanType(), True), StructField('condo_conversion', BooleanType(), True), StructField('roommate_same_unit', BooleanType(

## Creating PySpark Dataframe by passing the schema
* Refer to the dtypes [here](https://data.sfgov.org/Housing-and-Buildings/Eviction-Notices/5cei-gny5)

In [29]:
from pyspark.sql import types
# Eviction-cause columns; each one is a boolean flag in the schema.
_eviction_cause_flags = [
    'non_payment',
    'breach',
    'nuisance',
    'illegal_use',
    'failure_to_sign_renewal',
    'access_denial',
    'unapproved_subtenant',
    'owner_move_in',
    'demolition',
    'capital_improvement',
    'substantial_rehab',
    'ellis_act_withdrawal',
    'condo_conversion',
    'roommate_same_unit',
    'other_cause',
    'late_payments',
    'lead_remediation',
    'development',
    'good_samaritan_ends',
]

# (column name, Spark type class) pairs in the order the fields appear in the schema.
# NOTE(review): `zip` is typed as an integer and `constraints_date` is kept as a string
# while `file_date` is a date - confirm these choices against the dataset documentation.
_field_types = (
    [
        ('eviction_id', types.StringType),
        ('address', types.StringType),
        ('city', types.StringType),
        ('state', types.StringType),
        ('zip', types.IntegerType),
        ('file_date', types.DateType),
    ]
    + [(flag_name, types.BooleanType) for flag_name in _eviction_cause_flags]
    + [
        ('constraints_date', types.StringType),
        ('supervisor_district', types.IntegerType),
        ('neighborhood', types.StringType),
        ('client_location', types.StringType),
    ]
)

# Every field is declared nullable (third StructField argument is True).
schema = types.StructType(
    [types.StructField(col_name, type_cls(), True) for col_name, type_cls in _field_types]
)

In [31]:
# Read the raw CSV again, this time with the explicit schema so values are typed.
# NOTE(review): the CSV header has 45 columns while the schema declares 29 (Spark logs a
# CSVHeaderChecker warning when the data is read). This presumably works because the
# schema fields line up with the first 29 header columns - confirm that this ordering is stable.
raw_pyspark_df = spark.read.csv(raw_data_filepath, schema=schema, header=True)

In [32]:
# Restrict the typed DataFrame to the columns of interest
pyspark_df = raw_pyspark_df.select(*imp_cols)

In [33]:
# Print the schema to confirm the columns now carry the explicit types instead of strings
pyspark_df.printSchema()

root
 |-- eviction_id: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- file_date: date (nullable = true)
 |-- non_payment: boolean (nullable = true)
 |-- breach: boolean (nullable = true)
 |-- nuisance: boolean (nullable = true)
 |-- illegal_use: boolean (nullable = true)
 |-- failure_to_sign_renewal: boolean (nullable = true)
 |-- access_denial: boolean (nullable = true)
 |-- unapproved_subtenant: boolean (nullable = true)
 |-- owner_move_in: boolean (nullable = true)
 |-- demolition: boolean (nullable = true)
 |-- capital_improvement: boolean (nullable = true)
 |-- substantial_rehab: boolean (nullable = true)
 |-- ellis_act_withdrawal: boolean (nullable = true)
 |-- condo_conversion: boolean (nullable = true)
 |-- roommate_same_unit: boolean (nullable = true)
 |-- other_cause: boolean (nullable = true)
 |-- late_payments: boolean (nullable = true)
 |-- lea

In [36]:
# Destination directory for the partitioned parquet output
data_partition_dir = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/22/clean_eviction_2023-03-22_partitioned'
# Redistribute the rows over 100 partitions before writing
pyspark_df = pyspark_df.repartition(100)
# Write as parquet, replacing anything already present at the destination
pyspark_df.write.mode('overwrite').parquet(data_partition_dir)

23/03/23 18:48:08 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 45, schema size: 29
CSV file: file:///home/sanyashireen/sf_eviction/data_eviction/2023/3/22/gcs_raw_eviction_2023-03-22.csv


                                                                                

In [None]:
# Stop the Spark session now that the parquet output has been written
spark.stop()