In [1]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Read CSV GZ Example") \
    .getOrCreate()

24/09/26 23:34:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
df = spark.read.csv("data/listings.csv.gz", 
    header=True,
    inferSchema=True,
    sep=",",        # The separator is still a comma
    quote='"',      # Quote character
    escape='"',    # No escape character
    multiLine=True, # Allows for multiline records
    mode="PERMISSIVE"  # Handle malformed lines permissively
)

                                                                                

In [7]:
df.schema

StructType([StructField('id', LongType(), True), StructField('listing_url', StringType(), True), StructField('scrape_id', LongType(), True), StructField('last_scraped', DateType(), True), StructField('source', StringType(), True), StructField('name', StringType(), True), StructField('description', StringType(), True), StructField('neighborhood_overview', StringType(), True), StructField('picture_url', StringType(), True), StructField('host_id', IntegerType(), True), StructField('host_url', StringType(), True), StructField('host_name', StringType(), True), StructField('host_since', DateType(), True), StructField('host_location', StringType(), True), StructField('host_about', StringType(), True), StructField('host_response_time', StringType(), True), StructField('host_response_rate', StringType(), True), StructField('host_acceptance_rate', StringType(), True), StructField('host_is_superhost', StringType(), True), StructField('host_thumbnail_url', StringType(), True), StructField('host_pi

In [5]:
import json
# Convert the schema to JSON format
schema_json = df.schema.json()

# Pretty print the JSON schema
print(json.dumps(json.loads(schema_json), indent=2))

{
  "fields": [
    {
      "metadata": {},
      "name": "id",
      "nullable": true,
      "type": "long"
    },
    {
      "metadata": {},
      "name": "listing_url",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "scrape_id",
      "nullable": true,
      "type": "long"
    },
    {
      "metadata": {},
      "name": "last_scraped",
      "nullable": true,
      "type": "date"
    },
    {
      "metadata": {},
      "name": "source",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "name",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "description",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "neighborhood_overview",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "picture_url",
      "nullable": true,
      "type": "string"
    },
   

In [12]:
df.select(df.neighbourhood_cleansed).show(100)

+----------------------+
|neighbourhood_cleansed|
+----------------------+
|             Islington|
|  Kensington and Ch...|
|           Westminster|
|            Wandsworth|
|  Richmond upon Thames|
|               Enfield|
|  Hammersmith and F...|
|                Camden|
|  Richmond upon Thames|
|               Lambeth|
|        City of London|
|                Camden|
|         Tower Hamlets|
|  Richmond upon Thames|
|              Haringey|
|  Hammersmith and F...|
|              Haringey|
|             Southwark|
|           Westminster|
|                Barnet|
|               Hackney|
|                Merton|
|                Ealing|
|              Haringey|
|           Westminster|
|  Hammersmith and F...|
|               Lambeth|
|  Kensington and Ch...|
|              Hounslow|
|             Southwark|
|        Waltham Forest|
|                Barnet|
|  Hammersmith and F...|
|  Hammersmith and F...|
|                 Brent|
|                Barnet|
|                Camden|


In [11]:
df.select(df.neighbourhood_cleansed).distinct().show(40)

[Stage 9:>                                                          (0 + 1) / 1]

+----------------------+
|neighbourhood_cleansed|
+----------------------+
|            Wandsworth|
|               Croydon|
|                Bexley|
|               Lambeth|
|  Barking and Dagenham|
|                Camden|
|             Greenwich|
|                Newham|
|         Tower Hamlets|
|                Barnet|
|              Hounslow|
|                Harrow|
|  Kensington and Ch...|
|             Islington|
|                 Brent|
|              Haringey|
|               Bromley|
|                Merton|
|           Westminster|
|             Southwark|
|               Hackney|
|               Enfield|
|                Ealing|
|                Sutton|
|  Hammersmith and F...|
|  Kingston upon Thames|
|              Havering|
|            Hillingdon|
|        Waltham Forest|
|  Richmond upon Thames|
|             Redbridge|
|        City of London|
|              Lewisham|
+----------------------+



                                                                                