In [52]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session, reusing an already-running one if present.
spark = (
    SparkSession.builder
    .appName("Read CSV GZ Example")
    .getOrCreate()
)

In [89]:
# Load the listings CSV into a DataFrame. Free-text columns can contain
# commas, quotes and line breaks, hence the explicit quoting options.
df = spark.read.csv(
    "listings2.csv",
    header=True,        # first line holds the column names
    inferSchema=True,   # let Spark infer column types from the data
    sep=",",            # fields are comma-separated
    quote='"',          # fields may be wrapped in double quotes
    escape='"',         # a quote inside a quoted field is escaped by doubling it ("")
    multiLine=True,     # quoted fields may span several lines
    mode="PERMISSIVE",  # keep malformed records instead of failing the read
)

In [90]:
import json
# Serialize the DataFrame schema to a compact JSON string.
schema_json = df.schema.json()

# Parse it back and dump it with indentation so the output is readable.
schema_dict = json.loads(schema_json)
print(json.dumps(schema_dict, indent=2))

{
  "fields": [
    {
      "metadata": {},
      "name": "id",
      "nullable": true,
      "type": "long"
    },
    {
      "metadata": {},
      "name": "listing_url",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "scrape_id",
      "nullable": true,
      "type": "long"
    },
    {
      "metadata": {},
      "name": "last_scraped",
      "nullable": true,
      "type": "date"
    },
    {
      "metadata": {},
      "name": "source",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "name",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "description",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "neighborhood_overview",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "picture_url",
      "nullable": true,
      "type": "string"
    },
   

In [92]:
# Look up a single listing by its id.
# `id` is inferred as a long column (see the schema printed above), so compare
# it with an integer literal instead of a string that Spark would have to cast
# implicitly before comparing.
df_f = df.filter(df.id == 307497)
df_f.show()

+------+--------------------+--------------+------------+---------------+--------------------+--------------------+---------------------+--------------------+-------+--------------------+---------+----------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-------------------+-------------------------+------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+--------+---------+-------------+---------------+------------+---------+--------------+--------+----+--------------------+-----+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+----------------+---------------------+-----------

In [93]:
# Display the inferred schema as a StructType (the cell's last expression is its output).
df.schema

StructType([StructField('id', LongType(), True), StructField('listing_url', StringType(), True), StructField('scrape_id', LongType(), True), StructField('last_scraped', DateType(), True), StructField('source', StringType(), True), StructField('name', StringType(), True), StructField('description', StringType(), True), StructField('neighborhood_overview', StringType(), True), StructField('picture_url', StringType(), True), StructField('host_id', IntegerType(), True), StructField('host_url', StringType(), True), StructField('host_name', StringType(), True), StructField('host_since', DateType(), True), StructField('host_location', StringType(), True), StructField('host_about', StringType(), True), StructField('host_response_time', StringType(), True), StructField('host_response_rate', StringType(), True), StructField('host_acceptance_rate', StringType(), True), StructField('host_is_superhost', StringType(), True), StructField('host_thumbnail_url', StringType(), True), StructField('host_pi

In [94]:
# Project the DataFrame down to the neighbourhood column only.
n_df = df.select("neighbourhood_cleansed")

In [95]:
# Preview the first 20 rows; long cell values are truncated in the printout.
n_df.show(n=20, truncate=True)

+----------------------+
|neighbourhood_cleansed|
+----------------------+
|          Centrum-Oost|
|            Westerpark|
|          Centrum-Oost|
|          Centrum-Oost|
|          Centrum-West|
|          Centrum-Oost|
|              Oud-Oost|
|          Centrum-West|
|  Oostelijk Havenge...|
|          Centrum-West|
|          Centrum-West|
|          Centrum-Oost|
|          Centrum-Oost|
|  Buitenveldert - Z...|
|          Centrum-West|
|          Centrum-Oost|
|         Bos en Lommer|
|          Centrum-West|
|  IJburg - Zeeburge...|
|          Centrum-Oost|
+----------------------+
only showing top 20 rows

