# ETL to transform Airbnb raw data into the database schema

This ETL is written in PySpark. Although the current data volume could be handled with Pandas or a similar library, PySpark was chosen so that the same pipeline keeps working as the data grows.

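Steps:

1. Initialise the Spark session and define the input (CSV) and output (Parquet) paths.
2. Neighbourhoods: load, drop the empty `neighbourhood_group` column, trim the names, add a surrogate `id` and export.
3. Listings: load, drop the empty columns and split into four tables (main listings, hosts, listing amenities, listing complements); clean, type and export each of them, exploding host verifications and amenities into one row per value.
4. Reviews: load, clean, type and export.
5. Calendar: load, clean, type, add a surrogate `id` and export.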

## Initialisation

In [1]:
### Import modules

import json
import warnings
warnings.filterwarnings('ignore')

import findspark
findspark.init()

from pyspark.sql import SparkSession, Window, SQLContext
from pyspark.sql import functions as psF
from pyspark.sql import types as psDT

# Build (or reuse) the Spark session; the PostgreSQL JDBC jar is added to the driver classpath
spark = (
    SparkSession.builder
    .appName('SparkWithPostgres')
    .config("spark.driver.extraClassPath", "/project/Practical_exam/postgresql-42.3.2.jar")
    .getOrCreate()
)
print('Spark session:', spark, sep='\n')

import pyspark
# from pyspark.sql import SQLContext

Spark session:
<pyspark.sql.session.SparkSession object at 0x7f8a841bf3a0>


In [2]:
### Initialise Spark context

# Fetch the active Spark context (created together with the session above)
sc = pyspark.SparkContext.getOrCreate()
print('Initialisation of Spark context', sc, '', sep='\n')

# Legacy SQL entry point built on top of that context
sqlContext = SQLContext(sc)
print('Initialisation of SparkSQL', sqlContext, sep='\n')

# Only log errors so that the notebook output stays readable
for context in (spark.sparkContext, sc):
    context.setLogLevel("ERROR")

Initialisation of Spark context
<SparkContext master=local[*] appName=SparkWithPostgres>

Initialisation of SparkSQL
<pyspark.sql.context.SQLContext object at 0x7f8aa8c759a0>


In [3]:
### Constants

# Directory holding the raw CSV inputs (neighbourhoods, listings, reviews, calendar)
CSV_PATH = './OriginalData_csv/'
# Directory receiving the schema-ready Parquet exports written by this notebook
PARQUET_PATH = './SchemaReadyData_parquet/'

## Neighbourhoods

In [4]:
### Load neighbourhoods to Spark dataframe

# All columns are read as strings (no schema inference); typing happens later.
# multiline lets a quoted field span several lines, escape='"' makes a doubled
# quote inside a quoted field a literal quote.
df_neighbourhoods = spark.read.options(delimiter = ',',
                                       header = True,
                                       escape = '"',
                                       multiline = True).csv(CSV_PATH + 'neighbourhoods.csv')

# Print schema (printSchema() prints by itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output)
print('Schema of dataframe neighbourhoods')
df_neighbourhoods.printSchema()

Schema of dataframe neighbourhoods
root
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)

None


In [5]:
### Drop empty column and add index column

df_neighbourhoods = df_neighbourhoods.drop('neighbourhood_group')

# Trim neighbourhood column
df_neighbourhoods = df_neighbourhoods.withColumn('neighbourhood', psF.trim('neighbourhood'))

# Add index column: consecutive 1-based row numbers in the current row order, stored as long.
# NOTE(review): a window without partitionBy makes Spark process all rows in a single
# partition; acceptable for this small lookup table.
neighbourhood_order = Window.orderBy(psF.monotonically_increasing_id())
df_neighbourhoods = df_neighbourhoods.withColumn('id',
                                                 psF.row_number().over(neighbourhood_order).cast(psDT.LongType()))

# Print schema (printSchema() prints by itself and returns None, so it is not wrapped in print())
print('Schema of table neighbourhood')
df_neighbourhoods.printSchema()

Schema of table neighbourhood
root
 |-- neighbourhood: string (nullable = true)
 |-- id: long (nullable = false)

None


In [6]:
### Export neighbourhoods to parquet

# Overwrite any previous export so the cell can be re-run
df_neighbourhoods.write.parquet(PARQUET_PATH + 'neighbourhoods.parquet', mode='overwrite')

## Listings

In [7]:
### Load listings to Spark dataframe

# All columns are read as strings (no schema inference); typing happens later.
# multiline lets a quoted field (e.g. a free-text description) span several lines,
# escape='"' makes a doubled quote inside a quoted field a literal quote.
df_listings = spark.read.options(delimiter = ',',
                                 header = True,
                                 escape = '"',
                                 multiline = True).csv(CSV_PATH + 'listings.csv')

# Print schema (printSchema() prints by itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output)
print('Schema of dataframe listings')
df_listings.printSchema()

Schema of dataframe listings
root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: strin

In [8]:
### Drop empty columns (for more insight, consult EDA notebook)

# Columns removed from the listings dataframe before it is split into tables
COLUMNS_TO_DROP = (
    'neighbourhood_group_cleansed',
    'bathrooms',
    'calendar_updated',
    'license',
)

df_listings = df_listings.drop(*COLUMNS_TO_DROP)

In [9]:
### Create 4 tables from listings df
# The wide listings dataframe is split by column into: main listings, listing amenities,
# hosts and listing complements. `id` / `host_id` are repeated so the tables stay linkable.

print('Columns on original file:', len(df_listings.columns))

# Columns of the main listings table (keeps host_id as the link to the hosts table)
COLS_LISTINGS = ['id',
                 'host_id',
                 'listing_url',
                 'scrape_id',
                 'last_scraped',
                 'name',
                 'description',
                 'neighborhood_overview',
                 'picture_url',
                 'neighbourhood',
                 'neighbourhood_cleansed',
                 'latitude',
                 'longitude',
                 'property_type',
                 'room_type',
                 'accommodates',
                 'bathrooms_text',
                 'bedrooms',
                 'beds',
                 'price',
                 'minimum_nights',
                 'maximum_nights']

# Listing id plus the raw amenities string (exploded into rows further down)
COLS_LISTINGS_AMENITIES = ['id',
                           'amenities']

# Host attributes; selected per listing, so a host appears once per listing here
COLS_HOSTS = ['host_id',
              'host_url',
              'host_name',
              'host_since',
              'host_location',
              'host_about',
              'host_response_time',
              'host_response_rate',
              'host_acceptance_rate',
              'host_is_superhost',
              'host_thumbnail_url',
              'host_picture_url',
              'host_neighbourhood',
              'host_listings_count',
              'host_total_listings_count',
              'host_verifications',
              'host_has_profile_pic',
              'host_identity_verified']

# Remaining per-listing columns: night limits, availability, review and host-count figures
COLS_COMPLENET = ['id',
                  'minimum_minimum_nights',
                  'maximum_minimum_nights',
                  'minimum_maximum_nights',
                  'maximum_maximum_nights',
                  'minimum_nights_avg_ntm',
                  'maximum_nights_avg_ntm',
                  'has_availability',
                  'availability_30',
                  'availability_60',
                  'availability_90',
                  'availability_365',
                  'calendar_last_scraped',
                  'number_of_reviews',
                  'number_of_reviews_ltm',
                  'number_of_reviews_l30d',
                  'first_review',
                  'last_review',
                  'review_scores_rating',
                  'review_scores_accuracy',
                  'review_scores_cleanliness',
                  'review_scores_checkin',
                  'review_scores_communication',
                  'review_scores_location',
                  'review_scores_value',
                  'instant_bookable',
                  'calculated_host_listings_count',
                  'calculated_host_listings_count_entire_homes',
                  'calculated_host_listings_count_private_rooms',
                  'calculated_host_listings_count_shared_rooms',
                  'reviews_per_month']

# Sanity check: the four lists together must account for every column of the original file
print('Columns after the split:',
      len(COLS_LISTINGS) + len(COLS_HOSTS) + len(COLS_COMPLENET) + len(COLS_LISTINGS_AMENITIES) - 3) # Correction for the 3 additional key columns: 'id' is listed 3 times and 'host_id' twice

# Create dataframes
df_listings_main = df_listings.select(COLS_LISTINGS)
df_listings_amenities = df_listings.select(COLS_LISTINGS_AMENITIES)
df_hosts = df_listings.select(COLS_HOSTS)
df_listings_complements = df_listings.select(COLS_COMPLENET)

Columns on original file: 70
Columns after the split: 70


### Process listings main

In [10]:
### Merge with neighbourhoods to obtain neighbourhood_id

# Rename the lookup columns: the name becomes the join key used by listings and
# the surrogate id arrives under its foreign-key name
df_neighbourhoods_merge = (
    df_neighbourhoods
    .withColumnRenamed('neighbourhood', 'neighbourhood_cleansed')
    .withColumnRenamed('id', 'neighbourhood_id')
)

# Left join keeps listings without a matching neighbourhood (neighbourhood_id is null then);
# the textual key is no longer needed once the id is attached.
# NOTE(review): the lookup names were trimmed earlier but the listings key is not - confirm
# the raw values carry no surrounding whitespace.
df_listings_main = (
    df_listings_main
    .join(df_neighbourhoods_merge, on = 'neighbourhood_cleansed', how = 'left')
    .drop('neighbourhood_cleansed')
)

In [11]:
### Give structure to listings table

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of main listings')
df_listings_main.printSchema()

# Define dict with datatypes (columns not listed stay strings)
LISTINGS_MAIN_DTYPES = {'id': psDT.LongType(),
                        'host_id': psDT.LongType(),
                        'scrape_id': psDT.LongType(),
                        'last_scraped': psDT.DateType(),
                        'latitude': psDT.DoubleType(),
                        'longitude': psDT.DoubleType(),
                        'accommodates': psDT.IntegerType(),
                        'bedrooms': psDT.IntegerType(),
                        'beds': psDT.IntegerType(),
                        'price': psDT.DoubleType(),
                        'minimum_nights': psDT.IntegerType(),
                        'maximum_nights': psDT.IntegerType(),
                        'neighbourhood_id': psDT.IntegerType()}

# Define dict with new names
LISTINGS_MAIN_NAMES = {'last_scraped': 'date_last_scraped',
                      'neighbourhood': 'neighbourhood_typed'}

# Copy dataframe
df_listings_main_process = df_listings_main.select('*')

# Drop currency formatting from price: both the $ sign and the thousands separator.
# Removing only '$' leaves values such as '1,200.00', which become null when cast to double.
df_listings_main_process = df_listings_main_process.withColumn('price',
                                                               psF.regexp_replace('price', r'[$,]', ''))

# Clean every column: trim, map placeholder strings to null, cast, rename
for column in df_listings_main.columns:

    # Strip whitespaces (this also turns non-string columns into strings before the cast)
    cleaned = psF.trim(psF.col(column))

    # Replace None and N/A with null
    cleaned = psF.when((cleaned == 'None') | (cleaned == 'N/A'), None).otherwise(cleaned)

    # Change data type
    if column in LISTINGS_MAIN_DTYPES:
        cleaned = cleaned.cast(LISTINGS_MAIN_DTYPES[column])

    df_listings_main_process = df_listings_main_process.withColumn(column, cleaned)

    # Rename columns
    if column in LISTINGS_MAIN_NAMES:
        df_listings_main_process = df_listings_main_process.withColumnRenamed(column,
                                                                             LISTINGS_MAIN_NAMES[column])

# Print schema
print('Schema of table listings')
df_listings_main_process.printSchema()

Schema of main listings
root
 |-- id: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: string (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- beds: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- maximum_nights: string (nullable = true)
 |-- neighbourhood_id: long (nullable = true)

None
Schema of table listings
root
 |-- i

In [12]:
### Export listings to parquet

# Overwrite any previous export so the cell can be re-run
df_listings_main_process.write.parquet(PARQUET_PATH + 'listings.parquet', mode='overwrite')

### Process hosts - main data

In [13]:
### Drop duplicates

# The hosts frame was selected per listing, so the same host row repeats; keep one copy
# of every fully identical row.
# NOTE(review): rows are compared on all columns, so a host whose attributes differ
# between listings would still appear more than once - confirm host_id is unique.
df_hosts_unique = df_hosts.dropDuplicates()

for label, frame in (('Rows on previous dataframe:', df_hosts),
                     ('Rows on unique dataframe:', df_hosts_unique)):
    print(label, frame.count())

Rows on previous dataframe: 66641
Rows on unique dataframe: 44695


In [14]:
### Separate host verifications

# Host attributes without the verifications list
df_hosts_sliced = df_hosts_unique.drop('host_verifications')

# (host_id, verifications) pairs; the literal string 'None' is mapped to null and
# rows without verifications are dropped (the host has no verifications at all)
verifications_col = psF.col('host_verifications')
df_host_verifications = (
    df_hosts_unique
    .select(['host_id', 'host_verifications'])
    .withColumn('host_verifications',
                psF.when(verifications_col == 'None', None).otherwise(verifications_col))
    .na.drop(subset = ['host_verifications'])
)

In [15]:
### Give structure to hosts sliced table

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of hosts sliced')
df_hosts_sliced.printSchema()

# Define dict with datatypes (columns not listed stay strings)
HOST_DTYPES = {'host_id': psDT.LongType(),
               'host_since': psDT.DateType(),
               'host_response_rate': psDT.DoubleType(),
               'host_acceptance_rate': psDT.DoubleType(),
               'host_is_superhost': psDT.BooleanType(),
               'host_listings_count': psDT.IntegerType(),
               'host_total_listings_count': psDT.IntegerType(),
               'host_has_profile_pic': psDT.BooleanType(),
               'host_identity_verified': psDT.BooleanType()}

# Define features to be converted to bool (not used further in this cell)
HOST_BOOL = [i for i in HOST_DTYPES.keys() if HOST_DTYPES[i] == psDT.BooleanType()]

# Define features to be transformed to decimal from percentage
PERCENTAGE_FEAT = ['host_response_rate', 'host_acceptance_rate']

# Copy dataframe
df_hosts_process = df_hosts_sliced.select('*')

# Drop % sign from percentage feats ('%' needs no escaping in the regex; '\%' is an
# invalid Python escape sequence)
for column in PERCENTAGE_FEAT:
    df_hosts_process = df_hosts_process.withColumn(column,
                                                   psF.regexp_replace(column, '%', ''))

# Clean every column: trim, map placeholder strings to null, cast, rescale, rename
for column in df_hosts_process.columns:

    # Strip whitespaces
    cleaned = psF.trim(psF.col(column))

    # Replace None and N/A with null
    cleaned = psF.when((cleaned == 'None') | (cleaned == 'N/A'), None).otherwise(cleaned)

    # Change data type
    if column in HOST_DTYPES:
        cleaned = cleaned.cast(HOST_DTYPES[column])

    # Make percentage columns decimal (e.g. 95 -> 0.95)
    if column in PERCENTAGE_FEAT:
        cleaned = cleaned / 100

    df_hosts_process = df_hosts_process.withColumn(column, cleaned)

    # Rename columns by removing host_ (only when the prefix is really there, so a
    # column without it is never truncated)
    if column.startswith('host_'):
        df_hosts_process = df_hosts_process.withColumnRenamed(column, column[len('host_'):])

# Print schema
print('Schema of table hosts')
df_hosts_process.printSchema()

Schema of hosts sliced
root
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: string (nullable = true)
 |-- host_total_listings_count: string (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)

None
Schema of table hosts
root
 |-- id: long (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- since: date (nullab

In [16]:
### Export hosts to parquet

# Overwrite any previous export so the cell can be re-run
df_hosts_process.write.parquet(PARQUET_PATH + 'hosts.parquet', mode='overwrite')

### Process hosts - verifications

In [17]:
### Generate function to parse array of strings to rows containing array elements
### Credit: https://silpara.medium.com/pyspark-string-to-array-of-string-in-dataframe-b9572233ccea
def parse_array_from_string(x):
    """Parse a JSON-encoded array stored in a string.

    Parameters
    ----------
    x : str or None
        JSON text such as '["email", "phone"]'. None (what a Python UDF receives
        for a SQL NULL) and blank strings are accepted.

    Returns
    -------
    list or None
        The decoded JSON value, or None when ``x`` is None or blank.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is a non-blank string that is not valid JSON.
    """
    # json.loads(None) raises TypeError and json.loads('') raises JSONDecodeError,
    # which would abort the whole Spark job for a single missing value
    if x is None or not x.strip():
        return None
    return json.loads(x)

# Spark UDF wrapper around the JSON parser; the result column is an array of strings
retrieve_array_func = psF.udf(
    f=parse_array_from_string,
    returnType=psDT.ArrayType(psDT.StringType()),
)

In [18]:
### Explode verifications into different rows for final table

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of hosts verifications')
df_host_verifications.printSchema()

# Copy dataframe
df_host_verifications_process = df_host_verifications.select('*')

# Convert quotation to double quotation --> in order to properly load to json
# NOTE(review): this assumes no verification value contains an apostrophe itself.
df_host_verifications_process = df_host_verifications_process.withColumn('host_verifications',
                                                   psF.regexp_replace('host_verifications', "'", '"'))

# Change data types
df_host_verifications_process = df_host_verifications_process.withColumn('host_id',
                                                       psF.col('host_id').cast(psDT.LongType()))
df_host_verifications_process = df_host_verifications_process.withColumn('host_verifications',
                                                       retrieve_array_func(psF.col('host_verifications')))

# Explode list into rows: one row per (host, verification)
df_host_verifications_process = df_host_verifications_process.select('host_id',
                                                                    psF.explode('host_verifications') \
                                                                    .alias('verification'))

# Add index column: consecutive 1-based row numbers in the current row order, stored as long.
# NOTE(review): a window without partitionBy makes Spark process all rows in a single partition.
verification_order = Window.orderBy(psF.monotonically_increasing_id())
df_host_verifications_process = df_host_verifications_process.withColumn('id',
                                                       psF.row_number().over(verification_order) \
                                                       .cast(psDT.LongType()))

# Trim values of string
df_host_verifications_process = df_host_verifications_process.withColumn('verification', psF.trim('verification'))

# Print schema
print('Schema of table host verification')
df_host_verifications_process.printSchema()

Schema of hosts verifications
root
 |-- host_id: string (nullable = true)
 |-- host_verifications: string (nullable = true)

None
Schema of table host verification
root
 |-- host_id: long (nullable = true)
 |-- verification: string (nullable = true)
 |-- id: long (nullable = false)

None


In [19]:
### Export host verification to parquet

# Overwrite any previous export so the cell can be re-run
df_host_verifications_process.write.parquet(PARQUET_PATH + 'host_verification.parquet', mode='overwrite')

### Process listings amenities

In [20]:
### Explode amenities into different rows for final table

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of amenities')
df_listings_amenities.printSchema()

# Copy dataframe
df_amenities_process = df_listings_amenities.select('*')

# Rename id columns to listing_id
df_amenities_process = df_amenities_process.withColumnRenamed('id', 'listing_id')

# Change data types (the amenities string is parsed as a JSON array)
df_amenities_process = df_amenities_process.withColumn('listing_id',
                                                       psF.col('listing_id').cast(psDT.LongType()))
df_amenities_process = df_amenities_process.withColumn('amenities',
                                                       retrieve_array_func(psF.col('amenities')))

# Explode list into rows: one row per (listing, amenity)
df_amenities_process = df_amenities_process.select('listing_id',
                                                   psF.explode('amenities') \
                                                   .alias('amenity'))

# Add index column: consecutive 1-based row numbers in the current row order, stored as long.
# NOTE(review): a window without partitionBy makes Spark process all rows in a single partition.
amenity_order = Window.orderBy(psF.monotonically_increasing_id())
df_amenities_process = df_amenities_process.withColumn('id',
                                                       psF.row_number().over(amenity_order) \
                                                       .cast(psDT.LongType()))

# Trim values of string
df_amenities_process = df_amenities_process.withColumn('amenity', psF.trim('amenity'))

# Print schema
print('Schema of table listing amenities')
df_amenities_process.printSchema()
df_amenities_process.show()

Schema of amenities
root
 |-- id: string (nullable = true)
 |-- amenities: string (nullable = true)

None
Schema of table listing amenities
root
 |-- listing_id: long (nullable = true)
 |-- amenity: string (nullable = true)
 |-- id: long (nullable = false)

None
+----------+--------------------+---+
|listing_id|             amenity| id|
+----------+--------------------+---+
|     13913|           Hot water|  1|
|     13913|             Heating|  2|
|     13913|        Coffee maker|  3|
|     13913|      Building staff|  4|
|     13913|TV with standard ...|  5|
|     13913|Pack ’n play/Trav...|  6|
|     13913|Children’s books ...|  7|
|     13913|   Fire extinguisher|  8|
|     13913|                Iron|  9|
|     13913|Free parking on p...| 10|
|     13913|Lock on bedroom door| 11|
|     13913|Luggage dropoff a...| 12|
|     13913|               Dryer| 13|
|     13913|          Hair dryer| 14|
|     13913|Room-darkening sh...| 15|
|     13913|               Stove| 16|
|     13913|   

In [21]:
### Export listing amenities to parquet

# Overwrite any previous export so the cell can be re-run
df_amenities_process.write.parquet(PARQUET_PATH + 'listing_amenities.parquet', mode='overwrite')

### Process listings complements

In [44]:
### Give structure to listings complements table

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of listings complements')
df_listings_complements.printSchema()

# Define dict with datatypes (keyed by the column names after the id -> listing_id rename)
COMPLEMENTS_MAIN_DTPYES = {'listing_id': psDT.LongType(),
                          'minimum_minimum_nights': psDT.IntegerType(),
                          'maximum_minimum_nights': psDT.IntegerType(),
                          'minimum_maximum_nights': psDT.IntegerType(),
                          'maximum_maximum_nights': psDT.IntegerType(),
                          'minimum_nights_avg_ntm': psDT.DoubleType(),
                          'maximum_nights_avg_ntm': psDT.DoubleType(),
                          'has_availability': psDT.BooleanType(),
                          'availability_30': psDT.IntegerType(),
                          'availability_60': psDT.IntegerType(),
                          'availability_90': psDT.IntegerType(),
                          'availability_365': psDT.IntegerType(),
                          'calendar_last_scraped': psDT.DateType(),
                          'number_of_reviews': psDT.IntegerType(),
                          'number_of_reviews_ltm': psDT.IntegerType(),
                          'number_of_reviews_l30d': psDT.IntegerType(),
                          'first_review': psDT.DateType(),
                          'last_review': psDT.DateType(),
                          'review_scores_rating': psDT.DoubleType(),
                          'review_scores_accuracy': psDT.DoubleType(),
                          'review_scores_cleanliness': psDT.DoubleType(),
                          'review_scores_checkin': psDT.DoubleType(),
                          'review_scores_communication': psDT.DoubleType(),
                          'review_scores_location': psDT.DoubleType(),
                          'review_scores_value': psDT.DoubleType(),
                          'instant_bookable': psDT.BooleanType(),
                          'calculated_host_listings_count': psDT.IntegerType(),
                          'calculated_host_listings_count_entire_homes': psDT.IntegerType(),
                          'calculated_host_listings_count_private_rooms': psDT.IntegerType(),
                          'calculated_host_listings_count_shared_rooms': psDT.IntegerType(),
                          'reviews_per_month': psDT.DoubleType()}

# Copy dataframe
df_listings_complements_process = df_listings_complements.select('*')

# Rename id columns to listing_id
df_listings_complements_process = df_listings_complements_process.withColumnRenamed('id', 'listing_id')

# Transformations per column: trim, map placeholder strings to null, cast
for column in df_listings_complements_process.columns:

    # Strip whitespaces
    cleaned = psF.trim(psF.col(column))

    # Replace None and N/A with null
    cleaned = psF.when((cleaned == 'None') | (cleaned == 'N/A'), None).otherwise(cleaned)

    # Change data type
    if column in COMPLEMENTS_MAIN_DTPYES:
        cleaned = cleaned.cast(COMPLEMENTS_MAIN_DTPYES[column])

    df_listings_complements_process = df_listings_complements_process.withColumn(column, cleaned)

# Add index column: consecutive 1-based row numbers in the current row order, stored as long.
# NOTE(review): a window without partitionBy makes Spark process all rows in a single partition.
complements_order = Window.orderBy(psF.monotonically_increasing_id())
df_listings_complements_process = df_listings_complements_process.withColumn('id',
                                                                             psF.row_number().over(complements_order) \
                                                                             .cast(psDT.LongType()))

# Print schema
print('Schema of table listings complements')
df_listings_complements_process.printSchema()

Schema of listings complements
root
 |-- id: string (nullable = true)
 |-- minimum_minimum_nights: string (nullable = true)
 |-- maximum_minimum_nights: string (nullable = true)
 |-- minimum_maximum_nights: string (nullable = true)
 |-- maximum_maximum_nights: string (nullable = true)
 |-- minimum_nights_avg_ntm: string (nullable = true)
 |-- maximum_nights_avg_ntm: string (nullable = true)
 |-- has_availability: string (nullable = true)
 |-- availability_30: string (nullable = true)
 |-- availability_60: string (nullable = true)
 |-- availability_90: string (nullable = true)
 |-- availability_365: string (nullable = true)
 |-- calendar_last_scraped: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- number_of_reviews_ltm: string (nullable = true)
 |-- number_of_reviews_l30d: string (nullable = true)
 |-- first_review: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- review_scores_rating: string (nullable = true)
 |-- review_scores_ac

In [45]:
### Export listing complements to parquet

# Overwrite any previous export so the cell can be re-run
df_listings_complements_process.write.parquet(PARQUET_PATH + 'listing_complements.parquet', mode='overwrite')

## Reviews

In [46]:
### Load reviews to Spark dataframe

# All columns are read as strings (no schema inference); typing happens later.
# multiline lets a quoted field (e.g. a review comment) span several lines,
# escape='"' makes a doubled quote inside a quoted field a literal quote.
df_reviews = spark.read.options(delimiter = ',',
                                header = True,
                                escape = '"',
                                multiline = True).csv(CSV_PATH + 'reviews.csv')

# Print schema (printSchema() prints by itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output)
print('Schema of dataframe reviews')
df_reviews.printSchema()

Schema of dataframe reviews
root
 |-- listing_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- reviewer_id: string (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)

None


In [54]:
### Give structure to reviews table

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of reviews')
df_reviews.printSchema()

# Define dict with datatypes (columns not listed stay strings)
REVIEWS_MAIN_DTPYES = {'listing_id': psDT.LongType(),
                      'id': psDT.LongType(),
                      'date': psDT.DateType(),
                      'reviewer_id': psDT.LongType()}

# Copy dataframe
df_reviews_process = df_reviews.select('*')

# Transformations per column: trim, map placeholder strings to null, cast
for column in df_reviews_process.columns:

    # Strip whitespaces
    cleaned = psF.trim(psF.col(column))

    # Replace None and N/A with null
    cleaned = psF.when((cleaned == 'None') | (cleaned == 'N/A'), None).otherwise(cleaned)

    # Change data type
    if column in REVIEWS_MAIN_DTPYES:
        cleaned = cleaned.cast(REVIEWS_MAIN_DTPYES[column])

    df_reviews_process = df_reviews_process.withColumn(column, cleaned)

# Print schema (label fixed: this is the reviews table, not listings complements)
print('Schema of table reviews')
df_reviews_process.printSchema()

Schema of reviews
root
 |-- listing_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- reviewer_id: string (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)

None
Schema of table listings complements
root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: long (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)

None


In [56]:
### Export reviews to parquet

# Overwrite any previous export so the cell can be re-run
df_reviews_process.write.parquet(PARQUET_PATH + 'reviews.parquet', mode='overwrite')

## Calendar

In [57]:
### Load calendar to Spark dataframe

# All columns are read as strings (no schema inference); typing happens later.
# multiline lets a quoted field span several lines, escape='"' makes a doubled
# quote inside a quoted field a literal quote.
df_calendar = spark.read.options(delimiter = ',',
                                 header = True,
                                 escape = '"',
                                 multiline = True).csv(CSV_PATH + 'calendar.csv')

# Print schema (printSchema() prints by itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output)
print('Schema of dataframe calendar')
df_calendar.printSchema()

Schema of dataframe calendar
root
 |-- listing_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- maximum_nights: string (nullable = true)

None


In [70]:
### Give structure to calendar

# printSchema() prints by itself and returns None, so it is not wrapped in print()
print('Schema of calendar')
df_calendar.printSchema()

# Define dict with datatypes
CALENDAR_MAIN_DTPYES = {'listing_id': psDT.LongType(),
                       'date': psDT.DateType(),
                       'available': psDT.BooleanType(),
                       'price': psDT.DoubleType(),
                       'adjusted_price': psDT.DoubleType(),
                       'minimum_nights': psDT.IntegerType(),
                       'maximum_nights': psDT.IntegerType()}

# Copy dataframe
df_calendar_process = df_calendar.select('*')

# Drop currency formatting from price and adjusted_price: both the $ sign and the
# thousands separator. Removing only '$' leaves values such as '1,200.00', which
# become null when cast to double.
for price_column in ('price', 'adjusted_price'):
    df_calendar_process = df_calendar_process.withColumn(price_column,
                                                         psF.regexp_replace(price_column, r'[$,]', ''))

# Transformations per column: trim, map placeholder strings to null, cast
for column in df_calendar_process.columns:

    # Strip whitespaces
    cleaned = psF.trim(psF.col(column))

    # Replace None and N/A with null
    cleaned = psF.when((cleaned == 'None') | (cleaned == 'N/A'), None).otherwise(cleaned)

    # Change data type
    if column in CALENDAR_MAIN_DTPYES:
        cleaned = cleaned.cast(CALENDAR_MAIN_DTPYES[column])

    df_calendar_process = df_calendar_process.withColumn(column, cleaned)

# Add index column: consecutive 1-based row numbers in the current row order, stored as long.
# NOTE(review): a window without partitionBy makes Spark process all rows in a single
# partition - this is the largest table of the notebook, so watch memory and runtime here.
calendar_order = Window.orderBy(psF.monotonically_increasing_id())
df_calendar_process = df_calendar_process.withColumn('id',
                                                     psF.row_number().over(calendar_order) \
                                                     .cast(psDT.LongType()))

# Print schema
print('Schema of table calendar')
df_calendar_process.printSchema()
df_calendar_process.show(5)


Schema of calendar
root
 |-- listing_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- maximum_nights: string (nullable = true)

None
Schema of table calendar
root
 |-- listing_id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- available: boolean (nullable = true)
 |-- price: double (nullable = true)
 |-- adjusted_price: double (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- maximum_nights: integer (nullable = true)
 |-- id: long (nullable = false)

None
+----------+----------+---------+-----+--------------+--------------+--------------+---+
|listing_id|      date|available|price|adjusted_price|minimum_nights|maximum_nights| id|
+----------+----------+---------+-----+--------------+--------------+--------------+---+
|    182802|2021-12-09|     true| 55.0|        

In [71]:
### Export calendar to parquet

# Overwrite any previous export so the cell can be re-run
df_calendar_process.write.parquet(PARQUET_PATH + 'calendar.parquet', mode='overwrite')