# ETL to transform Airbnb raw data into the database schema

This ETL is written in PySpark. Although the current data volume could be handled with pandas or a similar library, PySpark was chosen so that the pipeline scales to larger datasets.

TODO: outline steps here

## Initialisation

In [1]:
### Import modules

import os
import warnings

# Silence library warnings up front so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# findspark.init() has to run before the first pyspark import (it is what makes
# pyspark importable when it is not already on sys.path), so keep this order.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession, SQLContext

# Location of the PostgreSQL JDBC driver jar. The default is the path this
# notebook has always used; set the POSTGRES_JDBC_JAR environment variable to
# run the notebook on another machine without editing this cell.
JDBC_JAR_PATH = os.environ.get('POSTGRES_JDBC_JAR',
                               '/project/Practical_exam/postgresql-42.3.2.jar')

# Create (or reuse) the Spark session with the JDBC driver on the driver class path.
spark = (
    SparkSession.builder
    .appName('SparkWithPostgres')
    .config("spark.driver.extraClassPath", JDBC_JAR_PATH)
    .getOrCreate()
)
print('Spark session:')
print(spark)

Spark session:
<pyspark.sql.session.SparkSession object at 0x7fe9328a0040>


In [2]:
### Initialise Spark context

# Fetch the active SparkContext (or start one) and report it, followed by a
# blank separator line.
sc = pyspark.SparkContext.getOrCreate()
print('Initialisation of Spark context', sc, '', sep='\n')

# Wrap the context in a SQLContext and report it.
# NOTE(review): SQLContext appears to be deprecated in Spark 3.x in favour of
# SparkSession -- confirm before relying on it in new code.
sqlContext = SQLContext(sc)
print('Initialisation of SparkSQL', sqlContext, sep='\n')

# Restrict Spark logging to errors, via both the session's context and `sc`.
for _ctx in (spark.sparkContext, sc):
    _ctx.setLogLevel("ERROR")

Initialisation of Spark context
<SparkContext master=local[*] appName=SparkWithPostgres>

Initialisation of SparkSQL
<pyspark.sql.context.SQLContext object at 0x7fe9328a08e0>


In [3]:
### Constants

# Directory holding the original CSV files; 'listings.csv' is read from here.
CSV_PATH = './OriginalData_csv/'

# Directory for the parquet data.
# NOTE(review): presumably the schema-ready tables are written here -- the
# write step is outside this part of the notebook, confirm.
PARQUET_PATH = './SchemaReadyData_parquet/'

## Listings

In [100]:
### Load listings to Spark dataframe

# Read listings.csv with the first row as header. No schema is given or
# inferred, so every column is loaded as a string (see the printed schema).
#   - escape '"'  : the double quote is the escape character inside quoted fields
#   - multiline   : enabled so that a quoted field may span several lines
# lineSep is deliberately left at the reader's default.
df_listings = (
    spark.read
    .option('delimiter', ',')
    .option('header', True)
    .option('escape', '"')
    .option('multiline', True)
    .csv(CSV_PATH + 'listings.csv')
)

# Print schema
df_listings.printSchema()

root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: string (nullable = true)
 |-- host

In [101]:
### Drop empty columns (for more insight, consult EDA notebook)

# Columns that carry no data in this extract (established in the EDA notebook).
COLUMNS_TO_DROP = (
    'neighbourhood_group_cleansed',
    'bathrooms',
    'calendar_updated',
    'license',
)

# DataFrame.drop takes the column names as separate arguments, hence the unpacking.
df_listings = df_listings.drop(*COLUMNS_TO_DROP)

In [102]:
### Create 3 tables from listings df

print('Columns on original file:', len(df_listings.columns))

# Columns describing the listing itself.
COLS_LISTINGS = [
    'id',
    'listing_url',
    'scrape_id',
    'last_scraped',
    'name',
    'description',
    'neighborhood_overview',
    'picture_url',
    'neighbourhood',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'amenities',
    'price',
    'minimum_nights',
    'maximum_nights',
]

# Columns describing the host; the listing 'id' is kept alongside 'host_id'.
COLS_HOSTS = [
    'id',
    'host_id',
    'host_url',
    'host_name',
    'host_since',
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
]

# Remaining per-listing columns: night limits, availability, review statistics
# and calculated host listing counts.
COLS_COMPLENET = [
    'id',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'has_availability',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'calendar_last_scraped',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'first_review',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'reviews_per_month',
]

# Sanity check: 'id' appears in all three lists, so subtract the 2 additional
# copies before comparing with the column count printed above.
print('Columns after the split:',
      sum(map(len, (COLS_LISTINGS, COLS_HOSTS, COLS_COMPLENET))) - 2)

# Create dataframes
df_listings_main = df_listings.select(*COLS_LISTINGS)
df_hosts = df_listings.select(*COLS_HOSTS)
df_listings_complements = df_listings.select(*COLS_COMPLENET)

Columns on original file: 70
Columns after the split: 70


### Process listings main

In [104]:
### Give structure to listings table

# Show the current column types of the main listings table: every column is
# still a string at this point. A bare `.first()` call used to precede this
# line; its result was neither displayed nor assigned, so it only triggered a
# Spark job and has been removed.
df_listings_main.printSchema()

root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- neighbourhood_cleansed: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: string (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- beds: string (nullable = true)
 |-- amenities: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- maximum_nights: string (nullable = true)

