In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, avg,udf, month,max, datediff, lit, to_date,count, rank,regexp_replace
from pyspark.sql.window import Window
from pyspark.sql.types import StringType,StructField,StructType,IntegerType

In [3]:
# Build (or reuse) the notebook's SparkSession.
# The PostgreSQL JDBC driver jar is registered through `spark.jars`.
spark = (
    SparkSession.builder
    .appName("Day_6")
    .config("spark.jars", "/usr/lib/jvm/java-11-openjdk-amd64/lib/postgresql-42.6.0.jar")
    .getOrCreate()
)

23/09/08 17:03:32 WARN Utils: Your hostname, ubuntu-Lenovo-Legion-5-15ARH05 resolves to a loopback address: 127.0.1.1; using 172.16.5.112 instead (on interface wlp4s0)
23/09/08 17:03:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/09/08 17:03:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/08 17:03:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


**Extract the data**

In [4]:
# Read the three tab-separated raw files into DataFrames.
# All files share the same reader settings: first row is the header,
# column types are inferred, and fields are separated by a tab.
tsv_read_options = dict(header=True, inferSchema=True, sep="\t")

listing_df_raw = spark.read.csv('raw_data/listings.tsv', **tsv_read_options)
reviews_df_raw = spark.read.csv('raw_data/reviews.tsv', **tsv_read_options)
calendar_df_raw = spark.read.csv('raw_data/calendar.tsv', **tsv_read_options)

                                                                                

**Transform the data**

In [5]:
# Transform the data

# LISTINGS DATA
#
# One chained pipeline, applied in this order:
#   1. drop the free-text columns 'summary', 'description' and 'host_about'
#   2. turn 'host_is_superhost' into a boolean ('t' -> True, anything else,
#      including null, -> False)
#   3. drop the 'country' and 'market' columns
#   4. discard rows whose 'space' value is null
listing_df_raw = (
    listing_df_raw
    .drop('summary', 'description', 'host_about')
    .withColumn(
        'host_is_superhost',
        when(col('host_is_superhost') == 't', True).otherwise(False),
    )
    .drop('country', 'market')
    .dropna(subset=['space'])
)

In [6]:
# Remove commas from every listings column in a single projection.
#
# A single select() is used rather than one withColumn() call per column:
# each withColumn() adds its own projection to the query plan, and Spark's
# documentation warns that doing so in a loop can produce very large plans.
#
# NOTE: regexp_replace() yields a string column, so after this step every
# column of listing_df_raw is string-typed (numeric and boolean columns are
# implicitly cast to their string form). Column names and order are kept.
listing_df_raw = listing_df_raw.select(
    [
        regexp_replace(col(column), ',', '').alias(column)
        for column in listing_df_raw.columns
    ]
)


In [7]:
# CALENDAR DATA

# Convert 'available' to boolean: 't' -> True, anything else (including null) -> False
calendar_df_raw = calendar_df_raw.withColumn('available', when(col('available') == 't', True).otherwise(False))

# Listings price: strip "$" and any other non-numeric characters, then convert
# to integer. The decimal point is kept and the value goes through double first,
# so a price such as "$85.00" becomes 85 rather than 8500 (which is what
# stripping the "." as well would produce). The fractional part is truncated.
# NOTE(review): this cleans the *listings* price although it sits in the
# calendar section; the calendar price stays a string column (see below).
listing_df_raw = listing_df_raw.withColumn(
    "price",
    regexp_replace(col("price"), "[^0-9.]", "").cast("double").cast(IntegerType()),
)

# Fill the calendar price column with "booked" on days that are not available;
# available days keep their original price value.
calendar_df_raw = calendar_df_raw.withColumn("price", when(col("available") == False, "booked").otherwise(col("price")))


In [8]:
# Save data locally
#
# Each cleaned frame is collapsed to one partition (so a single part file is
# written) and stored as snappy-compressed parquet, replacing any previous output.
parquet_outputs = [
    (listing_df_raw, 'cleaned_data/clean_listing'),
    (calendar_df_raw, 'cleaned_data/clean_calendar'),
    (reviews_df_raw, 'cleaned_data/clean_reviews'),
]

for frame, output_dir in parquet_outputs:
    frame.coalesce(1).write.parquet(output_dir, mode="overwrite", compression="snappy")

# Alternative kept for reference (multi-file output for reviews):
# reviews_df_raw.repartition(10).write.parquet('cleaned_data/clean_reviews',  mode="overwrite")

                                                                                

In [23]:
# Define the JDBC connection properties.
# Credentials are taken from the environment (PGUSER / PGPASSWORD) so they do
# not have to live in the notebook; the local-development values are used as
# defaults when the variables are not set.
jdbc_url = "jdbc:postgresql://localhost:5432/spark_project"
properties = {
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "postgres"),
    "driver": "org.postgresql.Driver"
}

# Append each DataFrame to its PostgreSQL table.
# NOTE: mode="append" adds rows on every execution, so re-running this cell
# against the same database inserts the data again.
jdbc_targets = [
    (listing_df_raw, 'listing_table'),
    (calendar_df_raw, 'calendar_table'),
    (reviews_df_raw, 'reviews_table'),
]

for frame, table_name in jdbc_targets:
    frame.write.jdbc(url=jdbc_url, table=table_name, mode="append", properties=properties)

                                                                                