# Imports & Configuration

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create (or reuse) the Spark session for this notebook.
# All Spark settings live in one table so they are easy to review and tune.
spark_settings = {
    "spark.memory.offHeap.enabled": "true",
    "spark.driver.cores": "4",
    "spark.driver.maxResultSize": "4g",
    "spark.memory.offHeap.size": "4g",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
}

session_builder = SparkSession.builder.appName("Data_Import")
for conf_key, conf_value in spark_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

23/07/06 16:39:02 WARN Utils: Your hostname, nuno-g14 resolves to a loopback address: 127.0.1.1; using 192.168.1.103 instead (on interface wlp2s0)
23/07/06 16:39:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/06 16:39:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/06 16:39:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/07/06 16:39:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/07/06 16:39:03 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/07/06 16:39:03 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


# Data Reading

In [4]:
# Read the SEF Parquet dataset into a Spark DataFrame.
# The path is relative to the notebook's working directory.
source_path = "../../MSc_Model_Datasets/Data_SEF.parquet"
frame = spark.read.parquet(source_path)

In [5]:
# Show five rows as a quick sanity check of the loaded data.
preview_rows = frame.limit(5)
preview_rows.show()

+--------+-----------------+-----+---------+-------------+--------------------+--------------------+-----+-----------+-------------+-----------+
|     rsn|Unidade_Hoteleira|  CAE|Tipologia|Classificacao|        Data_Checkin|       Data_Checkout|Idade|Concelho_UH|Nacionalidade|Pais_Origem|
+--------+-----------------+-----+---------+-------------+--------------------+--------------------+-----+-----------+-------------+-----------+
|55561514|             1487|55111|    Hotel|   4 estrelas|2016-11-30T00:00:...|2016-12-01T00:00:...|   35| Matosinhos|  REINO UNIDO|REINO UNIDO|
|55561515|             1487|55111|    Hotel|   4 estrelas|2016-11-30T00:00:...|2016-12-01T00:00:...|   32| Matosinhos|  REINO UNIDO|REINO UNIDO|
|56150801|              645|55121|    Hotel|   4 estrelas|2017-01-08T00:00:...|2017-01-08T00:00:...|   50|     Lisboa|  REINO UNIDO|REINO UNIDO|
|55957037|             1116|55111|    Hotel|   4 estrelas|2016-12-28T00:00:...|2016-12-29T00:00:...|   60|       Faro|  REINO UNID

# Data Cleaning & Exploration

## Remove Duplicates

In [6]:
# Remove duplicated records in two passes and report how many rows each pass drops.
# Every count() launches a full Spark job over the dataset, so each count is
# computed exactly once and then reused.

# Pass 1: general duplicates (rows whose values are equal in all columns).
initial_rows = frame.count()

print("Initial number of rows: " + str(initial_rows))

frame = frame.distinct()
after_general_drop_rows = frame.count()

print("Number of rows after removing general duplicates: " + str(after_general_drop_rows))
print("Number of rows droped: " + str(initial_rows - after_general_drop_rows))

# Pass 2: rsn duplicates (rows sharing the same rsn value).
# The starting size of this pass is the count computed just above; calling
# frame.count() again would re-run the whole distinct() job for the same number.
initial_rows = after_general_drop_rows
frame = frame.dropDuplicates(['rsn'])
after_rsn_drop_rows = frame.count()

print("Number of rows after removing rsn duplicates: " + str(after_rsn_drop_rows))
print("Number of rows droped: " + str(initial_rows - after_rsn_drop_rows))

Initial number of rows: 61911504


ERROR:root:KeyboardInterrupt while sending command.               (6 + 11) / 17]
Traceback (most recent call last):
  File "/home/nuno/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/nuno/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

[Stage 7:>                                                        (0 + 16) / 40]

## Dropping Null Values

In [None]:
# Drop every row that has a null in at least one of the columns listed here.
# Classificacao is not in the list, so rows with a null Classificacao are kept.
required_columns = [
    "rsn",
    "Unidade_Hoteleira",
    "CAE",
    "Tipologia",
    "Data_Checkin",
    "Data_Checkout",
    "Idade",
    "Concelho_UH",
    "Nacionalidade",
    "Pais_Origem",
]
frame = frame.na.drop(subset=required_columns)

## Dealing with strange Values

In [None]:
# Convert the check-in and check-out columns to Spark timestamps, in place.
for date_column in ('Data_Checkin', 'Data_Checkout'):
    frame = frame.withColumn(date_column, to_timestamp(date_column))

In [None]:
# Keep only the rows whose Idade is at least 1.
age_is_valid = col("Idade") >= 1
frame = frame.filter(age_is_valid)

In [None]:
# Length of stay in days: check-out date minus check-in date.
nights_stayed = datediff(col("Data_Checkout"), col("Data_Checkin"))
frame = frame.withColumn("Number_of_Nights", nights_stayed)

In [None]:
# Calendar year of the check-in, used by the year-range filter further down.
checkin_year = year(col("Data_Checkin"))
frame = frame.withColumn("Year", checkin_year)

In [None]:
# Keep stays of 1 to 99 nights whose check-in year lies in 2014 to 2018.
# Column.between() includes both bounds, matching the original >= / <= pairs.
stay_length_ok = col("Number_of_Nights").between(1, 99)
checkin_year_ok = col("Year").between(2014, 2018)
frame = frame.where(stay_length_ok).where(checkin_year_ok)

In [None]:
# Select the PyTorch device: the first CUDA GPU when one is available, else the CPU.
# torch is imported in this cell because no earlier cell of the notebook imports it;
# without the import this cell fails with a NameError on a fresh kernel.
import torch

dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(dev)

In [None]:
# Report how many rows were removed since the rsn de-duplication step.
# The baseline is the count taken right after that step, so the difference
# also includes the rows removed by the null drop, not only the value filters.
initial_rows = after_rsn_drop_rows
after_strange_drop_rows = frame.count()
rows_removed = initial_rows - after_strange_drop_rows

print(f"Initial number of rows: {initial_rows}")
print(f"Number of rows after removing strange values: {after_strange_drop_rows}")
print(f"Number of rows droped: {rows_removed}")

## Dealing with Schema

In [None]:
# Replace both timestamp columns with their Unix-time representation (seconds).
for date_column in ('Data_Checkin', 'Data_Checkout'):
    frame = frame.withColumn(date_column, unix_timestamp(date_column))

In [None]:
# Cast the identifier and numeric columns to their final numeric types.
frame = frame.withColumn("rsn", frame.rsn.cast('long'))
frame = frame.withColumn("Unidade_Hoteleira", frame.Unidade_Hoteleira.cast('double'))
frame = frame.withColumn("CAE", frame.CAE.cast('double'))
frame = frame.withColumn("Idade", frame.Idade.cast('double'))

# Strip the star-rating suffix so that only the number is left, then cast it.
# The trailing "s" is optional so that a singular " estrela" suffix is removed
# as well as the plural " estrelas"; a value that keeps its suffix cannot be
# parsed as a number by the cast below.
# NOTE(review): assumes the singular form (e.g. "1 estrela") can occur in the
# data - confirm against the distinct values of Classificacao.
frame = frame.withColumn("Classificacao", regexp_replace(col("Classificacao"), " estrelas?", ""))
frame = frame.withColumn("Classificacao", frame.Classificacao.cast('double'))

# Year is a helper column used by the year-range filter; drop it from the output.
frame = frame.drop("Year")

In [None]:
# Print the schema to check the column types of the frame after the casts.
frame.printSchema()

In [None]:
# Show five rows of the frame in its final form.
preview_rows = frame.limit(5)
preview_rows.show()

# Data Writing

In [None]:
# Write the resulting frame as Parquet into the same dataset directory the input
# was read from. The input path is "../../MSc_Model_Datasets/...", while this
# cell previously wrote to "../MSc_Model_Datasets/...", a different directory
# one level off.
# NOTE(review): confirm that the input directory is the intended output location.
frame.write.parquet("../../MSc_Model_Datasets/Data_SEF_Clean.parquet")