In [223]:
import os
import sys
from pyspark.sql.functions import date_format , split , col, unix_timestamp, from_unixtime, expr  ,when
from pyspark.sql import SparkSession
from hdfs import InsecureClient
import pandas as pd 


In [224]:

def get_hdfs_csv_files(client, folder_path):
    csv_files = {}
    # Listing files in the folder
    file_status = client.list(folder_path)
    # Iterating through files
    for file_or_folder_name in file_status:
        file_or_folder_full_path = folder_path + '/' + file_or_folder_name
        if client.status(file_or_folder_full_path)['type'] == 'DIRECTORY':
            # If it's a directory, recursively call the function
            csv_files.update(get_hdfs_csv_files(client, file_or_folder_full_path))
        elif file_or_folder_name.endswith('.csv'):
            # If it's a CSV file, add it to the dictionary along with its name
            csv_files[file_or_folder_name] = file_or_folder_full_path
    return csv_files


def add_filename_column(df, filename):
    df['Filename'] = filename
    return df

def process_csv_files(csv_files_dict, hdfs_client):
    dfs_with_filenames = []
    for filename, filepath in csv_files_dict.items():
        with hdfs_client.read(filepath) as reader:
            try:
                df = pd.read_csv(reader)
                if not df.empty:
                    df = add_filename_column(df, filename)
                    dfs_with_filenames.append(df) 
                else:
                    print(f"Empty DataFrame found in file: {filename}")
            except pd.errors.EmptyDataError:
                print(f"Empty file: {filename}")
            except pd.errors.ParserError as e:
                print(f"Error parsing file {filename}: {e}")
    concatenated_df = pd.concat(dfs_with_filenames, ignore_index=True)
    return concatenated_df


# Arrival data 


In [None]:
# Example usage:
hdfs_url = 'http://localhost:50070'
folder_path = '/user/PFE_data/arrival_flights'  # Adjust this path accordingly

# Create an HDFS client
client = InsecureClient(hdfs_url)

# Get all CSV files from the specified folder and its subfolders
csv_files_dict = get_hdfs_csv_files(client, folder_path)
print()
# Process CSV files: open each CSV file, add a filename column, and store the DataFrame in a dictionary
dfs_arrival = process_csv_files(csv_files_dict, client)

dfs_arrival


Empty DataFrame found in file: Anguilla2024-03-13.csv
Empty DataFrame found in file: Antarctica2024-03-19.csv


In [5]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read from Hadoop") \
    .getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
#spark.conf.set("spark.memory.offHeap.enabled","true") 
#spark.conf.set("spark.memory.offHeap.size","10g")

# Set configurations

arrival_data = spark.createDataFrame(dfs_arrival)


In [6]:
arrival_data.show()

+-----------------+-----------+--------------------+--------------------+------+--------------+-------------+------------+--------------------+--------------------+
|             Date|Flight Time|            Aircraft|      Origin Airport|flight|Aircraft Model|Flight Status|time_arrival|Destination Aeroport|            Filename|
+-----------------+-----------+--------------------+--------------------+------+--------------+-------------+------------+--------------------+--------------------+
|   Sunday, Mar 17|      13:55|TAAG Angola Airlines|Luanda Quatro de ...| DT122|          B737|      Unknown|     Unknown|     Cabinda Airport|Angola2024-03-18.csv|
|   Sunday, Mar 17|      17:45|TAAG Angola Airlines|Luanda Quatro de ...| DT124|          B737|       Landed|       17:40|     Cabinda Airport|Angola2024-03-18.csv|
|   Sunday, Mar 17|      18:45|TAAG Angola Airlines|Luanda Quatro de ...| DT126|          B737|      Unknown|     Unknown|     Cabinda Airport|Angola2024-03-18.csv|
|   Monday

In [161]:
from pyspark.sql.functions import when, create_map, lit, split, trim, regexp_extract, substring
from itertools import chain  # Import chain function from itertools module

def process_arrival_data(spark_df):
    # Drop rows with null values
    df = spark_df.na.drop()

    # Split the "date" column into "day", "month", and "month_number" columns
    df = df.withColumn("day", split(df["date"], ",")[0])
    df = df.withColumn("month", trim(split(df["date"], ",")[1]))
    df = df.withColumn("day_number", trim(split(df["date"], " ")[2]))
    df = df.withColumn('Origin Country', split(df['Origin Airport'], ',')[1])

    # Define a mapping dictionary for month names
    month_mapping = {
        "Jan": "January", "Feb": "February", "Mar": "March", "Apr": "April", 
        "May": "May", "Jun": "June", "Jul": "July", "Aug": "August", 
        "Sep": "September", "Oct": "October", "Nov": "November", "Dec": "December"
    }

    # Convert the dictionary to a map
    mapping_expr = create_map([lit(x) for x in chain(*month_mapping.items())])

    # Extract the first character of the "month" column
    first_word_month = split(col("month"), " ")[0]

    # Apply the mapping to the first character
    df = df.withColumn("complete_month", mapping_expr.getItem(first_word_month))

    # Drop the original "month" column
    df = df.drop("month")

    # Remove duplicate rows
    df = df.dropDuplicates()

    # Extract "date_extraction" column based on conditions
    df = df.withColumn("date_extraction", 
                       when(df["filename"].rlike("\\d"), 
                            substring(df["filename"], -14, 10))
                       .otherwise(None))

    # Rename the "Aircraft" column to "Airline"
    df = df.withColumnRenamed("Aircraft", "Airline")
    df = df.withColumnRenamed("Destination Aeroport", "Destination Airport")


    # Add a new column "Airport country" based on filename content
    df = df.withColumn("Airport country",
                       when(df["Filename"].contains("_"),
                            split(col("Filename"), "_")[0])
                       .otherwise(regexp_extract(col("Filename"), "^(.*?)(?=[0-9])", 1)))
    df = df.drop("Filename")
    
    # Add a column "year" with constant value 2024
    df = df.withColumn("year", lit(2024))
    df = df.withColumn("Flight Time2", unix_timestamp(col("Flight Time"), "HH:mm").cast(TimestampType()))

    df = df.withColumn("time_arrival2", 
                       when(col("time_arrival").rlike("\\d"), 
                            unix_timestamp(col("time_arrival"), "HH:mm").cast(TimestampType()))
                       .otherwise(None))
    df = df.withColumn("delay", 
                               when(col("time_arrival2").isNotNull(), 
                                    expr("time_arrival2 - `Flight Time2`"))
                               .when(col("time_arrival") == "Scheduled", "Scheduled")
                               .when(col("time_arrival") == "Unknown", "Unknown")
                               .when(col("time_arrival") == "Canceled", "Canceled")
                               .otherwise("No"))
    df = df.withColumn("delay",
                               expr("CASE WHEN delay LIKE 'INTERVAL%' THEN regexp_extract(delay, '([0-9]+:[0-9]+:[0-9]+)', 1) ELSE delay END"))
    df =  df.drop("time_arrival2")
    df =  df.drop("Flight Time2")


    return df



In [162]:
# Apply the function to your DataFrame
arrival_data_processed = process_arrival_data(arrival_data)

arrival_data_processed.toPandas()



Unnamed: 0,Date,Flight Time,Airline,Origin Airport,flight,Aircraft Model,Flight Status,time_arrival,Destination Airport,day,day_number,Origin Country,complete_month,date_extraction,Airport country,year,delay
0,"Wednesday, Mar 20",01:05,IndiGo,"Delhi Indira Gandhi International Airport, India",6E1803,A20N,Scheduled,Scheduled,Baku Heydar Aliyev International Airport,Wednesday,20,India,March,2024-03-18,Azerbaijan,2024,Scheduled
1,"Monday, Mar 04",15:25,IBC Airways,"Nassau Lynden Pindling International Airport, ...",II107,SF34,Landed,16:26,Freeport Grand Bahama International Airport,Monday,04,Bahamas,March,,Bahamas,2024,01:01:00
2,"Monday, Mar 04",15:45,Bahamasair,Fort Lauderdale Hollywood International Airpor...,UP161,AT46,Landed,17:11,George Town Exuma International Airport,Monday,04,United States,March,,Bahamas,2024,01:26:00
3,"Wednesday, Mar 13",17:50,Novoair,"Dhaka Shahjalal International Airport, Bangladesh",VQ935,AT7,Unknown,Unknown,Cox's Bazar Airport,Wednesday,13,Bangladesh,March,2024-03-14,Bangladesh,2024,Unknown
4,"Wednesday, Mar 13",16:00,Biman Bangladesh Airlines,"Dhaka Shahjalal International Airport, Bangladesh",BG603,DH8D,Landed,15:54,Sylhet Osmani International Airport,Wednesday,13,Bangladesh,March,2024-03-14,Bangladesh,2024,00:06:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47040,"Tuesday, Mar 05",15:00,Vietnam Airlines,"Hanoi Noi Bai International Airport, Vietnam",VN1559,A21N,Landed,15:32,Nha Trang Cam Ranh International Airport,Tuesday,05,Vietnam,March,,Vietnam,2024,00:32:00
47041,"Tuesday, Mar 05",07:50,VietJet Air,"Ho Chi Minh City International Airport, Vietnam",VJ240,A321,Landed,09:30,Sao Vang Tho Xuan Airport,Tuesday,05,Vietnam,March,,Vietnam,2024,01:40:00
47042,"Thursday, Mar 07",07:50,VietJet Air,"Ho Chi Minh City International Airport, Vietnam",VJ240,320,Scheduled,Scheduled,Sao Vang Tho Xuan Airport,Thursday,07,Vietnam,March,,Vietnam,2024,Scheduled
47043,"Thursday, Mar 07",12:00,VietJet Air,"Ho Chi Minh City International Airport, Vietnam",VJ244,321,Scheduled,Scheduled,Sao Vang Tho Xuan Airport,Thursday,07,Vietnam,March,,Vietnam,2024,Scheduled


# Departure data


In [52]:
# Example usage:
hdfs_url = 'http://localhost:50070'
folder_path = '/user/PFE_data/departure_flights'  # Adjust this path accordingly

# Create an HDFS client
client = InsecureClient(hdfs_url)

# Get all CSV files from the specified folder and its subfolders
csv_files_dict = get_hdfs_csv_files(client, folder_path)
print()
# Process CSV files: open each CSV file, add a filename column, and store the DataFrame in a dictionary
dfs_departure = process_csv_files(csv_files_dict, client)

dfs_departure


Error parsing file Cayman-Islands2024-03-07.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 20

Error parsing file Congo2024-03-07.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 20

Error parsing file Dominican-Republic2024-03-07.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 20

Empty DataFrame found in file: Montenegro2024-03-12.csv
Empty file: Solomon-Islands2024-03-14.csv
Empty file: Tajikistan2024-03-14.csv
Empty file: Timor-leste-east-Timor2024-03-14.csv
Empty file: Trinidad-And-Tobago2024-03-14.csv


Unnamed: 0,Date,Flight Time,Airline,Destination Aeroport,flight,Aircraft Model,Flight Status,time_arrival,Original Aeroport,Filename,Aircraft
0,"Sunday, Mar 10",23:05,Wizz Air,"London Luton Airport, United Kingdom",W94472,A21N,Departed,03:14,Tirana International Airport,Albania2024-03-12.csv,
1,"Monday, Mar 11",01:55,Pegasus,"Istanbul Sabiha Gokcen International Airport, ...",PC282,A21N,Departed,01:57,Tirana International Airport,Albania2024-03-12.csv,
2,"Monday, Mar 11",04:00,AirSERBIA,"Belgrade Nikola Tesla Airport, Serbia",JU217,AT76,Departed,04:01,Tirana International Airport,Albania2024-03-12.csv,
3,"Monday, Mar 11",04:55,Austrian Airlines,"Vienna International Airport, Austria",OS850,A320,Departed,04:55,Tirana International Airport,Albania2024-03-12.csv,
4,"Monday, Mar 11",06:00,Wizz Air,"Dortmund Airport, Germany",W43841,A320,Departed,06:02,Tirana International Airport,Albania2024-03-12.csv,
...,...,...,...,...,...,...,...,...,...,...,...
33962,"Saturday, Mar 09",13:10,Air Tanzania,"Lubumbashi International Airport, Democratic R...",TC213,223,Scheduled,Scheduled,Ndola Simon Mwansa Kapwepwe International Airport,Zambia2024-03-08.csv,
33963,"Saturday, Mar 09",13:15,Airlink,"Johannesburg OR Tambo International Airport, S...",4Z151,E135,Scheduled,Scheduled,Ndola Simon Mwansa Kapwepwe International Airport,Zambia2024-03-08.csv,
33964,"Saturday, Mar 09",13:45,Proflight Zambia,"Lusaka Kenneth Kaunda International Airport, Z...",P0335,J41,Scheduled,Scheduled,Ndola Simon Mwansa Kapwepwe International Airport,Zambia2024-03-08.csv,
33965,"Saturday, Mar 09",18:15,Proflight Zambia,"Lusaka Kenneth Kaunda International Airport, Z...",P0309,J41,Scheduled,Scheduled,Ndola Simon Mwansa Kapwepwe International Airport,Zambia2024-03-08.csv,


In [139]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read from Hadoop") \
    .getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
#spark.conf.set("spark.memory.offHeap.enabled","true") 
#spark.conf.set("spark.memory.offHeap.size","10g")

# Set configurations

departure_data = spark.createDataFrame(dfs_departure)


  Expected bytes, got a 'int' object
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


In [156]:
from pyspark.sql.functions import split, trim, regexp_extract, create_map, lit, col, when , date_format
from itertools import chain
from pyspark.sql.functions import expr, when, col, unix_timestamp
from pyspark.sql.types import TimestampType

def process_departure_data(departure_data):
    # Remove rows with null values
    df = departure_data.na.drop()

    # Split the "date" column into "day", "month", and "month_number" columns
    df = df.withColumn("day", split(df["date"], ",")[0])
    df = df.withColumn("month", trim(split(df["date"], ",")[1]))
    df = df.withColumn("day_number", trim(split(df["date"], " ")[2]))
    df = df.drop("Aircraft")

    # Define a mapping dictionary for month names
    month_mapping = {
        "Jan": "January", "Feb": "February", "Mar": "March", "Apr": "April", 
        "May": "May", "Jun": "June", "Jul": "July", "Aug": "August", 
        "Sep": "September", "Oct": "October", "Nov": "November", "Dec": "December"
    }

    # Convert the dictionary to a map
    mapping_expr = create_map([lit(x) for x in chain(*month_mapping.items())])

    # Extract the first character of the "month" column
    first_word_month = split(col("month"), " ")[0]

    # Apply the mapping to the first character
    df = df.withColumn("complete_month", mapping_expr.getItem(first_word_month))

    # Drop the original "month" column
    df = df.drop("month")

    # Remove duplicate rows
    df = df.dropDuplicates()

    # Add a new column "Airport country" based on filename content
    df = df.withColumn("Airport country",
                       when(df["Filename"].contains("_"),
                            split(col("Filename"), "_")[0])
                       .otherwise(regexp_extract(col("Filename"), "^(.*?)(?=[0-9])", 1)))
    df = df.drop("Filename")

    # Rename columns
    df = df.withColumnRenamed("Original Aeroport", "Origin Airport")
    df = df.withColumnRenamed("Destination Aeroport", "Destination Airport")

    # Split "Destination Airport" column to get "Destination Country"
    df = df.withColumn('Destination Country', split(df['Destination Airport'], ',')[1])
    
    # Add a column "year" with constant value 2024
    df = df.withColumn("year", lit(2024))
        # Convert string column "Flight Time" to timestamp type
    df = df.withColumn("Flight Time2", unix_timestamp(col("Flight Time"), "HH:mm").cast(TimestampType()))
    # Convert "arrival_time" column to timestamp type for HH:mm values
    df = df.withColumn("time_arrival2", 
                       when(col("time_arrival").rlike("\\d"), 
                            unix_timestamp(col("time_arrival"), "HH:mm").cast(TimestampType()))
                       .otherwise(None))
    df = df.withColumn("delay", 
                               when(col("time_arrival2").isNotNull(), 
                                    expr("time_arrival2 - `Flight Time2`"))
                               .when(col("time_arrival") == "Scheduled", "Scheduled")
                               .when(col("time_arrival") == "Unknown", "Unknown")
                               .when(col("time_arrival") == "Canceled", "Canceled")
                               .otherwise("No"))
    df = df.withColumn("delay",
                               expr("CASE WHEN delay LIKE 'INTERVAL%' THEN regexp_extract(delay, '([0-9]+:[0-9]+:[0-9]+)', 1) ELSE delay END"))
    df =  df.drop("time_arrival2")
    df =  df.drop("Flight Time2")

    return df


In [157]:
processed_departure_data = process_departure_data(departure_data)
processed_departure_data.toPandas()



Unnamed: 0,Date,Flight Time,Airline,Destination Airport,flight,Aircraft Model,Flight Status,time_arrival,Origin Airport,day,day_number,complete_month,Airport country,Destination Country,year,delay
0,"Wednesday, Mar 06",21:15,Wizz Air,"Bologna Guglielmo Marconi Airport, Italy",W43847,A320,Estimated dep.,21:30,Tirana International Airport,Wednesday,06,March,Albania,Italy,2024,00:15:00
1,"Thursday, Mar 07",17:00,Aerolineas Argentinas (Retro Livery),"Formosa International Airport, Argentina",AR1760,B737,Scheduled,Scheduled,Buenos Aires Jorge Newbery Airport,Thursday,07,March,Argentina,Argentina,2024,Scheduled
2,"Thursday, Mar 07",01:04,Copa Airlines,"Panama City Tocumen International Airport, Panama",CM501,738,Scheduled,Scheduled,Buenos Aires Ministro Pistarini International ...,Thursday,07,March,Argentina,Panama,2024,Scheduled
3,"Thursday, Mar 07",06:50,Avianca,"Bogota El Dorado International Airport, Colombia",AV88,32N,Estimated dep.,06:50,Buenos Aires Ministro Pistarini International ...,Thursday,07,March,Argentina,Colombia,2024,00:00:00
4,"Tuesday, Mar 05",13:35,Aerolineas Argentinas,"Puerto Iguazu International Airport, Argentina",AR1731,E190,Estimated dep.,13:38,Salta Martin Miguel de Guemes International Ai...,Tuesday,05,March,Argentina,Argentina,2024,00:03:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33955,"Tuesday, Mar 19",17:20,Vietnam Airlines,"Da Nang International Airport, Vietnam",VN7134,321,Estimated dep.,17:20,Ho Chi Minh City International Airport,Tuesday,19,March,Vietnam,Vietnam,2024,00:00:00
33956,"Tuesday, Mar 19",19:45,Bamboo Airways,"Hue Phu Bai International Airport, Vietnam",QH1102,E90,Estimated dep.,19:45,Ho Chi Minh City International Airport,Tuesday,19,March,Vietnam,Vietnam,2024,00:00:00
33957,"Wednesday, Mar 20",07:45,Vietnam Airlines,"Con Dao Airport, Vietnam",VN1893,AT7,Scheduled,Scheduled,Ho Chi Minh City International Airport,Wednesday,20,March,Vietnam,Vietnam,2024,Scheduled
33958,"Monday, Mar 18",13:40,Vietnam Airlines,"Hanoi Noi Bai International Airport, Vietnam",VN1622,A321,Departed,13:39,Qui Nhon Phu Cat Airport,Monday,18,March,Vietnam,Vietnam,2024,00:01:00


# Reviews data

In [163]:
# Example usage:
hdfs_url = 'http://localhost:50070'
folder_path = '/user/PFE_data/reviews_flights'  # Adjust this path accordingly

# Create an HDFS client
client = InsecureClient(hdfs_url)

# Get all CSV files from the specified folder and its subfolders
csv_files_dict = get_hdfs_csv_files(client, folder_path)
print()
# Process CSV files: open each CSV file, add a filename column, and store the DataFrame in a dictionary
dfs_reviews = process_csv_files(csv_files_dict, client)

dfs_reviews


Error parsing file Argentina2024-03-08.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 22

Empty file: Austria2024-03-14.csv
Empty file: Guernsey2024-03-14.csv
Error parsing file Latvia2024-03-07.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 22

Empty file: Malawi2024-03-14.csv
Empty file: Moldova2024-03-14.csv
Error parsing file Nigeria2024-03-07.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 22

Error parsing file Portugal2024-03-08.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 22

Empty file: Portugal2024-03-14.csv
Error parsing file Suriname2024-03-08.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 22

Error parsing file Ukraine2024-03-07.csv: Error tokenizing data. C error: Expected 11 fields in line 4, saw 22



Unnamed: 0,date,Stars Title,text review,Getting to the airport,Check-in,Security check,Terminal facilities,Food and retail services,WiFi,Lounge,Immigration/customs,Baggage claim,Destination Aeroport,Filename,{{objSubRating.subject}},Boarding/deplaning,Flight crew,In-Flight services,In-Flight entertainment
0,2024-03-08 07:30:31,Rated 4/5,"A very nice terminal, recent, modern, comforta...",Rated 2/5,Rated 5/5,Rated 5/5,Rated 5/5,Rated 4/5,,,,,Tirana International Airport,Albania2024-03-13.csv,,,,,
1,2023-12-11 11:04:49,Rated 1/5,Lounge was closed without prior notice neither...,Rated 1/5,Rated 2/5,Rated 1/5,Rated 1/5,Rated 2/5,Rated 1/5,Rated 1/5,,,Tirana International Airport,Albania2024-03-13.csv,,,,,
2,2023-10-13 04:50:22,Rated 3/5,In 2023 still not admit the digital boarding p...,Rated 3/5,Rated 2/5,Rated 3/5,Rated 3/5,,Rated 3/5,,,,Tirana International Airport,Albania2024-03-13.csv,,,,,
3,2023-08-28 09:46:12,Rated 3/5,Tirana’s airport is way too small compared to ...,Rated 4/5,,Rated 5/5,Rated 3/5,Rated 2/5,Rated 1/5,,,,Tirana International Airport,Albania2024-03-13.csv,,,,,
4,2022-06-01 13:47:01,Rated 3/5,The immigration staff are rude and impersonal....,Rated 2/5,Rated 5/5,Rated 3/5,Rated 2/5,Rated 3/5,Rated 1/5,Rated 3/5,,,Tirana International Airport,Albania2024-03-13.csv,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6765,2020-02-27 15:38:12,Rated 3/5,This airport is very strange. It has a section...,,,,Rated 2/5,Rated 2/5,Rated 3/5,,,,Lusaka Kenneth Kaunda International Airport,Zambia_reviews.csv,,,,,
6766,2020-02-27 15:27:14,Rated 3/5,Immigration and customs went smoothly because ...,,,,,,,,Rated 4/5,,Lusaka Kenneth Kaunda International Airport,Zambia_reviews.csv,,,,,
6767,2015-03-27 15:14:06,Rated 4/5,"Nice airport, the staff just a bit slow, the i...",Rated 3/5,,,Rated 3/5,Rated 2/5,Rated 3/5,Rated 0/5,Rated 3/5,Rated 4/5,Lusaka Kenneth Kaunda International Airport,Zambia_reviews.csv,,,,,
6768,2013-04-23 20:23:06,Rated 5/5,lovely airport with lovely staff :),,,,,,,,,,Lusaka Kenneth Kaunda International Airport,Zambia_reviews.csv,,,,,


In [211]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Read from Hadoop") \
    .getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
#spark.conf.set("spark.memory.offHeap.enabled","true") 
#spark.conf.set("spark.memory.offHeap.size","10g")

# Set configurations

departure_data = spark.createDataFrame(dfs_reviews)


In [221]:

from pyspark.sql.functions import col, split, regexp_extract, when
from pyspark.sql.functions import to_timestamp

def transform_departure_data(departure_data):
    # Drop unnecessary columns
    columns_to_drop = ["Baggage claim", "Check-in", "In-Flight entertainment", 
                       "In-Flight services", "Flight crew", "Boarding/deplaning", 
                       "{{objSubRating.subject}}", "Immigration/customs"]
    df = departure_data.drop(*columns_to_drop)

    # Define transformation for each column
    columns_to_transform = ["Getting to the airport", "Security check", "Terminal facilities",
                            "Food and retail services", "WiFi", "Stars Title", "Lounge"]
    for column in columns_to_transform:
        df = df.withColumn(column, col(column).substr(7, 1))

    # Convert date column to timestamp format
    df = df.withColumn("date", to_timestamp("date", "yyyy-MM-dd HH:mm:ss"))

    # Extract Airport country from Filename column
    df = df.withColumn("Airport country",
                       when(df["Filename"].contains("_"),
                            split(col("Filename"), "_")[0])
                       .otherwise(regexp_extract(col("Filename"), "^(.*?)(?=[0-9])", 1)))
    df = df.drop("Filename")
    df = df.dropDuplicates()

    # Rename column
    df = df.withColumnRenamed("Destination Aeroport", "Airport")

    return df



In [222]:
# Apply transformation function to the departure_data DataFrame
transformed_departure_data = transform_departure_data(departure_data)

# Display the transformed DataFrame
transformed_departure_data.toPandas()


Unnamed: 0,date,Stars Title,text review,Getting to the airport,Security check,Terminal facilities,Food and retail services,WiFi,Lounge,Airport,Airport country
0,2023-12-11 11:04:49,1,Lounge was closed without prior notice neither...,1,1,1,2,1,1,Tirana International Airport,Albania
1,2023-03-20 19:56:45,4,"So, if you;re ever visiting, this airport is b...",2,4,4,4,2,5,Barranquilla Ernesto Cortissoz International A...,Colombia
2,2023-05-02 02:55:00,2,"If you are getting a local flight, C terminal ...",4,5,2,2,5,3,Buenos Aires Ministro Pistarini International ...,Argentina
3,2023-06-20 00:39:01,5,"The good things about it are its simplicity, c...",5,5,5,4,5,5,Setif Ain Arnat Airport,Algeria
4,2023-04-08 18:43:33,3,Tired of how Colombian police at airports targ...,5,1,3,4,3,1,Medellin Jose Maria Cordova International Airport,Colombia
...,...,...,...,...,...,...,...,...,...,...,...
6095,2022-11-13 16:16:41,5,Food and Drink is extremely expensive.\nI don'...,5,3,5,1,,,Malaga Costa Del Sol Airport,Spain
6096,2023-04-14 20:46:15,4,"Pretty clean, WiFi is slow however and at plac...",5,5,5,4,2,,Palma de Mallorca Airport,Spain
6097,2012-09-16 23:29:10,/,Fantastic!!,,,,,,,Santander Airport,Spain
6098,NaT,5,"Nice little airport, flew out of there today. ...",,,,,,,Valencia Airport,Spain
