In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException,TimeoutException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd
from datetime import datetime
from hdfs import InsecureClient



In [14]:
def get_country():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options = chrome_options)
    try:
        url = "https://www.flightradar24.com/data/airports"
        driver.get(url)
        # Find all links to countries
        country_links = driver.find_elements(By.CSS_SELECTOR, 'table#tbl-datatable a[href^="https://www.flightradar24.com/data/airports/"]')
        country = []
        # Extract and print the country names
        for country_link in country_links:
            country_name = country_link.get_attribute("title")
            country.append(country_name.replace(" ", "-").replace("(", "").replace(")", ""))
        country_set = set(country)
    finally:
        driver.quit()
        return (country_set)

In [15]:
def scrape_airport_data(country_url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options = chrome_options)

    try:
        driver.get(country_url)

        # Find all airport links
        IATA = []
        airport_names = []
        airport_links = driver.find_elements(By.CSS_SELECTOR, 'a[data-iata][data-lat][data-lon]')
        for airport_link in airport_links:
            airport_name = airport_link.text.strip().split('\n')[0]
            airport_iata = airport_link.get_attribute('data-iata')
            airport_icao = airport_name.split('(')[-1].split(')')[0]
            IATA.append(airport_iata)
            airport_names.append(airport_name)
        airport_names_cleaned = [airport.split(' (')[0] for airport in airport_names]

        

    finally:
        driver.quit()
    return IATA , airport_names_cleaned


In [16]:
def scrape_all_countries_airports(main_url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options = chrome_options)

    try:
        driver.get(main_url)

        # Find all links to countries
        country_links = driver.find_elements(By.CSS_SELECTOR, 'table#tbl-datatable a[href^="https://www.flightradar24.com/data/airports/"]')
        AEROPORT_IATA = {}
        for country_link in set(country_links):
            country_url = country_link.get_attribute("href")
            for elem in scrape_airport_data(country_url)[0] : 
                AEROPORT_IATA[elem] = country_url[44:]

    finally:
        driver.quit()
    return AEROPORT_IATA


In [17]:
def click_button( driver , button_text):
            k = 0
            while (True and k<10):
                try:
                    k = k+1
                    # Wait for the button to be present and visible
                    button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, f'//button[text()="{button_text}"]')))
                    # Click the button
                    button.click()
                    time.sleep(3)
                except (ElementClickInterceptedException, ElementNotInteractableException, TimeoutException) as e:
                    break
                except (ElementNotVisibleException, TimeoutException):
                    # Break the loop if the button is not visible or timeout occurs
                    break 

In [18]:
print("a")

a


In [19]:
def scrape_flight_data (airport_code) :
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options = chrome_options)
    airport_url = "https://www.flightradar24.com/data/airports/"+airport_code+"/arrivals"
    try:
        driver.get(airport_url)

        # Wait for the button to appear and accept cookies
        time.sleep(15)
        try:
            button1 = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
            button1.click()
        except NoSuchElementException:
            pass  # If the accept cookies button is not found, continue without clicking
        
        element = driver.find_elements(By.XPATH,'//*[@id="cnt-data-content"]/div/div[2]/div/aside/div[1]/table/tbody/tr[2]')
        for e in element:
            soup = e.get_attribute('outerHTML')
            sp = BeautifulSoup(soup, 'html.parser')
            tr_element = sp.find('tr')
            # Get the value of the class attribute
            class_value = tr_element.get('class')
            if (len(class_value) >0):
            # Wait for the "Load later flights" button to be clickable
               # click_button(driver, "Load later flights")
               # time.sleep(3)
                click_button(driver, "Load earlier flights")
                #time.sleep(3)
                # Find all flight information rows
                flight_rows = driver.find_elements(By.XPATH, '//tr[@class="hidden-xs hidden-sm ng-scope"]')
                flight_data = []
                for row in flight_rows:
                    soup = row.get_attribute('outerHTML')
                    sp = BeautifulSoup(soup, 'html.parser')
                    tr_tag = sp.find('tr', {'class': 'hidden-xs hidden-sm ng-scope'})
                    # Extract the value of the 'data-date' attribute
                    if tr_tag :
                        data_date = tr_tag.get('data-date')
                    else : 
                        data_date = "N/A" 
                
                    flight_time_tag = sp.find('td', class_='ng-binding')
                    if flight_time_tag : 
                        flight_time = flight_time_tag.text.strip()
                    else : 
                        flight_time ="N/A"
                    # Find the div tag with ng-show="(objFlight.flight.airport.origin)"
                    div_tag = sp.find('div', {'ng-show': '(objFlight.flight.airport.origin)'})
                    # If the div tag is found, extract the origin airport
                    if div_tag:
                        origin_airport = div_tag.find('a', class_='ng-binding')['title']
                    else:
                        origin_airport = "N/A"                
                    flight_tag = sp.find('a', class_='notranslate ng-binding')
                    if flight_tag : 
                        flight = flight_tag.text.strip()
                    else : 
                        flight ="N/A"
                
                    aircraft_model_tag = sp.find('span', class_='notranslate ng-binding')
                    if aircraft_model_tag : 
                        aircraft_model = aircraft_model_tag.text.strip()
                    else : 
                        aircraft_model = "N/A"

                    flight_status_tag =sp.find('span', class_='ng-binding', attrs={'ng-bind-html': 'objFlight.flight.statusMessage.text | unsafe'})
                    if flight_status_tag : 
                        flight_status = flight_status_tag.text.strip()
                    else : 
                        flight_status= "N/A"
                    # Find the <td> tag containing the time
                    td_elements = sp.find_all('td', class_='ng-binding')
                    # Extract the text from the second <td> element
                    if len(td_elements) > 1:
                        time_arrival = td_elements[1].text.split()[-1]
                    else:
                        time_arrival="Time not found."
                    # Find all <a> tags with class "notranslate ng-binding"
                    a_tags = sp.find_all('a', class_='notranslate ng-binding')
                    # Extract the text content of the second <a> tag (index 1)
                    if len(a_tags) >= 2:
                        aircraft = a_tags[1].text.strip()
                    else:
                        aircraft = "N/A"
                    # Append data to the list as a dictionary
                    flight_data.append({
        "Date": data_date,
        "Flight Time": flight_time,
        "Aircraft": aircraft,
        "Origin Airport": origin_airport,
        "flight": flight,
        "Aircraft Model": aircraft_model,
        "Flight Status": flight_status,
        "time_arrival":time_arrival})
                df = pd.DataFrame(flight_data)
                return (df)
            else : 
                return "we don't have any data for this ariport" 


    finally:
            #time.sleep(3)
            driver.quit()
            


In [8]:
df=scrape_flight_data("alg")
df

Unnamed: 0,Date,Flight Time,Aircraft,Origin Airport,flight,Aircraft Model,Flight Status,time_arrival
0,"Monday, Mar 25",22:06,Saudia,"Medina Prince Mohammad bin Abdulaziz Airport, ...",SV9316,A332,Landed,22:05
1,"Monday, Mar 25",22:35,Air Algerie,"Hassi Messaoud Oued Irara Airport, Algeria",AH6225,AT7,Canceled,Canceled
2,"Monday, Mar 25",22:50,Air Algerie,"Paris Charles de Gaulle Airport, France",AH1013,B738,Landed,23:00
3,"Monday, Mar 25",23:15,Air Algerie,"Nouakchott Oumtounsy International Airport, Ma...",AH5201,73K,Unknown,Unknown
4,"Monday, Mar 25",23:25,Turkish Airlines,"Istanbul Airport, Turkey",TK655,A321,Landed,00:04
...,...,...,...,...,...,...,...,...
220,"Thursday, Mar 28",09:50,Air Algerie,"Paris Orly Airport, France",AH1009,73H,Scheduled,Scheduled
221,"Thursday, Mar 28",10:05,Luxaviation Belgium,"Brussels Airport, Belgium",,C25C,Scheduled,Scheduled
222,"Thursday, Mar 28",10:10,Air Algerie,"Annaba Rabah Bitat Airport, Algeria",AH6171,AT7,Scheduled,Scheduled
223,"Thursday, Mar 28",10:10,Air Algerie,Constantine Mohamed Boudiaf International Airp...,AH6191,AT7,Scheduled,Scheduled


In [20]:
def scrape_flight_from_country (country_name):
    url_country = "https://www.flightradar24.com/data/airports/" + country_name
    IATA , aeroport_names = scrape_airport_data(url_country)
    data_total_country = []
    for i, j in  zip (IATA, aeroport_names) : 
        df_aeroport = scrape_flight_data(i) 
        if isinstance(df_aeroport, pd.DataFrame):
            print("data founded in ",j , " = " , len(df_aeroport)  )
            df_aeroport["Destination Aeroport"] =  j 
            data_total_country.append(df_aeroport)
    if len(data_total_country) >0 :
        concatenated_df = pd.concat(data_total_country, ignore_index=True)
        return concatenated_df


In [21]:
def save_csv(data, parent_folder, folder_name, file_name):
    # Create the folder if it doesn't exist
    folder_path = os.path.join(parent_folder, folder_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # Define the file path within the new folder
    file_path = os.path.join(folder_path, file_name)

    send_to_hdfs(data , file_path.replace(os.path.sep, '/'))


In [22]:
def send_to_hdfs(data, file_path): 
    client = InsecureClient('http://localhost:50070')
    with client.write(file_path, overwrite=True) as writer:
        data.to_csv(writer, index=False)


In [23]:
from IPython.display import HTML
def scrape_all_arrival_flight() :
    countries = get_country()
 #   k = 0
    for country in countries :
       # if country not in ["Central-African-Republic","Guam","Togo","Malawi","Ukraine","Timor-leste-east-Timor","Guernsey","Ghana","Puerto-Rico","American-Samoa","Haiti","Virgin-Islands-Us","Cameroon","Moldova","Latvia","Uruguay","Gambia","Martinique","Dominican-Republic","Burkina-Faso","India" ,"Cambodia","Seychelles","Vietnam","Panama","Bahamas","Uganda","Anguilla,""Saint-Vincent-And-The-Grenadines","Mexico" "Yemen", "Curacao", "Brunei","Grenada","Guatemala","Chad", "Northern-Mariana-Islands" , "Nauru","Ethiopia", "Singapore","Democratic-Republic-Of-The-Congo","Libya","Benin","Bosnia-And-Herzegovina","United-Arab-Emirates","Cape-Verde","Tonga","Russia", "Papua-New-Guinea","Sierra-Leone" ]:
            print("start of extract data from :", country)
            data = scrape_flight_from_country(country) 
            print("End of extract data from :", country)

            date = datetime.now().date()
            # File path to save the CSV data
            file_name = country + str(date) +".csv"
            # Save the DataFrame to a CSV file
            parent_folder = "/user/PFE_data/arrival_flights"
            #folder name 
            folder_name = country

            if isinstance(data, pd.DataFrame) :
                save_csv(data,parent_folder ,folder_name,file_name )
                message = '<p style="color:green;">data saved</p>'
                display(HTML(message))
               


In [24]:
scrape_all_arrival_flight()

start of extract data from : Luxembourg
data founded in  Luxembourg Findel Airport  =  243
End of extract data from : Luxembourg


start of extract data from : Paraguay
data founded in  Asuncion Silvio Pettirossi International Airport  =  54
End of extract data from : Paraguay


start of extract data from : Greenland
data founded in  Aasiaat Airport  =  12
data founded in  Ilulissat Airport  =  32
data founded in  Kangerlussuaq Airport  =  51
data founded in  Kulusuk Airport  =  11
data founded in  Maniitsoq Airport  =  7
data founded in  Narsarsuaq Airport  =  20
data founded in  Nerlerit Inaat Airport  =  10
