### Task 1 - Data Collection

#### SpaceX-API Data Crawling

SpaceX-API: https://github.com/r-spacex/SpaceX-API/tree/master/docs

A total of 179 rows of data, covering launches from 4 June 2010 to 5 October 2022.

In [1]:
import requests
import pandas as pd
import numpy as np

In [2]:
# Seconds to wait for each SpaceX-API response before giving up, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30


def fetch_json(url, timeout=REQUEST_TIMEOUT):
    """Download `url` and return the decoded JSON body.

    Raises requests.HTTPError on a 4xx/5xx response, so an error payload is
    never passed on to pd.json_normalize as if it were real data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


# Launch data (kept as raw JSON; it is flattened per core in a later cell)
launch_url = "https://api.spacexdata.com/v4/launches/past"
launch_data = fetch_json(launch_url)

# Rocket data
rocket_url = "https://api.spacexdata.com/v4/rockets"
rocket_data = fetch_json(rocket_url)
rocket_df = pd.json_normalize(rocket_data)

# Payload data
payload_url = "https://api.spacexdata.com/v4/payloads"
payload_data = fetch_json(payload_url)
payload_df = pd.json_normalize(payload_data)

# Launchpad data
launchpad_url = "https://api.spacexdata.com/v4/launchpads"
launchpad_data = fetch_json(launchpad_url)
launchpad_df = pd.json_normalize(launchpad_data)

# Core data
core_url = "https://api.spacexdata.com/v4/cores"
core_data = fetch_json(core_url)
core_df = pd.json_normalize(core_data)

# Orbit altitude data (collected from google)
# NOTE(review): values are presumably kilometres - TODO confirm and record the source.
orbit_data = [
    ["LEO", 2000], ["VLEO", 450], ["GTO", 35786], ["SSO", 800], ["ES-L1", 1500000],
    ["HEO", 35786], ["ISS", 400], ["MEO", 20000], ["GEO", 35786], ["PO", 1000],
]
orbit_df = pd.DataFrame(orbit_data, columns=["Orbit", "Altitude"])

In [3]:
# Select columns
# Flatten the launches to one row per core, carrying the launch-level fields
# needed for later joins, then switch to the dataset's column names.
core_flight_names = {
    "flight": "Flights",
    "gridfins": "GridFins",
    "legs": "Legs",
    "reused": "Reused",
    "landing_success": "Outcome",
    "landpad": "LandingPad",
}
api_data = pd.json_normalize(
    launch_data,
    record_path=["cores"],
    meta=["date_utc", "rocket", "payloads", "launchpad"],
).rename(columns=core_flight_names)

# Each lookup table keeps only its id plus the attributes used downstream;
# the id is renamed so it cannot collide with other columns after merging.
rocket = rocket_df[["id", "name"]].rename(
    columns={"id": "rocket_id", "name": "BoosterVersion"}
)

payload = payload_df[["id", "mass_kg", "orbit"]].rename(
    columns={"id": "payload_id", "mass_kg": "PayloadMass", "orbit": "Orbit"}
)

launchpad = launchpad_df[["id", "name", "longitude", "latitude"]].rename(
    columns={
        "id": "launchpad_id",
        "name": "LaunchSite",
        "longitude": "Longitude",
        "latitude": "Latitude",
    }
)

core = core_df[["id", "block", "reuse_count", "serial"]].rename(
    columns={
        "id": "core_id",
        "block": "Block",
        "reuse_count": "ReusedCount",
        "serial": "Serial",
    }
)

In [4]:
# Merge data and data cleaning
def _summarise_payloads(payload_ids, mass_by_payload, orbit_by_payload, altitude_by_orbit):
    """Collapse the payloads of one launch into a (mass, orbit) pair.

    - No payloads: (NaN, None).
    - One payload: that payload's mass and orbit, unchanged.
    - Several payloads: the sum of the non-missing masses, and the orbit with
      the greatest altitude in `altitude_by_orbit`. Orbits that are missing or
      absent from the altitude table never win the comparison; if none of the
      orbits has a known altitude, the first payload's orbit is returned.
    """
    if len(payload_ids) == 0:
        return np.nan, None

    masses = [mass_by_payload.get(pid, np.nan) for pid in payload_ids]
    orbits = [orbit_by_payload.get(pid) for pid in payload_ids]
    if len(payload_ids) == 1:
        return masses[0], orbits[0]

    total_mass = sum(mass for mass in masses if pd.notna(mass))

    best_orbit = orbits[0]
    # -inf lets any orbit with a known altitude replace an unknown first orbit.
    best_altitude = altitude_by_orbit.get(best_orbit, float("-inf"))
    for candidate in orbits[1:]:
        candidate_altitude = altitude_by_orbit.get(candidate)
        # Strict ">" keeps the earlier payload's orbit on ties.
        if candidate_altitude is not None and candidate_altitude > best_altitude:
            best_orbit, best_altitude = candidate, candidate_altitude
    return total_mass, best_orbit


# Attach the rocket name and keep only Falcon 9 rows. The .copy() detaches the
# filtered rows so the column assignments below do not write into a slice.
api_df = api_data.merge(rocket, how="left", left_on="rocket", right_on="rocket_id")
api_df = api_df[api_df["BoosterVersion"] == "Falcon 9"].copy()

# If a rocket carries multiple payloads, calculate the total mass of the payloads.
# If different payloads are sent to different orbits within a mission, select the
# highest altitude of the orbits.
# Build the lookup dictionaries once instead of filtering the payload table per launch.
payload_lookup = payload.drop_duplicates(subset="payload_id").set_index("payload_id")
mass_by_payload = payload_lookup["PayloadMass"].to_dict()
orbit_by_payload = payload_lookup["Orbit"].to_dict()
altitude_by_orbit = orbit_df.drop_duplicates(subset="Orbit").set_index("Orbit")["Altitude"].to_dict()

payload_summary = [
    _summarise_payloads(payload_ids, mass_by_payload, orbit_by_payload, altitude_by_orbit)
    for payload_ids in api_df["payloads"]
]
api_df["PayloadMass"] = np.array([mass for mass, _orbit in payload_summary], dtype=float)
api_df["Orbit"] = [orbit for _mass, orbit in payload_summary]

api_df = api_df.merge(launchpad, how="left", left_on="launchpad", right_on="launchpad_id")
api_df = api_df.merge(core, how="left", left_on="core", right_on="core_id")

# date_utc is split at the "T" separator into a date part and a time part;
# the hour is the text before the first ":" of the time part.
api_df[["Date", "Time"]] = api_df.date_utc.str.split("T", expand=True)
api_df["Date"] = pd.to_datetime(api_df["Date"])
api_df["Hour"] = api_df.Time.str.split(":").str[0]

# .copy() again: the column selection is followed by an in-place .loc assignment.
api_df = api_df[["Date", "Hour", "BoosterVersion", "PayloadMass", "Orbit", "LaunchSite", "Flights",
                 "GridFins", "Reused", "Legs", "Block", "ReusedCount", "Serial", "Longitude",
                 "Latitude", "Outcome"]].copy()
# NOTE(review): "SO" is folded into "SSO" here - confirm these really denote the same orbit.
api_df.loc[api_df["Orbit"] == "SO", "Orbit"] = "SSO"
api_df = api_df.sort_values(by=["Date"]).reset_index(drop=True)

In [5]:
# Display the merged and cleaned SpaceX-API launch table (sorted by Date, index reset).
api_df

Unnamed: 0,Date,Hour,BoosterVersion,PayloadMass,Orbit,LaunchSite,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Outcome
0,2010-06-04,18,Falcon 9,,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0003,-80.577366,28.561857,
1,2010-12-08,15,Falcon 9,0.0,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0004,-80.577366,28.561857,
2,2012-05-22,07,Falcon 9,525.0,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0005,-80.577366,28.561857,
3,2012-10-08,00,Falcon 9,800.0,LEO,CCSFS SLC 40,1,False,False,False,1.0,0,B0006,-80.577366,28.561857,
4,2013-03-01,19,Falcon 9,677.0,ISS,CCSFS SLC 40,1,False,False,False,1.0,0,B0007,-80.577366,28.561857,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,2022-09-05,02,Falcon 9,13440.0,VLEO,CCSFS SLC 40,7,True,True,True,5.0,6,B1052,-80.577366,28.561857,True
175,2022-09-11,01,Falcon 9,14760.0,VLEO,KSC LC 39A,14,True,True,True,5.0,13,B1058,-80.603956,28.608058,True
176,2022-09-17,01,Falcon 9,13260.0,VLEO,CCSFS SLC 40,6,True,True,True,5.0,5,B1067,-80.577366,28.561857,True
177,2022-09-24,23,Falcon 9,13260.0,VLEO,CCSFS SLC 40,4,True,True,True,5.0,0,B1072,-80.577366,28.561857,True


#### Wikipedia Data Crawling - BeautifulSoup

2010-2019: https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches_(2010%E2%80%932019)

2020-current: https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches


A total of 255 rows of data, covering launches from 4 June 2010 to 12 September 2023.

In [6]:
from bs4 import BeautifulSoup

In [7]:
launch_2010_2019_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches_(2010%E2%80%932019)"
launch_2020_2023_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on a 4xx/5xx response instead of letting an
# error page reach the HTML parsing step.
launch_2010_2019 = requests.get(launch_2010_2019_url, timeout=30)
launch_2010_2019.raise_for_status()

launch_2020_2023 = requests.get(launch_2020_2023_url, timeout=30)
launch_2020_2023.raise_for_status()

In [8]:
LAUNCH_COLUMNS = ["FlightNumber", "DateTime", "BoosterVersion", "LaunchSite", "Payload",
                  "PayloadMass", "Orbit", "Customer", "LaunchOutcome", "LandingOutcome"]


def _flight_rows(soup, first_flight, last_flight):
    """Return the cell texts for flights first_flight..last_flight (inclusive).

    Each flight is looked up by its element id, "F9-" followed by the flight
    number zero-padded to three digits. The element's text is split on blank
    lines into one string per column of LAUNCH_COLUMNS.

    Raises ValueError when an id is missing from the page or when a row does
    not split into exactly len(LAUNCH_COLUMNS) cells.
    """
    rows = []
    for flight in range(first_flight, last_flight + 1):
        row_id = f"F9-{flight:03d}"
        row = soup.find(id=row_id)
        if row is None:
            raise ValueError(f"No element with id {row_id!r} found on the page")
        cells = row.text.strip().split('\n\n')
        if len(cells) != len(LAUNCH_COLUMNS):
            raise ValueError(
                f"Row {row_id!r} has {len(cells)} cells, expected {len(LAUNCH_COLUMNS)}"
            )
        rows.append(cells)
    return rows


soup_2010_2019 = BeautifulSoup(launch_2010_2019.text, "html.parser")
num_launch_2010_2019 = len(soup_2010_2019.select('[id^="F9-"]')) # number of falcon 9 launches from 2010 to 2019

soup_2020_2023 = BeautifulSoup(launch_2020_2023.text, "html.parser")
num_launch_2020_2023 = len(soup_2020_2023.select('[id^="F9-"]')) # number of falcon 9 launches from 2020 to 2023

total_launch = num_launch_2010_2019 + num_launch_2020_2023

# Convert data from table to dataframe
# Flight numbers are assumed to run 1..total_launch with no gaps: the first page
# holds 1..num_launch_2010_2019 and the second page continues from there.
launch_rows = (
    _flight_rows(soup_2010_2019, 1, num_launch_2010_2019)                     # data from 2010 to 2019
    + _flight_rows(soup_2020_2023, num_launch_2010_2019 + 1, total_launch)    # data from 2020 to 2023
)
launch_df = pd.DataFrame(launch_rows, columns=LAUNCH_COLUMNS)

In [9]:
# Data Cleaning
# All boolean masks below pass na=False so that a missing cell counts as
# "no match" instead of making the .loc indexer raise.
wp_df = launch_df[["FlightNumber"]].copy()

# DateTime is split on ":"; the last two characters before the first ":" are the
# hour, everything before them is the date, and the first two characters after
# that ":" are the minute.
datetime_parts = launch_df["DateTime"].str.split(":")
wp_df["Date"] = pd.to_datetime(datetime_parts.str[0].str[:-2])
wp_df["Hour"] = datetime_parts.str[0].str[-2:]
wp_df["Minute"] = datetime_parts.str[1].str[:2]

wp_df.loc[launch_df["BoosterVersion"].str.contains("F9", na=False), "BoosterVersion"] = "Falcon 9"

# Keep the text before the "\xa0kg" unit, then strip "~" and thousands separators.
wp_df["PayloadMass"] = launch_df["PayloadMass"].str.split("\xa0kg").str[0]
wp_df.loc[launch_df["PayloadMass"].str.contains("No payload", na=False), "PayloadMass"] = "0"
wp_df["PayloadMass"] = (
    wp_df["PayloadMass"].str.replace("~", "", regex=False).str.replace(",", "", regex=False)
)

# condition: if the format payload mass is like 1000kg-2000kg, then assign the mass as the average value
condition = wp_df["PayloadMass"].str.contains("–", na=False)
average_payload = (
    wp_df.loc[condition, "PayloadMass"]
    .str.split("–")
    .apply(lambda bounds: str((float(bounds[0]) + float(bounds[1])) / 2))
)
wp_df.loc[condition, "PayloadMass"] = average_payload

# Anything that still is not a number becomes NaN.
wp_df["PayloadMass"] = pd.to_numeric(wp_df["PayloadMass"], errors="coerce")

# Wikipedia orbit label -> orbit code used in this dataset.
# NOTE(review): "Sub-orbital" -> "SSO" and "Heliocentric" -> "TLI" look questionable - confirm intent.
ORBIT_ALIASES = {
    "LEO (ISS)": "ISS",
    "Polar orbit LEO": "PO",
    "Sun–Earth L1 insertion": "ES-L1",
    "Polar LEO": "PO",
    "HEO for P/2 orbit": "HEO",
    "Sub-orbital": "SSO",
    "Heliocentric": "TLI",
    "Ballistic lunar transfer (BLT)": "TLI",
    "Retrograde LEO": "LEO",
    "Sun–Earth L2 injection": "ES-L2",
}
wp_df["Orbit"] = launch_df["Orbit"].str.split("[").str[0] # remove the reference symbols
for wiki_label, orbit_code in ORBIT_ALIASES.items():
    wp_df.loc[wp_df["Orbit"] == wiki_label, "Orbit"] = orbit_code

wp_df["LaunchSite"] = launch_df["LaunchSite"].str.replace(",", " ").str.replace("-", " ").str.split("[").str[0]

wp_df["Serial"] = launch_df["BoosterVersion"].str.extract(r'(B\d{4})', expand=False)

# Add Block column
# One row per Serial plus validate="many_to_one" guarantee the left merge cannot
# add rows; the launch_df-based masks further down rely on wp_df keeping the
# same row labels as launch_df.
block_df = core[["Block", "Serial"]].drop_duplicates(subset="Serial")
wp_df["BlockName"] = launch_df["BoosterVersion"].str.extract(r'(v\d+\.\d+|FT|B\d+)', expand=False)
wp_df = wp_df.merge(block_df, how="left", on="Serial", validate="many_to_one")
# Fall back to the block named in the Wikipedia text when the serial has no Block from the API.
wp_df.loc[(wp_df["Block"].isna()) & (wp_df["BlockName"] == "B4"), "Block"] = 4.0
wp_df.loc[(wp_df["Block"].isna()) & (wp_df["BlockName"] == "B5"), "Block"] = 5.0
wp_df = wp_df.drop(columns="BlockName")

# Add Longitude, Latitude columns
# .copy() so the rename below edits this lookup table only, not a slice of `launchpad`.
launchsite_df = launchpad[["LaunchSite", "Longitude", "Latitude"]].copy()
# VSFB (record in Wikipedia) is the new name of VAFB (record in SpaceX)
launchsite_df.loc[launchsite_df["LaunchSite"] == "VAFB SLC 4E", "LaunchSite"] = "VSFB SLC 4E"
wp_df = wp_df.merge(launchsite_df, how="left", on="LaunchSite", validate="many_to_one")

# Landing outcome: rules are applied in order, so a later keyword overrides an earlier one.
landing_text = launch_df["LandingOutcome"]
outcome_rules = [
    ("Success", True),
    ("Controlled", True),
    ("Failure", False),
    ("Uncontrolled", False),
    ("Precluded", False),
    ("No attempt", "No attempt"),
]
wp_df["Outcome"] = None
for keyword, outcome in outcome_rules:
    wp_df.loc[landing_text.str.contains(keyword, na=False), "Outcome"] = outcome

# Landing place, also applied in order.
landing_place_rules = [
    ("drone ship", "Drone ship"),
    ("ground pad", "Ground pad"),
    ("ocean", "Ocean"),
    # the landing place of the booster which is attempt to land with a parachute is ocean
    ("parachute", "Ocean"),
]
wp_df["LandingPlace"] = None
for keyword, place in landing_place_rules:
    wp_df.loc[landing_text.str.contains(keyword, na=False), "LandingPlace"] = place


In [10]:
# Display the cleaned Wikipedia launch table built in the data-cleaning cell.
wp_df

Unnamed: 0,FlightNumber,Date,Hour,Minute,BoosterVersion,PayloadMass,Orbit,LaunchSite,Serial,Block,Longitude,Latitude,Outcome,LandingPlace
0,1,2010-06-04,18,45,Falcon 9,0.0,LEO,CCSFS SLC 40,B0003,1.0,-80.577366,28.561857,False,Ocean
1,2,2010-12-08,15,43,Falcon 9,,ISS,CCSFS SLC 40,B0004,1.0,-80.577366,28.561857,False,Ocean
2,3,2012-05-22,07,44,Falcon 9,525.0,ISS,CCSFS SLC 40,B0005,1.0,-80.577366,28.561857,No attempt,
3,4,2012-10-08,00,35,Falcon 9,4700.0,ISS,CCSFS SLC 40,B0006,1.0,-80.577366,28.561857,No attempt,
4,5,2013-03-01,15,10,Falcon 9,4877.0,ISS,CCSFS SLC 40,B0007,1.0,-80.577366,28.561857,No attempt,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,251,2023-09-01,02,21,Falcon 9,17600.0,LEO,CCSFS SLC 40,B1077,5.0,-80.577366,28.561857,True,Drone ship
251,252,2023-09-02,14,25,Falcon 9,,LEO,VSFB SLC 4E,B1063,5.0,-120.610829,34.632093,True,Ground pad
252,253,2023-09-04,02,47,Falcon 9,16800.0,LEO,KSC LC 39A,B1073,5.0,-80.603956,28.608058,True,Drone ship
253,254,2023-09-09,03,12,Falcon 9,17600.0,LEO,CCSFS SLC 40,B1076,5.0,-80.577366,28.561857,True,Drone ship


In [11]:
# api_df.to_csv("datasets/api_dataset.csv", sep=",", index=False)
# wp_df.to_csv("datasets/wp_dataset.csv", sep=",", index=False)