# Space X Falcon 9 First Stage Landing Prediction

### Web scraping Falcon 9 and Falcon Heavy Launches Records from Wikipedia¶

## import required packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata


## Fetch the HTML Page

Use the static Wikipedia URL to fetch the page.

In [2]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

# Perform HTTP GET request
response = requests.get(static_url)

# Parse the response content
soup = BeautifulSoup(response.content, "html.parser")

# Print the page title to verify
print(soup.title.text)


List of Falcon 9 and Falcon Heavy launches - Wikipedia


##  Extract the Target Table

Find all tables in the page and identify the one with the launch data.

In [4]:
# Find all tables on the page
html_tables = soup.find_all("table", class_="wikitable")

# Select the third table (index 2)
target_table = html_tables[2]
print(target_table)


<table class="wikitable plainrowheaders collapsible" style="width: 100%;">
<tbody><tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date and<br/>time (<a href="/wiki/Coordinated_Universal_Time" title="Coordinated Universal Time">UTC</a>)
</th>
<th scope="col"><a href="/wiki/List_of_Falcon_9_first-stage_boosters" title="List of Falcon 9 first-stage boosters">Version,<br/>Booster</a><sup class="reference" id="cite_ref-booster_11-2"><a href="#cite_note-booster-11"><span class="cite-bracket">[</span>b<span class="cite-bracket">]</span></a></sup>
</th>
<th scope="col">Launch site
</th>
<th scope="col">Payload<sup class="reference" id="cite_ref-Dragon_12-2"><a href="#cite_note-Dragon-12"><span class="cite-bracket">[</span>c<span class="cite-bracket">]</span></a></sup>
</th>
<th scope="col">Payload mass
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th>
<th scope="col">Launch<br/>outcome
</th>
<th scope="col"><a href="/wiki/Falcon_9_first-stage_landing_tests" title="Falcon 9

## Extract Column Names

Parse the table headers (<th> elements) to extract column names.

In [5]:
# Extract column names
def extract_column_from_header(row):
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
    column_name = ' '.join(row.contents).strip()
    return column_name if column_name and not column_name.isdigit() else None

column_names = [
    extract_column_from_header(th)
    for th in target_table.find_all("th")
]
column_names = [name for name in column_names if name]
print(column_names)


['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


### helper function

In [18]:
# Parse rows
for row in target_table.find_all("tr")[1:]:  # Skip header row
    cells = row.find_all("td")
    if len(cells) > 0:
        # Ensure row has a <th> element before accessing it
        flight_no = row.find("th")
        if flight_no is not None:
            launch_dict['Flight No.'].append(flight_no.text.strip())
        else:
            launch_dict['Flight No.'].append("")  # Add a placeholder if no <th> is present

        # Extract date and time
        date, time = date_time(cells[0])
        launch_dict['Date'] = date
        launch_dict['Time'] = time

        # Extract other details
        launch_dict['Version Booster'].append(cells[1].text.strip() if len(cells) > 1 else "")
        launch_dict['Launch site'].append(cells[2].text.strip() if len(cells) > 2 else "")
        launch_dict['Payload'].append(cells[3].text.strip() if len(cells) > 3 else "")
        launch_dict['Payload mass'].append(get_mass(cells[4]) if len(cells) > 4 else "")
        launch_dict['Orbit'].append(cells[5].text.strip() if len(cells) > 5 else "")
        launch_dict['Customer'].append(cells[6].text.strip() if len(cells) > 6 else "")
        launch_dict['Launch outcome'].append(cells[7].text.strip() if len(cells) > 7 else "")
        launch_dict['Booster landing'].append(cells[8].text.strip() if len(cells) > 8 else "")


## Parse Table Rows

Iterate through table rows and extract relevant data into a dictionary.

Initialize the dictionary:

In [13]:
launch_dict= dict.fromkeys(column_names)

# Remove an irrelvant column
del launch_dict['Date and time ( )']

# Let's initial the launch_dict with each value to be an empty list
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]


In [17]:


# Parse rows
for row in target_table.find_all("tr")[1:]:  # Skip header row
    cells = row.find_all("td")
    if len(cells) > 0:
        # Ensure row has a <th> element before accessing it
        flight_no = row.find("th")
        if flight_no is not None:
            launch_dict['Flight No.'].append(flight_no.text.strip())
        else:
            launch_dict['Flight No.'].append("")  # Add a placeholder if no <th> is present

        # Extract date and time
        date, time = date_time(cells[0])
        launch_dict['Date'] = date
        launch_dict['Time'] = time

        # Extract other details
        launch_dict['Version Booster'].append(cells[1].text.strip() if len(cells) > 1 else "")
        launch_dict['Launch site'].append(cells[2].text.strip() if len(cells) > 2 else "")
        launch_dict['Payload'].append(cells[3].text.strip() if len(cells) > 3 else "")
        launch_dict['Payload mass'].append(get_mass(cells[4]) if len(cells) > 4 else "")
        launch_dict['Orbit'].append(cells[5].text.strip() if len(cells) > 5 else "")
        launch_dict['Customer'].append(cells[6].text.strip() if len(cells) > 6 else "")
        launch_dict['Launch outcome'].append(cells[7].text.strip() if len(cells) > 7 else "")
        launch_dict['Booster landing'].append(cells[8].text.strip() if len(cells) > 8 else "")



## Create a DataFrame

In [20]:
df = pd.DataFrame(launch_dict)
df.head()

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,14.0,"Cape Canaveral,LC-40",SpaceX CRS-5[68](Dragon C107),"2,395 kg",LEO (ISS),NASA (CRS),Success[70],F9 v1.1B1012[8],Failure (drone ship),Payload included eleven satellites weighing 17...,[
1,14.0,"Cape Canaveral,LC-40",SpaceX CRS-5[68](Dragon C107),"2,395 kg",LEO (ISS),NASA (CRS),Success[70],F9 v1.1B1012[8],Failure (drone ship),Payload included eleven satellites weighing 17...,[
2,,,,,,,,,,Payload included eleven satellites weighing 17...,[
3,15.0,"Cape Canaveral,LC-40",DSCOVR[68][73],570 kg,HEO(Sun–Earth L1 insertion),USAF\nNASA\nNOAA,Success,F9 v1.1B1013[8],Controlled(ocean)[d],Payload included eleven satellites weighing 17...,[
4,,,,,,,,,,Payload included eleven satellites weighing 17...,[


## Export the Data

In [21]:
df.to_csv('spacex_web_scraped.csv', index=False)
print("Data exported to spacex_web_scraped.csv")


Data exported to spacex_web_scraped.csv
