In [None]:
pip install requests beautifulsoup4 pandas

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_Electron_launches"

# Sending a request to fetch the content of the page
response = requests.get(url)
response.raise_for_status()  # Check if the request was successful

# Parsing the page content with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Finding the table with the list of launches
tables = soup.find_all("table", {"class": "wikitable"})

# Extracting data from the tables
launch_data = []
headers = []
for table in tables:
    current_headers = [header.get_text(strip=True) for header in table.find_all("th")]
    if len(current_headers) > len(headers):
        headers = current_headers
    rows = table.find_all("tr")
    for row in rows:
        cells = row.find_all(["th", "td"])
        if len(cells) > 0:
            launch_data.append([cell.get_text(strip=True) for cell in cells])

# Determine the maximum number of columns in the data rows
max_columns = max(len(row) for row in launch_data)

# Ensure all rows have the same number of columns
for row in launch_data:
    while len(row) < max_columns:
        row.append("")

# Create DataFrame
df = pd.DataFrame(launch_data)

# Set the headers
df.columns = headers[:max_columns]

# Display the DataFrame
print(df)

# Save the DataFrame to a CSV file
df.to_csv("electron_launches.csv", index=False)


In [3]:
import pandas as pd

# Load the CSV file
file_path = "electron_launches.csv"
df = pd.read_csv(file_path)

# Drop any completely empty rows
df.dropna(how='all', inplace=True)

# Function to check if a row is valid (i.e., it contains a flight number and other key data points)
def is_valid_row(row):
    return row['FlightNo.'].isdigit()

# Filter out rows that are not valid
df = df[df.apply(is_valid_row, axis=1)]

# Reset index after filtering
df.reset_index(drop=True, inplace=True)

# Strip whitespace from headers and rows
df.columns = df.columns.str.strip()
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.head(10))

# Save the cleaned DataFrame to a new CSV file
cleaned_file_path_final = "electron_launches_cleaned_final.csv"
df.to_csv(cleaned_file_path_final, index=False)

print(f"Cleaned data has been saved to {cleaned_file_path_final}")


Cleaned DataFrame:
  FlightNo.                                 Name  \
0         1                        "It's a Test"   
1         2                      "Still Testing"   
2         3             "It's Business Time"[19]   
3         4        "This One's ForPickering"[28]   
4         5                  "Two Thumbs Up"[31]   
5         6  "That's a Funny Looking Cactus"[36]   
6         7                       "Make it Rain"   
7         8                  "Look Ma, No Hands"   
8         9                  "As the Crow Flies"   
9        10             "Running Out Of Fingers"   

                   Date/time(UTC)           Launch site  \
0           25 May 2017, 04:20[3]          Mahia, LC-1A   
1       21 January 2018, 01:43[6]          Mahia, LC-1A   
2         11 November 2018, 03:50  Mahia, LC-1A[19][20]   
3      16 December 2018, 06:33[7]          Mahia, LC-1A   
4        28 March 2019, 23:27[31]          Mahia, LC-1A   
5           5 May 2019, 06:00[37]          Mahia, LC-1

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [5]:
import pandas as pd
import re

# Load the CSV file
file_path = "electron_launches_cleaned_final.csv"
df = pd.read_csv(file_path)

# Function to clean special characters
def clean_special_characters(text):
    if isinstance(text, str):
        return text.encode('ascii', 'ignore').decode('ascii')
    return text

# Function to remove references like [20] from the text
def remove_references(text):
    if isinstance(text, str):
        return re.sub(r'\[\d+\]', '', text)
    return text

# Function to extract KG values from the payload mass, considering both 'kg' and 'kilograms'
def extract_kg(text):
    if isinstance(text, str):
        match = re.search(r'(\d+)\s*(kg|kilograms)', text.lower())
        return match.group(1) + ' kg' if match else ''
    return ''

# Apply the cleaning functions to the DataFrame
df = df.applymap(clean_special_characters)
df = df.applymap(remove_references)
df['Payload Mass'] = df['Payload Mass'].apply(extract_kg)

# Strip whitespace from headers and rows
df.columns = df.columns.str.strip()
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Save the cleaned DataFrame to a new CSV file
cleaned_file_path_final_v4 = "electron_launches_cleaned_final_v4.csv"
df.to_csv(cleaned_file_path_final_v4, index=False)

# Display the first few rows of the cleaned DataFrame
cleaned_df_final_head = df.head(20)
print(cleaned_df_final_head)

# Path to the cleaned file
cleaned_file_path_final_v4


    FlightNo. LaunchType                                Name  \
0           1    Orbital                       "It's a Test"   
1           2    Orbital                     "Still Testing"   
2           3    Orbital                "It's Business Time"   
3           4    Orbital           "This One's ForPickering"   
4           5    Orbital                     "Two Thumbs Up"   
5           6    Orbital     "That's a Funny Looking Cactus"   
6           7    Orbital                      "Make it Rain"   
7           8    Orbital                 "Look Ma, No Hands"   
8           9    Orbital                 "As the Crow Flies"   
9          10    Orbital            "Running Out Of Fingers"   
10         11    Orbital                "Birds of a Feather"   
11         12    Orbital                 "Don't Stop Me Now"   
12         13    Orbital          "Pics Or It Didn't Happen"   
13         14    Orbital  "I Can't Believe It's Not Optical"   
14         15    Orbital                

  df = df.applymap(clean_special_characters)
  df = df.applymap(remove_references)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


'electron_launches_cleaned_final_v4.csv'