In [None]:
"""
SpaceX Falcon 9 - Web Scraping Wikipedia
Collecting historical launch data from Wikipedia
"""

# Install required packages
!pip install beautifulsoup4 requests pandas lxml html5lib

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Banner marking the start of the scraping run in the cell output.
print("="*70)
print("SPACEX WEB SCRAPING - WIKIPEDIA")
print("="*70)

# Wikipedia page whose launch tables are scraped further down.
url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"
print(f"\n📡 Target URL: {url}")

# Browser-like request headers, sent to avoid a 403 Forbidden response.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Fetch the page. Only request-level failures (connection errors, timeouts,
# and the HTTP error raised by raise_for_status) are reported here; the
# exception is re-raised so the rest of the cell does not run on a failed
# download. Unrelated errors propagate untouched.
try:
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"❌ Error: {e}")
    raise
else:
    print(f"✅ HTTP Status: {response.status_code}")

# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(response.content, 'html.parser')
print("✅ HTML parsed successfully")

# Function to clean text
def clean_text(text):
    """Return ``text`` without bracketed markers and with whitespace normalized.

    ``None`` is passed through unchanged. Any other value is converted with
    ``str()``; every ``[...]`` span (for example citation markers such as
    ``[1]``) is deleted, each run of whitespace is collapsed to one space,
    and leading/trailing whitespace is stripped.
    """
    if text is None:
        return None
    # Drop bracketed spans first so the gaps they leave are collapsed below.
    without_brackets = re.sub(r'\[.*?\]', '', str(text))
    return re.sub(r'\s+', ' ', without_brackets).strip()

# Output columns, in the same order as the cells of a table row.
COLUMN_NAMES = [
    'Flight_No', 'Date', 'Version', 'Launch_Site', 'Payload',
    'Payload_Mass', 'Orbit', 'Customer', 'Launch_Outcome',
]
# Rows with fewer cells than this are ignored.
MIN_CELLS = 6


def cell_text(cell):
    """Return the cleaned text of one table cell.

    Each <br> inside the cell is replaced by a space before the text is
    extracted, so that text on either side of a line break is not fused
    into one token. The result is then passed through ``clean_text``.
    """
    # NOTE(review): assumes the page separates date/time and version/booster
    # with <br> tags (the fused values "January 3, 202403:44" suggest so) -
    # confirm against the live page markup.
    for line_break in cell.find_all('br'):
        line_break.replace_with(' ')
    return clean_text(cell.get_text())


# Find all wikitables
tables = soup.find_all('table', class_='wikitable')
print(f"📊 Tables found: {len(tables)}")

# Extract one dict per usable table row.
launch_data = []
print("\n🔍 Extracting data from tables...")

for table_idx, table in enumerate(tables):
    # The first <tr> of every table is skipped as the header row.
    for row_idx, row in enumerate(table.find_all('tr')[1:], 1):
        cells = row.find_all(['td', 'th'])
        if len(cells) < MIN_CELLS:
            continue

        try:
            values = [cell_text(cell) for cell in cells[:len(COLUMN_NAMES)]]
        except Exception as e:
            # Best effort: one bad row must not abort the scrape, but say
            # which row was dropped instead of discarding it silently.
            print(f"Skipping table {table_idx}, row {row_idx}: {e}")
            continue

        # Rows with 6-8 cells get None for the trailing columns they lack.
        values += [None] * (len(COLUMN_NAMES) - len(values))
        data = dict(zip(COLUMN_NAMES, values))

        # Keep only rows that carry a real flight number.
        if data['Flight_No'] and data['Flight_No'] not in ('N/A', 'None'):
            launch_data.append(data)

print(f"\n✅ Data extracted: {len(launch_data)} records")

# Build the DataFrame unconditionally so df_wiki is always defined, even when
# nothing was extracted (the final preview line would otherwise raise
# NameError). Passing the column list keeps the schema on an empty frame.
df_wiki = pd.DataFrame(launch_data, columns=COLUMN_NAMES).drop_duplicates()

if launch_data:
    # Display info
    print("\n" + "="*70)
    print("DATAFRAME INFORMATION")
    print("="*70)
    print(f"Rows: {len(df_wiki)}")
    print(f"Columns: {len(df_wiki.columns)}")
    print(f"\nColumn names: {list(df_wiki.columns)}")

    # Show first few rows
    print("\n" + "="*70)
    print("FIRST 5 RECORDS")
    print("="*70)
    print(df_wiki.head())

    # Show data types
    print("\n" + "="*70)
    print("DATA TYPES")
    print("="*70)
    print(df_wiki.dtypes)

    # Check for null values
    print("\n" + "="*70)
    print("NULL VALUES CHECK")
    print("="*70)
    print(df_wiki.isnull().sum())

    # Basic statistics
    print("\n" + "="*70)
    print("BASIC STATISTICS")
    print("="*70)
    print(f"Total launches: {len(df_wiki)}")
    print(f"Unique launch sites: {df_wiki['Launch_Site'].nunique()}")
    print(f"Unique customers: {df_wiki['Customer'].nunique()}")

    # Save to CSV (without the DataFrame index column)
    output_file = 'spacex_wikipedia_data.csv'
    df_wiki.to_csv(output_file, index=False)
    print(f"\n✅ Data saved to: {output_file}")

    # Final summary
    print("\n" + "="*70)
    print("✅ WEB SCRAPING COMPLETED SUCCESSFULLY!")
    print("="*70)
    print(f"Total launches scraped: {len(df_wiki)}")
    print(f"CSV file created: {output_file}")
    print("="*70)

else:
    print("❌ No data extracted. Please check the URL and table structure.")

# Display sample (rendered by the notebook as the cell's last expression)
df_wiki.head(10)


SPACEX WEB SCRAPING - WIKIPEDIA

📡 Target URL: https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches
✅ HTTP Status: 200
✅ HTML parsed successfully
📊 Tables found: 5

🔍 Extracting data from tables...

✅ Data extracted: 382 records

DATAFRAME INFORMATION
Rows: 382
Columns: 9

Column names: ['Flight_No', 'Date', 'Version', 'Launch_Site', 'Payload', 'Payload_Mass', 'Orbit', 'Customer', 'Launch_Outcome']

FIRST 5 RECORDS
  Flight_No                   Date        Version             Launch_Site  \
0       286   January 3, 202403:44   F9 B5B1082‑1      Vandenberg, SLC‑4E   
1       287   January 3, 202423:04  F9 B5B1076‑10  Cape Canaveral, SLC‑40   
2       288   January 7, 202422:35  F9 B5B1067‑16  Cape Canaveral, SLC‑40   
3       289  January 14, 202408:59  F9 B5B1061‑18      Vandenberg, SLC‑4E   
4       290  January 15, 202401:52  F9 B5B1073‑12  Cape Canaveral, SLC‑40   

                                Payload            Payload_Mass

Unnamed: 0,Flight_No,Date,Version,Launch_Site,Payload,Payload_Mass,Orbit,Customer,Launch_Outcome
0,286,"January 3, 202403:44",F9 B5B1082‑1,"Vandenberg, SLC‑4E",Starlink: Group 7-9 (22 satellites),"~16,800 kg (37,000 lb)",LEO,SpaceX,Success
1,287,"January 3, 202423:04",F9 B5B1076‑10,"Cape Canaveral, SLC‑40",Ovzon-3,"1,800 kg (4,000 lb)",GTO,Ovzon,Success
2,288,"January 7, 202422:35",F9 B5B1067‑16,"Cape Canaveral, SLC‑40",Starlink: Group 6-35 (23 satellites),"~17,100 kg (37,700 lb)",LEO,SpaceX,Success
3,289,"January 14, 202408:59",F9 B5B1061‑18,"Vandenberg, SLC‑4E",Starlink: Group 7-10 (22 satellites),"~16,700 kg (36,800 lb)",LEO,SpaceX,Success
4,290,"January 15, 202401:52",F9 B5B1073‑12,"Cape Canaveral, SLC‑40",Starlink: Group 6-37 (23 satellites),"~17,100 kg (37,700 lb)",LEO,SpaceX,Success
5,291,"January 18, 202421:49",F9 B5B1080‑5,"Kennedy, LC‑39A",Ax-3 (Crew Dragon C212-3 Freedom),"~13,000 kg (29,000 lb)",LEO (ISS),Axiom Space,Success
6,292,"January 24, 202400:35",F9 B5B1063‑16,"Vandenberg, SLC‑4E",Starlink: Group 7-11 (22 satellites),"~16,700 kg (36,800 lb)",LEO,SpaceX,Success
7,293,"January 29, 202401:10",F9 B5B1062‑18,"Kennedy, LC‑39A",Starlink: Group 6-38 (23 satellites),"~17,100 kg (37,700 lb)",LEO,SpaceX,Success
8,294,"January 29, 202405:57",F9 B5B1075‑9,"Vandenberg, SLC‑4E",Starlink: Group 7-12 (22 satellites),"~16,700 kg (36,800 lb)",LEO,SpaceX,Success
9,295,"January 30, 202417:07",F9 B5B1077‑10,"Cape Canaveral, SLC‑40","CRS NG-20 (S.S. Patricia ""Patty"" Hilliard Robe...","3,726 kg (8,214 lb)",LEO (ISS),Northrop Grumman (CRS),Success
