# SpaceX Falcon 9 — Data Collection (API + Web)

**Purpose:** Collect SpaceX launch data from the public SpaceX REST API and from the Wikipedia launch list (the live page is tried first, with an archived Wayback Machine snapshot as a fallback) for historical launch metadata.  

This notebook is cleaned, documented, and structured for reproducible execution in **Google Colab** or **Jupyter Notebook**. Run the cells in order.  


*Author: Sarthak Shandilya*

In [None]:
# --------------------
# Setup: Install (if needed) & Imports
# Run this cell first in Colab or Jupyter.
# --------------------
import sys
import os

# Detect Google Colab: the `google.colab` module only imports successfully there.
# No manual step is needed; the install below runs automatically when COLAB is True.
try:
    import google.colab  # type: ignore
    COLAB = True
except Exception:
    # Any import failure is treated as "not running in Colab".
    COLAB = False

if COLAB:
    # Install the notebook's dependencies only when running in Colab.
    # NOTE: `!pip` is an IPython shell magic, so this cell is not valid plain Python.
    !pip install pandas requests beautifulsoup4 lxml html5lib plotly folium scikit-learn matplotlib seaborn nbformat --quiet

# Core imports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Display options: show every column and use a wider console width so that
# DataFrame previews are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

# Confirm the setup cell finished and report the interpreter version (major.minor.patch).
print('Imports ready. Python version:', sys.version.split()[0])


In [None]:
# --------------------
# Functions: Data collection helpers (API + Wikipedia archived fallback)
# --------------------

def fetch_spacex_api(v4=True, timeout=15):
    """Download SpaceX launch records and flatten them into a DataFrame.

    Args:
        v4: When true, query the v4 ``launches/past`` endpoint; otherwise
            query the v3 ``launches`` endpoint.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A pandas DataFrame built with ``pd.json_normalize`` from the JSON
        response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    endpoint = (
        'https://api.spacexdata.com/v4/launches/past'
        if v4
        else 'https://api.spacexdata.com/v3/launches'
    )
    print(f'Fetching SpaceX API data from: {endpoint}')

    response = requests.get(endpoint, timeout=timeout)
    response.raise_for_status()

    # Nested JSON objects become dotted column names.
    return pd.json_normalize(response.json())

def fetch_wikipedia_falcon9(archived=True, timeout=20):
    """Fetch the largest HTML table from the Falcon 9 / Falcon Heavy launch list.

    The live Wikipedia page is requested first. When ``archived`` is true, a
    fixed Wayback Machine snapshot of the same page is tried next if the live
    attempt fails for any reason (network error, HTTP error status, no
    ``<table>`` elements, or a table that cannot be parsed).

    Args:
        archived: Whether to fall back to the archived snapshot.
        timeout: Timeout in seconds passed to ``requests.get`` for each URL.

    Returns:
        A pandas DataFrame parsed from the table with the most ``<tr>`` rows
        on the first page that could be fetched and parsed.

    Raises:
        Exception: The exception raised by the last attempted URL when every
            attempt fails.
    """
    live_url = 'https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches'
    archive_url = 'https://web.archive.org/web/20240601000000/https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches'
    urls_to_try = [live_url, archive_url] if archived else [live_url]

    last_exception = None
    for url in urls_to_try:
        # The broad catch is deliberate: any failure on one URL should move on
        # to the next candidate instead of aborting the whole fetch.
        try:
            print('Requesting:', url)
            resp = requests.get(url, timeout=timeout)
            # Fail fast on 4xx/5xx so an error page is never parsed as data.
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            tables = soup.find_all('table')
            if not tables:
                raise ValueError('No <table> elements found on the page.')
            # Heuristic: the main launch list is taken to be the table with the most rows.
            largest = max(tables, key=lambda t: len(t.find_all('tr')))
            # Wrap the markup in StringIO: passing literal HTML strings to
            # read_html is deprecated in recent pandas releases.
            df = pd.read_html(StringIO(str(largest)))[0]
            print('Found table with shape', df.shape, 'from', url)
            return df
        except Exception as e:
            last_exception = e
            print('Attempt failed for', url, '->', repr(e))
    raise last_exception or ValueError('Failed to retrieve a table from Wikipedia.')

# Note: do not call these functions automatically if you're offline.


In [None]:
# --------------------
# Execute data collection and save results
# Run this cell to fetch data and persist CSV files in the notebook working directory.
# --------------------
from datetime import datetime
# All raw CSV outputs are written to ./data (created on demand).
out_dir = 'data'
os.makedirs(out_dir, exist_ok=True)

# 1) API data
# Any failure (fetch or CSV write) is reported and leaves api_df set to None,
# so later code has to check for None before using it.
try:
    api_df = fetch_spacex_api(v4=True)
    api_path = os.path.join(out_dir, 'spacex_api_raw.csv')
    api_df.to_csv(api_path, index=False)
    print('✅ SpaceX API data saved to', api_path, 'shape=', api_df.shape)
except Exception as e:
    print('✖ Failed to fetch SpaceX API data:', e)
    api_df = None

# 2) Wikipedia data (with archived fallback)
# Same best-effort pattern: wiki_df is None when fetching or saving fails.
try:
    wiki_df = fetch_wikipedia_falcon9(archived=True)
    wiki_path = os.path.join(out_dir, 'spacex_wikipedia_raw.csv')
    wiki_df.to_csv(wiki_path, index=False)
    print('✅ Wikipedia table saved to', wiki_path, 'shape=', wiki_df.shape)
except Exception as e:
    print('✖ Failed to fetch Wikipedia table:', e)
    wiki_df = None

# quick previews
# `display` is provided by IPython/Jupyter; it is not defined in a plain Python script.
if api_df is not None:
    display(api_df.head())
if wiki_df is not None:
    display(wiki_df.head())


In [None]:
# --------------------
# Data Cleaning & Normalization (starter template)
# --------------------

import re

def _canonical_wiki_column(name):
    """Map a normalized table header to its canonical column name, or None if unrecognized."""
    lc = name.lower()
    # Order matters: earlier rules win when a header matches several keywords.
    if 'date' in lc:
        return 'Date'
    if 'rocket' in lc:
        return 'Rocket'
    if 'launch' in lc and 'site' in lc:
        return 'Launch_Site'
    if 'payload' in lc:
        return 'Payload_Mass' if ('mass' in lc or 'kg' in lc) else 'Payload'
    if 'orbit' in lc:
        return 'Orbit'
    if 'outcome' in lc or 'result' in lc:
        return 'Outcome'
    if 'customer' in lc or 'operator' in lc:
        return 'Customer'
    return None


def clean_wiki_table(df):
    """Normalize a scraped launch table into a small set of canonical columns.

    Headers are whitespace-normalized, matched by keyword to the canonical
    names ``Date``, ``Rocket``, ``Launch_Site``, ``Payload``, ``Payload_Mass``,
    ``Orbit``, ``Outcome`` and ``Customer``, and every other column is dropped.
    When several source columns map to the same canonical name, only the
    first one is kept.

    Args:
        df: The raw DataFrame; it is not modified.

    Returns:
        A new DataFrame holding the recognized columns in the canonical order
        above, with ``Date`` parsed to datetime (unparseable values become
        NaT), ``Payload_Mass`` converted to a number (first numeric token of
        each cell, NaN when there is none) and ``Outcome`` title-cased
        (missing values stay missing).
    """
    df = df.copy()
    df.columns = [re.sub(r'\s+', '_', str(c).strip()).strip('_') for c in df.columns]

    mapping = {}
    for col in df.columns:
        target = _canonical_wiki_column(col)
        if target is not None:
            mapping[col] = target
    df = df.rename(columns=mapping)

    # Two source headers can map to the same canonical name (e.g. two date
    # columns); keep the first so that df['Date'] is always a single Series.
    df = df.loc[:, ~df.columns.duplicated()]

    keep = [v for v in ['Date','Rocket','Launch_Site','Payload','Payload_Mass','Orbit','Outcome','Customer'] if v in df.columns]
    # Explicit copy so the assignments below never write into a view.
    df = df[keep].copy()

    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    if 'Payload_Mass' in df.columns:
        mass = df['Payload_Mass'].astype(str)
        # Drop bracketed footnote markers such as "[12]" so their digits are
        # not glued onto the mass value.
        mass = mass.str.replace(r'\[[^\]]*\]', '', regex=True)
        # Remove thousands separators, then take only the first number so
        # unit conversions like "(27,620 lb)" are ignored.
        mass = mass.str.replace(',', '', regex=False)
        mass = mass.str.extract(r'(-?\d+(?:\.\d+)?)', expand=False)
        df['Payload_Mass'] = pd.to_numeric(mass, errors='coerce')

    if 'Outcome' in df.columns:
        outcome = df['Outcome']
        # Title-case real values only; missing entries must not turn into
        # the literal strings "Nan" / "None".
        df['Outcome'] = outcome.where(outcome.isna(), outcome.astype(str).str.title())

    return df

# Example usage (run after wiki_df exists):
# cleaned_wiki = clean_wiki_table(wiki_df)
# cleaned_wiki.to_csv('data/spacex_wikipedia_cleaned.csv', index=False)


In [None]:
# --------------------
# Starter EDA (visualizations)
# --------------------

import os
# Prefer the CSV files written by the collection cell; if a file is missing,
# whatever an earlier cell left in memory (if anything) is used instead.
api_path = os.path.join('data', 'spacex_api_raw.csv')
wiki_path = os.path.join('data', 'spacex_wikipedia_raw.csv')

if os.path.exists(api_path):
    api_df = pd.read_csv(api_path)
    print('API data shape:', api_df.shape)
if os.path.exists(wiki_path):
    wiki_df = pd.read_csv(wiki_path)
    print('Wiki data shape:', wiki_df.shape)

# api_df can be undefined (collection cell never run) or None (collection
# failed); check once so the plots below are skipped instead of crashing.
api_available = globals().get('api_df') is not None
if not api_available:
    print('API data not available: run the data collection cell first.')

if api_available and 'date_utc' in api_df.columns:
    api_df['date_utc'] = pd.to_datetime(api_df['date_utc'], errors='coerce')
    api_df['year'] = api_df['date_utc'].dt.year
    plt.figure(figsize=(10,4))
    sns.countplot(x='year', data=api_df)
    plt.title('Number of launches per year (API)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

if api_available and 'success' in api_df.columns:
    plt.figure(figsize=(6,4))
    # Rows whose outcome is neither True nor False map to NaN and are not counted.
    sns.countplot(x=api_df['success'].map({True:'Success', False:'Failure'}))
    plt.title('Launch success / failure (API)')
    plt.show()

# The cleaned CSV is written by the final cell of this notebook, so on a first
# top-to-bottom run it does not exist yet; say so instead of skipping silently.
cleaned_wiki_path = os.path.join('data', 'spacex_wikipedia_cleaned.csv')
if os.path.exists(cleaned_wiki_path):
    cw = pd.read_csv(cleaned_wiki_path)
    if 'Payload_Mass' in cw.columns:
        plt.figure(figsize=(8,4))
        sns.histplot(cw['Payload_Mass'].dropna(), bins=25, kde=True)
        plt.title('Payload mass distribution (Wikipedia table)')
        plt.show()
else:
    print('Cleaned Wikipedia CSV not found at', cleaned_wiki_path,
          '- run the final cell to create it, then re-run this cell.')


In [None]:
# --------------------
# Modeling: Simple baseline classifier (example)
# --------------------

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# api_df can be undefined (collection cell never run) or None (collection failed).
_api_frame = globals().get('api_df')
if _api_frame is not None and 'date_utc' in _api_frame.columns and 'success' in _api_frame.columns:
    # Build the modeling table separately so api_df itself is left untouched.
    # Rows with an unparseable date or an unknown outcome are dropped rather
    # than being fed to the model as year 0 / failure.
    model_df = pd.DataFrame({
        'year': pd.to_datetime(_api_frame['date_utc'], errors='coerce').dt.year,
        'success': _api_frame['success'],
    }).dropna()

    if len(model_df) < 2:
        print('Baseline model not executed: not enough rows with both a date and an outcome.')
    else:
        X = model_df[['year']].astype(int)
        # Missing values are already removed, so the integer cast cannot hit NaN.
        y = model_df['success'].astype(bool).astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('Baseline Decision Tree (year) accuracy:', accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))
else:
    print('Baseline model not executed: required columns missing (date_utc, success).')


In [None]:
# --------------------
# Save cleaned data and final notes
# --------------------

# wiki_df can be undefined (collection cell never run) or None (collection
# failed); both cases are reported plainly instead of surfacing as an
# AttributeError from the cleaning step.
if globals().get('wiki_df') is None:
    print('No Wikipedia table in memory; skipping the cleaned export.')
else:
    try:
        cleaned = clean_wiki_table(wiki_df)
        cleaned_path = os.path.join('data', 'spacex_wikipedia_cleaned.csv')
        # Make sure the output folder exists even if the collection cell was skipped.
        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
        cleaned.to_csv(cleaned_path, index=False)
        print('Saved cleaned wikipedia table to', cleaned_path)
    except Exception as e:
        # Best-effort export: report the problem and still print the closing notes.
        print('Could not clean or save wikipedia table:', e)

print('\nNext steps:')
print('- Review cleaned CSVs in /data folder.')
print('- Create separate notebooks for EDA, SQL queries, Folium map, and Dash dashboard as required by capstone rubric.')
print('- Commit .ipynb, .py, cleaned CSVs, presentation PDF, and README.md to your GitHub repo.')
