# In [14]:  (Jupyter notebook cell prompt)
import datetime
import logging
from time import sleep

import numpy as np
import pandas as pd
import requests

class SpaceXDataExtractor:
    """Build a launch dataset from a static launches JSON plus the SpaceX v4 API.

    Intended call order: ``load_base_data()`` -> ``enrich_all()`` ->
    ``finalize_dataset()`` -> ``save_to_csv()``.

    Attributes:
        static_json_url: URL of the static JSON file holding the launch records.
        df: Working ``pandas.DataFrame``; ``None`` until ``load_base_data`` runs.
    """

    # Root of the REST API used to resolve ids into readable attributes.
    API_BASE_URL = "https://api.spacexdata.com/v4"
    # Seconds to wait on any single HTTP call so a stalled connection cannot
    # hang the whole run.
    REQUEST_TIMEOUT = 10
    # Pause after every real API request to avoid hitting the API too fast.
    REQUEST_DELAY = 0.1
    # Launches after this date are dropped from the base data.
    CUTOFF_DATE = datetime.date(2020, 11, 13)

    def __init__(self, static_json_url):
        """Store the base-data URL; no network access happens here."""
        self.static_json_url = static_json_url
        self.df = None
        # URL -> decoded JSON record, so each record is downloaded only once.
        self._api_cache = {}

    def _require_data(self):
        """Raise RuntimeError if the base data has not been loaded yet."""
        if self.df is None:
            raise RuntimeError("No data loaded; call load_base_data() first")

    def _fetch_record(self, endpoint, item_id):
        """Return the JSON object for ``endpoint/item_id``, or None on failure.

        Successful responses are cached per URL. Failures are not cached, so a
        later row carrying the same id gets another attempt.
        """
        # A missing id (None / NaN) can never resolve; skip the round trip.
        if item_id is None or (isinstance(item_id, float) and item_id != item_id):
            return None

        url = f"{self.API_BASE_URL}/{endpoint}/{item_id}"
        # setdefault keeps this working for subclasses that override __init__
        # without calling super().__init__().
        cache = self.__dict__.setdefault("_api_cache", {})
        if url in cache:
            return cache[url]

        record = None
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code == 200:
                record = response.json()
            else:
                logging.getLogger(__name__).warning(
                    "GET %s returned HTTP %s", url, response.status_code
                )
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a body that is not valid JSON.
            logging.getLogger(__name__).warning("GET %s failed: %s", url, exc)
        finally:
            sleep(self.REQUEST_DELAY)

        if isinstance(record, dict):
            cache[url] = record
            return record
        return None

    def load_base_data(self):
        """Download the static launch JSON and reduce it to the rows of interest.

        Keeps only launches with exactly one core and one payload, unwraps those
        single-element lists, adds a ``date`` column and drops launches after
        ``CUTOFF_DATE``. The result is stored in ``self.df``.

        Raises:
            RuntimeError: If the server does not answer with HTTP 200.
            requests.RequestException: On connection errors or timeout.
        """
        response = requests.get(self.static_json_url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to fetch base data (HTTP {response.status_code})"
            )
        launches = pd.json_normalize(response.json())

        # Subset required columns
        launches = launches[
            ['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']
        ]

        # Keep single-core, single-payload launches; copy so the column
        # assignments below write to an independent frame, not a slice view.
        launches = launches[launches['cores'].map(len) == 1]
        launches = launches[launches['payloads'].map(len) == 1].copy()
        launches['cores'] = launches['cores'].map(lambda items: items[0])
        launches['payloads'] = launches['payloads'].map(lambda items: items[0])

        # Parse date
        launches['date'] = pd.to_datetime(launches['date_utc']).dt.date

        # Filter date; self.df is only replaced once every step has succeeded.
        self.df = launches[launches['date'] <= self.CUTOFF_DATE].copy()

    def api_enrich(self, ids, endpoint, attribute):
        """Look up ``attribute`` for every id via ``{API_BASE_URL}/{endpoint}/{id}``.

        Args:
            ids: Iterable of record ids; duplicates are served from the cache.
            endpoint: API collection name, e.g. ``"rockets"``.
            attribute: Key to read from each returned JSON object.

        Returns:
            A list aligned with ``ids``; entries are None where the id is
            missing, the request failed, or the attribute is absent.
        """
        enriched_data = []
        for item_id in ids:
            record = self._fetch_record(endpoint, item_id)
            enriched_data.append(record.get(attribute) if record else None)
        return enriched_data

    def enrich_all(self):
        """Add the rocket, launchpad, payload and core columns to ``self.df``."""
        self._require_data()
        self.df['BoosterVersion'] = self.api_enrich(self.df['rocket'], "rockets", "name")
        self.df['LaunchSite'] = self.api_enrich(self.df['launchpad'], "launchpads", "name")
        self.df['Longitude'] = self.api_enrich(self.df['launchpad'], "launchpads", "longitude")
        self.df['Latitude'] = self.api_enrich(self.df['launchpad'], "launchpads", "latitude")
        self.df['PayloadMass'] = self.api_enrich(self.df['payloads'], "payloads", "mass_kg")
        self.df['Orbit'] = self.api_enrich(self.df['payloads'], "payloads", "orbit")

        # Core data enrichment separately because nested structure
        self.get_core_data()

    def get_core_data(self):
        """Add core-related columns to ``self.df``.

        ``Block``, ``ReusedCount`` and ``Serial`` come from the ``cores`` API
        endpoint, keyed by the ``core`` id inside each row's core entry. The
        remaining columns are read straight from that entry.
        """
        self._require_data()
        # Output column -> key in the API's core record.
        api_fields = {"Block": "block", "ReusedCount": "reuse_count", "Serial": "serial"}
        # Output column -> key in the launch's own core entry.
        launch_fields = {
            "Flights": "flight",
            "GridFins": "gridfins",
            "Reused": "reused",
            "Legs": "legs",
            "LandingPad": "landpad",
        }
        columns = {name: [] for name in (*api_fields, "Outcome", *launch_fields)}

        for core in self.df['cores']:
            # Each entry is a dict; the API id lives under its 'core' key.
            record = self._fetch_record("cores", core.get('core')) or {}
            for column, key in api_fields.items():
                columns[column].append(record.get(key))
            columns["Outcome"].append(
                f"{core.get('landing_success')} {core.get('landing_type')}"
            )
            for column, key in launch_fields.items():
                columns[column].append(core.get(key))

        for column, values in columns.items():
            self.df[column] = values

    def finalize_dataset(self):
        """Return a Falcon 9-only copy of the data, ready for export.

        Rows whose ``BoosterVersion`` is ``'Falcon 9'`` are kept, a sequential
        ``FlightNumber`` starting at 1 is added, and missing ``PayloadMass``
        values are replaced by the column mean. ``self.df`` is not modified.
        """
        self._require_data()
        # Filter only Falcon 9 launches
        falcon9_df = self.df[self.df['BoosterVersion'] == 'Falcon 9'].copy()

        # Reset Flight Numbers
        falcon9_df['FlightNumber'] = list(range(1, len(falcon9_df) + 1))

        # Coerce first: a column built from None-heavy lists can be object
        # dtype, on which mean() is unreliable.
        payload_mass = pd.to_numeric(falcon9_df['PayloadMass'], errors='coerce')
        falcon9_df['PayloadMass'] = payload_mass.fillna(payload_mass.mean())

        return falcon9_df

    def save_to_csv(self, df, filename="dataset_part_1.csv"):
        """Write ``df`` to ``filename`` as CSV without the index column."""
        df.to_csv(filename, index=False)


# --- Usage Example ---
# When run as a script, executes the whole pipeline: load the base launch JSON,
# enrich it through the SpaceX API, keep the Falcon 9 rows, write them to CSV
# and print the first rows of the result.
if __name__ == "__main__":
    # Static JSON file that seeds the pipeline.
    static_json_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json'

    extractor = SpaceXDataExtractor(static_json_url)
    extractor.load_base_data()
    # Issues HTTP requests to api.spacexdata.com to fill the derived columns.
    extractor.enrich_all()
    final_df = extractor.finalize_dataset()
    # No filename is passed, so the default "dataset_part_1.csv" is used.
    extractor.save_to_csv(final_df)
    print(final_df.head())


# Captured notebook output: KeyboardInterrupt (the run was interrupted).