In [1]:
import requests
import pandas as pd
import numpy as np
import datetime

# set pandas to print all columns with all data in the DF
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Helper functions

In [2]:
# Takes the dataset and uses the rocket column to call the API and append the data to the list
def getBoosterVersion(data):
    for x in data['rocket']:
       if x:
        response = requests.get("https://api.spacexdata.com/v4/rockets/"+str(x)).json()
        BoosterVersion.append(response['name'])

# Takes the dataset and uses the launchpad column to call the API and append the data to the list
def getLaunchSite(data):
    for x in data['launchpad']:
       if x:
         response = requests.get("https://api.spacexdata.com/v4/launchpads/"+str(x)).json()
         Longitude.append(response['longitude'])
         Latitude.append(response['latitude'])
         LaunchSite.append(response['name'])

# Takes the dataset and uses the payloads column to call the API and append the data to the lists
def getPayloadData(data):
    for load in data['payloads']:
       if load:
        response = requests.get("https://api.spacexdata.com/v4/payloads/"+load).json()
        PayloadMass.append(response['mass_kg'])
        Orbit.append(response['orbit'])

# Takes the dataset and uses the cores column to call the API and append the data to the lists
def getCoreData(data):
    for core in data['cores']:
            if core['core'] != None:
                response = requests.get("https://api.spacexdata.com/v4/cores/"+core['core']).json()
                Block.append(response['block'])
                ReusedCount.append(response['reuse_count'])
                Serial.append(response['serial'])
            else:
                Block.append(None)
                ReusedCount.append(None)
                Serial.append(None)
            Outcome.append(str(core['landing_success'])+' '+str(core['landing_type']))
            Flights.append(core['flight'])
            GridFins.append(core['gridfins'])
            Reused.append(core['reused'])
            Legs.append(core['legs'])
            LandingPad.append(core['landpad'])

In [5]:
# request rocket launch data from SpaceX API
#spacex_url = "https://api.spacexdata.com/v4/launches/past"
spacex_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json"
response = requests.get(spacex_url)

In [16]:
# convert the JSON results into pandas DF
API_data = response.json()
data = pd.json_normalize(API_data)
#print(data)

In [17]:
# Lets take a subset of our dataframe keeping only the features we want and the flight number, and date_utc.
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# We will remove rows with multiple cores because those are falcon rockets with 2 extra rocket boosters and rows that have multiple payloads in a single rocket.
data = data[data['cores'].map(len)==1]
data = data[data['payloads'].map(len)==1]

# Since payloads and cores are lists of size 1 we will also extract the single value in the list and replace the feature.
data['cores'] = data['cores'].map(lambda x : x[0])
data['payloads'] = data['payloads'].map(lambda x : x[0])

# We also want to convert the date_utc to a datetime datatype and then extracting the date leaving the time
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Using the date we will restrict the dates of the launches
data = data[data['date'] <= datetime.date(2020, 11, 13)]

In [18]:
#Global variables 
BoosterVersion = []
PayloadMass = []
Orbit = []
LaunchSite = []
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []
Longitude = []
Latitude = []

In [21]:
# populate lists with info converting IDs to names through API

getBoosterVersion(data)
getLaunchSite(data)
getPayloadData(data)
getCoreData(data)

In [32]:
# construct the dataset in a dictionary using the populated lists
launch_dict = {'FlightNumber': list(data['flight_number']),
               'Date': list(data['date']),
               'BoosterVersion': BoosterVersion,
               'PayloadMass': PayloadMass,
               'Orbit': Orbit,
               'LaunchSite': LaunchSite,
               'Outcome': Outcome,
               'Flights':Flights,
               'GridFins':GridFins,
               'Reused':Reused,
               'Legs':Legs,
               'LandingPad':LandingPad,
               'Block':Block,
               'ReusedCount':ReusedCount,
               'Serial':Serial,
               'Longitude': Longitude,
               'Latitude': Latitude}

In [33]:
# make the pandas df from the dictionary
df = pd.DataFrame({ key:pd.Series(value) for key, value in launch_dict.items() })
df.head(10)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1.0,2006-03-24,Falcon 1,20.0,LEO,Kwajalein Atoll,None None,1.0,False,False,False,,,0.0,Merlin1A,167.743129,9.047721
1,2.0,2007-03-21,Falcon 1,,LEO,Kwajalein Atoll,None None,1.0,False,False,False,,,0.0,Merlin2A,167.743129,9.047721
2,4.0,2008-09-28,Falcon 1,165.0,LEO,Kwajalein Atoll,None None,1.0,False,False,False,,,0.0,Merlin2C,167.743129,9.047721
3,5.0,2009-07-13,Falcon 1,200.0,LEO,Kwajalein Atoll,None None,1.0,False,False,False,,,0.0,Merlin3C,167.743129,9.047721
4,6.0,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0003,-80.577366,28.561857
5,8.0,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0005,-80.577366,28.561857
6,10.0,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0007,-80.577366,28.561857
7,11.0,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1.0,False,False,False,,1.0,0.0,B1003,-120.610829,34.632093
8,12.0,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1004,-80.577366,28.561857
9,13.0,2014-01-06,Falcon 9,3325.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1005,-80.577366,28.561857


In [43]:
# filter out any launches that are NOT Falcon 9
data_falcon9_step = df.loc[df['BoosterVersion'] != 'Falcon 1']
# remove NaN entries
data_falcon9 = data_falcon9_step.loc[data_falcon9_step['Date'].notna()]
#data_falcon9.head(10)

In [44]:
# reset the FlightNumber column
data_falcon9.loc[:,'FlightNumber'] = list(range(1, data_falcon9.shape[0]+1))
data_falcon9

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
4,1.0,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0003,-80.577366,28.561857
5,2.0,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0005,-80.577366,28.561857
6,3.0,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0007,-80.577366,28.561857
7,4.0,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1.0,False,False,False,,1.0,0.0,B1003,-120.610829,34.632093
8,5.0,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1004,-80.577366,28.561857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,86.0,2020-09-03,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,2.0,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,12.0,B1060,-80.603956,28.608058
90,87.0,2020-10-06,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,3.0,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,13.0,B1058,-80.603956,28.608058
91,88.0,2020-10-18,Falcon 9,15600.0,VLEO,KSC LC 39A,True ASDS,6.0,True,True,True,5e9e3032383ecb6bb234e7ca,5.0,12.0,B1051,-80.603956,28.608058
92,89.0,2020-10-24,Falcon 9,15600.0,VLEO,CCSFS SLC 40,True ASDS,3.0,True,True,True,5e9e3033383ecbb9e534e7cc,5.0,12.0,B1060,-80.577366,28.561857


In [45]:
# check for other NaNs
data_falcon9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        5
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

In [46]:
# replace the NaNs in PayloadMass with the mean payload mass value
# Calculate the mean value of PayloadMass column
meanPM = data_falcon9['PayloadMass'].mean()

# Replace the np.nan values with its mean value
data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].replace(np.nan, meanPM)
data_falcon9.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].replace(np.nan, meanPM)


FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        0
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

In [47]:
# export to csv
data_falcon9.to_csv('dataset_part_1.csv', index=False)