In [1]:
import requests
import pandas as pd
import numpy as np
import datetime
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
print("All libraries have been imported.")

All libraries have been imported.


In [3]:
# NOTE: This code was provided.
# Takes the dataset and uses the rocket column to call the API and append the booster version to the DataFrame
def getBoosterVersion(data):
    for x in data['rocket']:
       if x:
        response = requests.get("https://api.spacexdata.com/v4/rockets/"+str(x)).json()
        BoosterVersion.append(response['name'])

In [4]:
# NOTE: This code was provided.
# Takes the dataset and uses the launchpad column to call the API and append the latitude and longitude to the DataFrame
def getLaunchSite(data):
    for x in data['launchpad']:
       if x:
         response = requests.get("https://api.spacexdata.com/v4/launchpads/"+str(x)).json()
         Longitude.append(response['longitude'])
         Latitude.append(response['latitude'])
         LaunchSite.append(response['name'])

In [5]:
# NOTE: This code was provided.
# Takes the dataset and uses the payloads column to call the API and append the payload mass to the DataFrame
def getPayloadData(data):
    for load in data['payloads']:
       if load:
        response = requests.get("https://api.spacexdata.com/v4/payloads/"+load).json()
        PayloadMass.append(response['mass_kg'])
        Orbit.append(response['orbit'])

In [6]:
# NOTE: This code was provided.
# Takes the dataset and uses the cores column to call the API and append the data about the cores to the DataFrame
def getCoreData(data):
    for core in data['cores']:
            if core['core'] != None:
                response = requests.get("https://api.spacexdata.com/v4/cores/"+core['core']).json()
                Block.append(response['block'])
                ReusedCount.append(response['reuse_count'])
                Serial.append(response['serial'])
            else:
                Block.append(None)
                ReusedCount.append(None)
                Serial.append(None)
            Outcome.append(str(core['landing_success'])+' '+str(core['landing_type']))
            Flights.append(core['flight'])
            GridFins.append(core['gridfins'])
            Reused.append(core['reused'])
            Legs.append(core['legs'])
            LandingPad.append(core['landpad'])

## Task 1: Request and parse the SpaceX launch data using the GET request

In [8]:
# Convert JSON file into DataFrame
static_json_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json'
response = requests.get(static_json_url)
response_json = response.json()
data_initial = pd.json_normalize(response_json)
data_initial.head(1)

Unnamed: 0,static_fire_date_utc,static_fire_date_unix,tbd,net,window,rocket,success,details,crew,ships,capsules,payloads,launchpad,auto_update,failures,flight_number,name,date_utc,date_unix,date_local,date_precision,upcoming,cores,id,fairings.reused,fairings.recovery_attempt,fairings.recovered,fairings.ships,links.patch.small,links.patch.large,links.reddit.campaign,links.reddit.launch,links.reddit.media,links.reddit.recovery,links.flickr.small,links.flickr.original,links.presskit,links.webcast,links.youtube_id,links.article,links.wikipedia,fairings
0,2006-03-17T00:00:00.000Z,1142554000.0,False,False,0.0,5e9d0d95eda69955f709d1eb,False,Engine failure at 33 seconds and loss of vehicle,[],[],[],[5eb0e4b5b6c3bb0006eeb1e1],5e9e4502f5090995de566f86,True,"[{'time': 33, 'altitude': None, 'reason': 'merlin engine failure'}]",1,FalconSat,2006-03-24T22:30:00.000Z,1143239400,2006-03-25T10:30:00+12:00,hour,False,"[{'core': '5e9e289df35918033d3b2623', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}]",5eb87cd9ffd86e000604b32a,False,False,False,[],https://images2.imgbox.com/3c/0e/T8iJcSN3_o.png,https://images2.imgbox.com/40/e3/GypSkayF_o.png,,,,,[],[],,https://www.youtube.com/watch?v=0a_00nJ_Y88,0a_00nJ_Y88,https://www.space.com/2196-spacex-inaugural-falcon-1-rocket-lost-launch.html,https://en.wikipedia.org/wiki/DemoSat,


In [9]:
data_initial.shape

(107, 42)

In [10]:
# View column names
pd.DataFrame(data_initial.columns)

Unnamed: 0,0
0,static_fire_date_utc
1,static_fire_date_unix
2,tbd
3,net
4,window
5,rocket
6,success
7,details
8,crew
9,ships


## DataFrame of Launch Data - Selected Information

In [11]:
# Lets take a subset of our dataframe keeping only the features we want and the flight number, and date_utc.
data = data_initial[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# We will remove rows with multiple cores because those are falcon rockets with 2 extra rocket boosters and rows that have multiple payloads in a single rocket.
data = data[data['cores'].map(len)==1]
data = data[data['payloads'].map(len)==1]

# Since payloads and cores are lists of size 1 we will also extract the single value in the list and replace the feature.
data['cores'] = data['cores'].map(lambda x : x[0])
data['payloads'] = data['payloads'].map(lambda x : x[0])

# We also want to convert the date_utc to a datetime datatype and then extracting the date leaving the time
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Using the date we will restrict the dates of the launches
data = data[data['date'] <= datetime.date(2020, 11, 13)]

In [12]:
# Set global variables to be empty lists
BoosterVersion = []
PayloadMass = []
Orbit = []
LaunchSite = []
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []
Longitude = []
Latitude = []

In [13]:
# Confirm list to be empty
BoosterVersion

[]

In [14]:
# Call getBoosterVersion
getBoosterVersion(data)

In [15]:
# Call getLaunchSite
getLaunchSite(data)

In [16]:
# Call getPayloadData
getPayloadData(data)

In [17]:
# Call getCoreData
getCoreData(data)

In [18]:
# The lists has now been updated
BoosterVersion[0:5]

['Falcon 1', 'Falcon 1', 'Falcon 1', 'Falcon 1', 'Falcon 9']

In [19]:
# Combine the columns into a dictionary
launch_dict = {'FlightNumber': list(data['flight_number']),
                'Date': list(data['date']),
                'BoosterVersion':BoosterVersion,
                'PayloadMass':PayloadMass,
                'Orbit':Orbit,
                'LaunchSite':LaunchSite,
                'Outcome':Outcome,
                'Flights':Flights,
                'GridFins':GridFins,
                'Reused':Reused,
                'Legs':Legs,
                'LandingPad':LandingPad,
                'Block':Block,
                'ReusedCount':ReusedCount,
                'Serial':Serial,
                'Longitude': Longitude,
                'Latitude': Latitude}

In [20]:
# Create a DataFrame from launch_dict
launch_df = pd.DataFrame(launch_dict)
launch_df.head(3)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1,2006-03-24,Falcon 1,20.0,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin1A,167.743129,9.047721
1,2,2007-03-21,Falcon 1,,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin2A,167.743129,9.047721
2,4,2008-09-28,Falcon 1,165.0,LEO,Kwajalein Atoll,None None,1,False,False,False,,,0,Merlin2C,167.743129,9.047721


In [21]:
launch_df.shape

(94, 17)

## TASK 2

In [22]:
# Quantify types of booster versions.
launch_df['BoosterVersion'].value_counts()

Falcon 9    90
Falcon 1     4
Name: BoosterVersion, dtype: int64

In [23]:
# Exclude all launches except those with the Falcon 9 booster.
data_falcon_9 = launch_df.loc[launch_df['BoosterVersion'].isin(['Falcon 9'])]
data_falcon_9.head(2)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
4,6,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
5,8,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857


In [24]:
# Confirm that only the Falcon 9 booster is included.
data_falcon_9['BoosterVersion'].value_counts()

Falcon 9    90
Name: BoosterVersion, dtype: int64

In [25]:
# Reset the FlightNumber column
data_falcon_9.loc[:,'FlightNumber'] = list(range(1, data_falcon_9.shape[0]+1))
data_falcon_9.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
4,1,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
5,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857


In [26]:
data_falcon_9.shape

(90, 17)

In [27]:
data_falcon_9.describe()

Unnamed: 0,FlightNumber,PayloadMass,Flights,Block,ReusedCount,Longitude,Latitude
count,90.0,85.0,90.0,90.0,90.0,90.0,90.0
mean,45.5,6123.547647,1.788889,3.5,3.188889,-86.366477,29.449963
std,26.124701,4870.916417,1.213172,1.595288,4.194417,14.149518,2.141306
min,1.0,350.0,1.0,1.0,0.0,-120.610829,28.561857
25%,23.25,2482.0,1.0,2.0,0.0,-80.603956,28.561857
50%,45.5,4535.0,1.0,4.0,1.0,-80.577366,28.561857
75%,67.75,9600.0,2.0,5.0,4.0,-80.577366,28.608058
max,90.0,15600.0,6.0,5.0,13.0,-80.577366,34.632093


## Data Wrangeling 

In [28]:
# There are some missing values in the dataset
data_falcon_9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        5
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

## Task 3: Dealing with Missing Values

In [32]:
# Calculate the mean value of the values in the PayloadMass column and replace the np.nan values with this mean value
mean = data_falcon_9['PayloadMass'].mean()
data_falcon_9['PayloadMass'].replace(np.nan, mean, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [34]:
# There are now no missing values for 'PayLoadMass'. We keep the 'None' values in the 'LandingPad' column to represent when landing pads were not used.
data_falcon_9.isnull().sum()

FlightNumber       0
Date               0
BoosterVersion     0
PayloadMass        0
Orbit              0
LaunchSite         0
Outcome            0
Flights            0
GridFins           0
Reused             0
Legs               0
LandingPad        26
Block              0
ReusedCount        0
Serial             0
Longitude          0
Latitude           0
dtype: int64

## Export DataFrame to .CSV

In [35]:
# Export DataFrame as .csv
data_falcon_9.to_csv('dataset_part_1.csv', index=False)