# Data Collection

In [1]:
# requests performs the HTTP calls to the SpaceX API; pandas/numpy hold and clean the
# results; datetime is used further down to build the launch-date cutoff.
import requests
import pandas as pd
import numpy as np
import datetime
# Visible confirmation that the import cell ran to completion.
print("Imported Libraries")

Imported Libraries


In [2]:
# Remove pandas' display limits: show every column and never truncate cell contents.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
print("Set Options")

Set Options


## APIs

In [3]:
# Takes the dataset and uses the rocket column to call the API and append the data to the list
def getBoosterVersion(data):
    """Append the booster name for each entry of ``data['rocket']`` to ``BoosterVersion``.

    ``BoosterVersion`` is a module-level list that must exist before the call.
    Exactly one value is appended per row so the list stays aligned with ``data``:
    the rocket's ``name`` from the SpaceX API, or ``None`` when the row has no
    rocket id.

    Args:
        data: Mapping or DataFrame whose ``'rocket'`` entry iterates over rocket ids.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the API response has no ``'name'`` field.
    """
    # Many launches share a rocket, so remember each id's name and fetch it only once.
    names_by_id = {}
    for x in data['rocket']:
        if not x:
            # Skipping the row would shift every later value up by one position.
            BoosterVersion.append(None)
            continue
        rocket_id = str(x)
        if rocket_id not in names_by_id:
            response = requests.get(
                "https://api.spacexdata.com/v4/rockets/" + rocket_id,
                timeout=30,  # do not hang the kernel if the API stalls
            )
            response.raise_for_status()
            names_by_id[rocket_id] = response.json()['name']
        BoosterVersion.append(names_by_id[rocket_id])

In [4]:
# Takes the dataset and uses the launchpad column to call the API and append the data to the list
def getLaunchSite(data):
    """Append longitude, latitude and site name for each entry of ``data['launchpad']``.

    Values go to the module-level lists ``Longitude``, ``Latitude`` and
    ``LaunchSite``, which must exist before the call. One value is appended to
    each list per row so they stay aligned with ``data``; rows without a
    launchpad id contribute ``None`` to all three lists.

    Args:
        data: Mapping or DataFrame whose ``'launchpad'`` entry iterates over launchpad ids.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the API response lacks ``'longitude'``, ``'latitude'`` or ``'name'``.
    """
    # Launchpads repeat across launches; fetch each id once and reuse the result.
    sites_by_id = {}
    for x in data['launchpad']:
        if not x:
            # Keep all three lists row-aligned instead of silently dropping the row.
            Longitude.append(None)
            Latitude.append(None)
            LaunchSite.append(None)
            continue
        launchpad_id = str(x)
        if launchpad_id not in sites_by_id:
            response = requests.get(
                "https://api.spacexdata.com/v4/launchpads/" + launchpad_id,
                timeout=30,  # do not hang the kernel if the API stalls
            )
            response.raise_for_status()
            site = response.json()
            sites_by_id[launchpad_id] = (site['longitude'], site['latitude'], site['name'])
        longitude, latitude, name = sites_by_id[launchpad_id]
        Longitude.append(longitude)
        Latitude.append(latitude)
        LaunchSite.append(name)

In [5]:
def getPayloadData(data):
    """Append mass and orbit for each entry of ``data['payloads']``.

    Values go to the module-level lists ``PayloadMass`` and ``Orbit``, which must
    exist before the call. One value is appended to each list per row so they
    stay aligned with ``data``; rows without a payload id contribute ``None``
    to both lists.

    Args:
        data: Mapping or DataFrame whose ``'payloads'`` entry iterates over payload ids.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the API response lacks ``'mass_kg'`` or ``'orbit'``.
    """
    for load in data['payloads']:
        if not load:
            # Keep both lists row-aligned instead of silently dropping the row.
            PayloadMass.append(None)
            Orbit.append(None)
            continue
        response = requests.get(
            "https://api.spacexdata.com/v4/payloads/" + str(load),
            timeout=30,  # do not hang the kernel if the API stalls
        )
        response.raise_for_status()
        payload = response.json()
        PayloadMass.append(payload['mass_kg'])
        Orbit.append(payload['orbit'])

In [6]:
# Takes the dataset and uses the cores column to call the API and append the data to the lists
def getCoreData(data):
    """Append per-core details for each entry of ``data['cores']``.

    Every entry is a dict describing one core of a launch. For each entry one
    value is appended to each of the module-level lists ``Block``,
    ``ReusedCount``, ``Serial``, ``Outcome``, ``Flights``, ``GridFins``,
    ``Reused``, ``Legs`` and ``LandingPad`` (all must exist before the call).
    ``Block``, ``ReusedCount`` and ``Serial`` come from the SpaceX cores API and
    are ``None`` when the entry's ``'core'`` id is ``None``; the remaining
    values are read straight from the entry.

    Args:
        data: Mapping or DataFrame whose ``'cores'`` entry iterates over core dicts.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If a core dict or an API response lacks an expected field.
    """
    # The same core can appear in several launches; fetch each id only once.
    details_by_id = {}
    for core in data['cores']:
        core_id = core['core']
        if core_id is not None:
            if core_id not in details_by_id:
                response = requests.get(
                    "https://api.spacexdata.com/v4/cores/" + core_id,
                    timeout=30,  # do not hang the kernel if the API stalls
                )
                response.raise_for_status()
                details = response.json()
                details_by_id[core_id] = (
                    details['block'],
                    details['reuse_count'],
                    details['serial'],
                )
            block, reuse_count, serial = details_by_id[core_id]
        else:
            block = reuse_count = serial = None
        Block.append(block)
        ReusedCount.append(reuse_count)
        Serial.append(serial)
        # Outcome is "<landing_success> <landing_type>", e.g. "True ASDS" or "None None".
        Outcome.append(str(core['landing_success'])+' '+str(core['landing_type']))
        Flights.append(core['flight'])
        GridFins.append(core['gridfins'])
        Reused.append(core['reused'])
        Legs.append(core['legs'])
        LandingPad.append(core['landpad'])

## Task 1: Request and parse the SpaceX launch data using a GET request

In [7]:
# Endpoint listing every past SpaceX launch.
spacex_url="https://api.spacexdata.com/v4/launches/past"
# The timeout keeps the kernel from hanging indefinitely if the API is unreachable.
response = requests.get(spacex_url, timeout=30)
# Stop here on an error status rather than parsing an error payload in the next cell.
response.raise_for_status()
# Displayed as the cell result.
response.status_code

200

In [8]:
# Flatten the JSON launch records into a DataFrame; nested objects become dotted
# column names (e.g. fairings.reused, links.patch.small).
data = pd.json_normalize(response.json())
# Displayed as the cell result: (rows, columns).
data.shape

(187, 43)

In [9]:
# Preview the first launch record to see which columns the API returned.
data.head(1)

Unnamed: 0,static_fire_date_utc,static_fire_date_unix,net,window,rocket,success,failures,details,crew,ships,capsules,payloads,launchpad,flight_number,name,date_utc,date_unix,date_local,date_precision,upcoming,cores,auto_update,tbd,launch_library_id,id,fairings.reused,fairings.recovery_attempt,fairings.recovered,fairings.ships,links.patch.small,links.patch.large,links.reddit.campaign,links.reddit.launch,links.reddit.media,links.reddit.recovery,links.flickr.small,links.flickr.original,links.presskit,links.webcast,links.youtube_id,links.article,links.wikipedia,fairings
0,2006-03-17T00:00:00.000Z,1142554000.0,False,0.0,5e9d0d95eda69955f709d1eb,False,"[{'time': 33, 'altitude': None, 'reason': 'merlin engine failure'}]",Engine failure at 33 seconds and loss of vehicle,[],[],[],[5eb0e4b5b6c3bb0006eeb1e1],5e9e4502f5090995de566f86,1,FalconSat,2006-03-24T22:30:00.000Z,1143239400,2006-03-25T10:30:00+12:00,hour,False,"[{'core': '5e9e289df35918033d3b2623', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}]",True,False,,5eb87cd9ffd86e000604b32a,False,False,False,[],https://images2.imgbox.com/94/f2/NN6Ph45r_o.png,https://images2.imgbox.com/5b/02/QcxHUb5V_o.png,,,,,[],[],,https://www.youtube.com/watch?v=0a_00nJ_Y88,0a_00nJ_Y88,https://www.space.com/2196-spacex-inaugural-falcon-1-rocket-lost-launch.html,https://en.wikipedia.org/wiki/DemoSat,


## Pre-processing

In [10]:
# Keep only the columns needed downstream: the fields used for the follow-up API
# lookups (rocket, payloads, launchpad, cores) plus flight_number and date_utc.
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

In [11]:
# Keep only launches with exactly one core and exactly one payload: multi-core rows are
# Falcon rockets with two extra boosters, and multi-payload rows carry several payloads.
data = data[data['cores'].map(len)==1]
# .copy() makes `data` an independent frame, so the column assignments in the following
# cells modify it directly instead of a filtered slice (avoids SettingWithCopyWarning).
data = data[data['payloads'].map(len)==1].copy()

In [12]:
# Each remaining 'cores' and 'payloads' value is a one-element list; replace the list
# with the element it contains.
for column in ('cores', 'payloads'):
    data[column] = data[column].map(lambda items: items[0])

In [13]:
# Parse date_utc into datetimes and keep only the calendar date (the time of day is
# dropped); the result is stored in a new 'date' column.
data['date'] = pd.to_datetime(data['date_utc']).dt.date

In [14]:
# Keep only launches dated on or before 2020-11-13 (the comparison is inclusive).
data = data[data['date'] <= datetime.date(2020, 11, 13)]

In [15]:
# Call getBoosterVersion
# The list is re-created on every run so re-executing this cell does not append
# duplicates; getBoosterVersion fills this module-level list in place.
BoosterVersion = []
getBoosterVersion(data)
# Number of booster names collected.
print(len(BoosterVersion))

94


In [None]:
# Call getLaunchSite
# Start from three fresh, independent lists; getLaunchSite appends to them in place.
Longitude, Latitude, LaunchSite = [], [], []
getLaunchSite(data)
# Print the three list lengths on one line.
print(*(len(values) for values in (Longitude, Latitude, LaunchSite)))

In [None]:
# Call getPayloadData
# Start from two fresh, independent lists; getPayloadData appends to them in place.
PayloadMass, Orbit = [], []
getPayloadData(data)
# Print both list lengths on one line.
print(*(len(values) for values in (PayloadMass, Orbit)))

In [None]:
# Call getCoreData
# Fresh lists for the values read directly from each core record...
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
# ...and for the values looked up through the cores API.
Block = []
ReusedCount = []
Serial = []
getCoreData(data)
# Print the list lengths, one line per group.
print(*map(len, (Outcome, Flights, GridFins, Reused, Legs, LandingPad)))
print(*map(len, (Block, ReusedCount, Serial)))

## Populate Dataframe

In [None]:
# Column name -> list of values for the final DataFrame. The keyword order below is
# the resulting column order.
launch_dict = dict(
    FlightNumber=list(data['flight_number']),
    Date=list(data['date']),
    BoosterVersion=BoosterVersion,
    PayloadMass=PayloadMass,
    Orbit=Orbit,
    LaunchSite=LaunchSite,
    Outcome=Outcome,
    Flights=Flights,
    GridFins=GridFins,
    Reused=Reused,
    Legs=Legs,
    LandingPad=LandingPad,
    Block=Block,
    ReusedCount=ReusedCount,
    Serial=Serial,
    Longitude=Longitude,
    Latitude=Latitude,
)

In [None]:
# Check data lengths
# Walks the columns in dict order and prints a column's name and length whenever that
# length differs from the previous column's. `length` starts at 0, so the first
# non-empty column is always printed; any further printed line marks a column whose
# length differs from the one before it.
length = 0
for item, value in launch_dict.items():
    if length != len(value):
        print(item, 'count', len(value))
        # Remember this length as the new reference for the columns that follow.
        length = len(value)

In [None]:
# Create a DataFrame from launch_dict (one column per key).
df = pd.DataFrame(launch_dict)

In [None]:
# Show the first rows of the assembled DataFrame.
df.head()

## Task 2: Filter the dataframe to only include `Falcon 9` launches

In [None]:
# Keep only the rows whose BoosterVersion is exactly 'Falcon 9' (this drops every other
# booster, not just 'Falcon 1').
# .copy() gives data_falcon9 its own data, so the .loc assignments in later cells modify
# it directly instead of a filtered slice of df (avoids SettingWithCopyWarning).
data_falcon9 = df[df['BoosterVersion']=='Falcon 9'].copy()
data_falcon9[:3]

In [None]:
# (rows, columns) of the Falcon 9 subset.
data_falcon9.shape

In [None]:
# Renumber the flights 1..N so FlightNumber is consecutive within the Falcon 9 subset.
data_falcon9.loc[:, 'FlightNumber'] = list(range(1, len(data_falcon9) + 1))
data_falcon9.head(3)

## Task 3: Dealing with Missing Values

In [None]:
# Count the missing values in each column.
data_falcon9.isnull().sum()

In [None]:
# Fill missing PayloadMass values with the column mean, then round to one decimal place.
data_falcon9.loc[:, 'PayloadMass'] = (
    data_falcon9['PayloadMass']
    .fillna(data_falcon9['PayloadMass'].mean())
    .round(1)
)

In [None]:
# Show the first three rows after filling the missing payload masses.
data_falcon9[:3]

## Save Data to CSV File

In [None]:
# Write the cleaned Falcon 9 data to dataset_part_1.csv in the working directory;
# index=False leaves the DataFrame's row index out of the file.
data_falcon9.to_csv('dataset_part_1.csv', index=False)