In [2]:
import pandas as pd
import numpy as np

In [4]:
# Read the launch records previously saved from the API and store the
# flight counter as an integer column, then preview the first ten rows.
df = pd.read_csv('./dataset_part_1.csv').astype({'FlightNumber': int})
df.head(10)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1,2010-06-04,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0003,-80.577366,28.561857
1,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0005,-80.577366,28.561857
2,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0007,-80.577366,28.561857
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1.0,False,False,False,,1.0,0.0,B1003,-120.610829,34.632093
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1004,-80.577366,28.561857
5,6,2014-01-06,Falcon 9,3325.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1005,-80.577366,28.561857
6,7,2014-04-18,Falcon 9,2296.0,ISS,CCSFS SLC 40,True Ocean,1.0,False,False,True,,1.0,0.0,B1006,-80.577366,28.561857
7,8,2014-07-14,Falcon 9,1316.0,LEO,CCSFS SLC 40,True Ocean,1.0,False,False,True,,1.0,0.0,B1007,-80.577366,28.561857
8,9,2014-08-05,Falcon 9,4535.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1008,-80.577366,28.561857
9,10,2014-09-07,Falcon 9,4428.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1011,-80.577366,28.561857


In [5]:
# share of missing entries in every column, expressed as a percentage
df.isna().mean() * 100

FlightNumber       0.000000
Date               0.000000
BoosterVersion     0.000000
PayloadMass        0.000000
Orbit              0.000000
LaunchSite         0.000000
Outcome            0.000000
Flights            0.000000
GridFins           0.000000
Reused             0.000000
Legs               0.000000
LandingPad        28.888889
Block              0.000000
ReusedCount        0.000000
Serial             0.000000
Longitude          0.000000
Latitude           0.000000
dtype: float64

In [6]:
# Show the dtype of every column so numerical columns (int64/float64/bool)
# can be told apart from categorical/text columns (object).
df.dtypes

FlightNumber        int64
Date               object
BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Outcome            object
Flights           float64
GridFins             bool
Reused               bool
Legs                 bool
LandingPad         object
Block             float64
ReusedCount       float64
Serial             object
Longitude         float64
Latitude          float64
dtype: object

In [9]:
### Count the number of launches at each launch site
# Site abbreviations used in the LaunchSite column:
#   CCSFS SLC 40 : Cape Canaveral Space Launch Complex 40
#   KSC LC 39A   : Kennedy Space Center Launch Complex 39A
#   VAFB SLC 4E  : Vandenberg Air Force Base Space Launch Complex 4E

df['LaunchSite'].value_counts()

LaunchSite
CCSFS SLC 40    55
KSC LC 39A      22
VAFB SLC 4E     13
Name: count, dtype: int64

In [11]:
### Count the number of occurrences of each orbit type
# Orbit abbreviations used in the Orbit column:
#   LEO      : low Earth orbit
#   VLEO     : very low Earth orbit
#   GTO      : geosynchronous transfer orbit
#   GEO      : geosynchronous orbit
#   SSO (SO) : Sun-synchronous orbit
#   ES-L1    : L1 Lagrange point
#   HEO      : highly elliptical orbit; the same abbreviation is also used for
#              a geocentric orbit above the altitude of GEO.
#              NOTE(review): confirm which meaning the dataset intends.
#   ISS      : International Space Station
#   MEO      : geocentric orbit, intermediate circular orbit
#   PO       : polar orbit

df['Orbit'].value_counts()

Orbit
GTO      27
ISS      21
VLEO     14
PO        9
LEO       7
SSO       5
MEO       3
ES-L1     1
HEO       1
SO        1
GEO       1
Name: count, dtype: int64

In [16]:
### Tally how often each landing outcome occurs
landing_outcomes = df['Outcome'].value_counts()

# Print every outcome label next to its position in the tally.
for position, label in enumerate(landing_outcomes.index):
    print(position, label)

0 True ASDS
1 None None
2 True RTLS
3 False ASDS
4 True Ocean
5 False Ocean
6 None ASDS
7 False RTLS


In [18]:
# Outcome labels that are treated as an unsuccessful landing.
# They are spelled out by name instead of being picked by position from the
# value_counts() index: that ordering depends on the counts (and on how ties
# are broken), so hard-coded positions can silently select the wrong labels
# whenever the dataset or the pandas version changes.
bad_outcomes = {
    'None None',
    'False ASDS',
    'False Ocean',
    'None ASDS',
    'False RTLS',
}

In [19]:
# Binary landing label for every launch:
# 0 when the outcome is listed in bad_outcomes (failure), 1 otherwise (success).
landing_class = [0 if result in bad_outcomes else 1 for result in df['Outcome']]

In [20]:
# Attach the binary landing label to the frame as the 'Class' column,
# then preview the first five rows (the default for head()).
df["Class"] = landing_class
df.head()

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,1,2010-06-04,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0003,-80.577366,28.561857,0
1,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0005,-80.577366,28.561857,0
2,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B0007,-80.577366,28.561857,0
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1.0,False,False,False,,1.0,0.0,B1003,-120.610829,34.632093,0
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1.0,False,False,False,,1.0,0.0,B1004,-80.577366,28.561857,0


In [21]:
# Overall landing success rate: 'Class' holds 0 (failure) or 1 (success),
# so its mean is the fraction of launches labelled as a success.
df['Class'].mean()

0.6666666666666666

In [22]:
# Export the updated dataset (now including the 'Class' column) to a CSV file
# in the current working directory; index=False keeps the row index out of it.
df.to_csv("dataset_part_2.csv", index=False)