In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Read the launch records from a path relative to the notebook's working
# directory, then preview the first rows.
path = 'spaceX_eda'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,1,2010-06-04,Falcon 9,6104.959412,LEO,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857,0
1,2,2012-05-22,Falcon 9,525.0,LEO,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857,0
2,3,2013-03-01,Falcon 9,677.0,ISS,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857,0
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093,0
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857,0


In [2]:
# Inspect the dtype pandas inferred for every column of the loaded frame.
df.dtypes

FlightNumber        int64
Date               object
BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Outcome            object
Flights             int64
GridFins             bool
Reused               bool
Legs                 bool
LandingPad         object
Block             float64
ReusedCount         int64
Serial             object
Longitude         float64
Latitude          float64
Class               int64
dtype: object

### Number of Flights vs Launch Site:


* Based on the scatter plot, we can visualize the relationship between successful launches and **Launch Site**. Towards the beginning, SpaceX attempted the majority of launches at SLC-40; SLC-40 failed the first 5 launches and had a few successes up until Flight Number 20. Flights 25-40 were attempted at LC-39A, the majority being successes.


* **61% (more than half) of flights were attempted at Cape Canaveral (CCAFS SLC 40), with a success rate of 60%**


* **24% (about a quarter) of flights were attempted at Kennedy Space Center (KSC LC 39A), with a success rate of 77%**


* **14% of flights were attempted at Vandenberg (VAFB SLC 4E), with a success rate of 77%**

In [3]:
# Per-launch-site summary: number of successes, number of flights, share of
# all flights, and success rate.
#
# Class == 1 marks a success, so summing Class within a site gives the success
# count and counting it gives the number of flights. Aggregating in a single
# groupby keeps sites that have zero successes (they appear with
# SuccessCount == 0 instead of dropping out of the table).
successRate_perSite = df.groupby('LaunchSite')['Class'].agg(
    SuccessCount='sum',
    FlightCount='count',
)

# Use the actual number of rows instead of a hard-coded total so the share
# stays correct if the dataset grows or is filtered.
successRate_perSite['Percent_of_Total_Flights'] = (
    successRate_perSite['FlightCount'] / len(df)
)

# Reference columns by name rather than by position so that reordering or
# inserting columns cannot silently change the ratio.
successRate_perSite['SuccessRate'] = (
    successRate_perSite['SuccessCount'] / successRate_perSite['FlightCount']
)

successRate_perSite

Unnamed: 0_level_0,SuccessCount,FlightCount,Percent_of_Total_Flights,SuccessRate
LaunchSite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CCAFS SLC 40,33,55,0.611111,0.6
KSC LC 39A,17,22,0.244444,0.772727
VAFB SLC 4E,10,13,0.144444,0.769231


### TASK 3: Visualize the relationship between success rate and orbit type
Next, we want to visually check whether there is any relationship between success rate and orbit type.

In [17]:
# Launch counts per orbit, coloured by Class and split into one facet per
# value of the GridFins flag.
fig = px.histogram(
    data_frame=df,
    x='Orbit',
    color='Class',
    facet_col='GridFins',
    facet_col_spacing=.1,
)
fig.update_layout(font_size=14)

### TASK 2: Visualize the relationship between Payload and Launch Site

In [53]:
# PayloadMass aggregated per launch site (px.histogram sums y per x bin when y
# is supplied and no histfunc is given), coloured by Class and faceted by the
# GridFins flag.
fig = px.histogram(
    data_frame=df,
    x='LaunchSite',
    y='PayloadMass',
    color='Class',
    facet_col='GridFins',
    facet_col_spacing=.1,
    template='simple_white',
)
fig.update_layout(font_size=14)

### TASK 4: Visualize the relationship between FlightNumber and Orbit type

In [None]:
# Bar chart over orbit type, coloured by Class. Only x is supplied, so the bar
# lengths are left to plotly's default for a missing y dimension.
px.bar(
    data_frame=df,
    x='Orbit',
    color='Class',
    title='Flight Count per Orbit Type vs Success & Failure',
)

### TASK  5: Visualize the relationship between Payload and Orbit type

In [12]:
# PayloadMass aggregated per orbit type (summed per bin by plotly's default
# histfunc when y is supplied), coloured by Class.
px.histogram(
    data_frame=df,
    x='Orbit',
    y='PayloadMass',
    color='Class',
)

### TASK  6: Visualize the launch success yearly trend

In [13]:
# Re-display the first rows — presumably to check the 'Date' string format
# before deriving a year column from it.
df.head()

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,1,2010-06-04,Falcon 9,6104.959412,LEO,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857,0
1,2,2012-05-22,Falcon 9,525.0,LEO,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857,0
2,3,2013-03-01,Falcon 9,677.0,ISS,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857,0
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093,0
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCAFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857,0


In [19]:
# Derive the launch year from the 'YYYY-MM-DD' date strings with pandas'
# vectorized string accessor instead of a Python-level append loop.
# `year` is kept as a plain list of strings, and the column holds the same
# string values as before.
year = df['Date'].str.split('-').str[0].tolist()
df['Year'] = year

In [15]:
# Visualize the launch success yearly trend.
# Class is averaged per year (histfunc='avg'), so each bar is the share of
# that year's launches with Class == 1 — an actual success *rate*, matching
# the title. With plotly's default histfunc the bars would be the per-year
# sum of Class, i.e. a success count.
px.histogram(
    df,
    x='Year',
    y='Class',
    histfunc='avg',
    text_auto='.0%',  # print each bar's value as a whole-number percentage
    title='Time Series: Success Rate Trend (2010-2020)',
    labels={'Class': 'Success Rate'},
).update_layout(yaxis_title='Success Rate', yaxis_tickformat='.0%')

In [44]:
# Select the subset of columns that will be encoded in the following cells,
# then preview it.
features = df.loc[:, [
    'FlightNumber',
    'PayloadMass',
    'Orbit',
    'LaunchSite',
    'Flights',
    'GridFins',
    'Reused',
    'Legs',
    'LandingPad',
    'Block',
    'ReusedCount',
    'Serial',
]]
features.head()

Unnamed: 0,FlightNumber,PayloadMass,Orbit,LaunchSite,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial
0,1,6104.959412,LEO,CCAFS SLC 40,1,False,False,False,,1.0,0,B0003
1,2,525.0,LEO,CCAFS SLC 40,1,False,False,False,,1.0,0,B0005
2,3,677.0,ISS,CCAFS SLC 40,1,False,False,False,,1.0,0,B0007
3,4,500.0,PO,VAFB SLC 4E,1,False,False,False,,1.0,0,B1003
4,5,3170.0,GTO,CCAFS SLC 40,1,False,False,False,,1.0,0,B1004


In [45]:
# One-hot encode the four text columns; numeric and boolean columns pass
# through unchanged. drop_first=True omits the first level of each encoded
# column.
# NOTE(review): LandingPad contains missing values and dummy_na is left at its
# default, so a row with no LandingPad and a row in the dropped first
# LandingPad level both encode as all zeros — confirm this is acceptable.
features_one_hot = pd.get_dummies(
    features,
    columns=['Orbit', 'LaunchSite', 'LandingPad', 'Serial'],
    drop_first=True,
)

In [46]:
# Summarize the encoded frame: column count, non-null counts and dtypes.
features_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 76 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   FlightNumber                         90 non-null     int64  
 1   PayloadMass                          90 non-null     float64
 2   Flights                              90 non-null     int64  
 3   GridFins                             90 non-null     bool   
 4   Reused                               90 non-null     bool   
 5   Legs                                 90 non-null     bool   
 6   Block                                90 non-null     float64
 7   ReusedCount                          90 non-null     int64  
 8   Orbit_GEO                            90 non-null     uint8  
 9   Orbit_GTO                            90 non-null     uint8  
 10  Orbit_HEO                            90 non-null     uint8  
 11  Orbit_ISS                         

In [19]:
# Cast every column (ints, bools and dummy indicators) to 64-bit floats so the
# frame has one uniform numeric dtype, then list the resulting dtypes.
features_one_hot = features_one_hot.astype('float64')
features_one_hot.dtypes

FlightNumber    float64
PayloadMass     float64
Flights         float64
GridFins        float64
Reused          float64
                 ...   
Serial_B1056    float64
Serial_B1058    float64
Serial_B1059    float64
Serial_B1060    float64
Serial_B1062    float64
Length: 76, dtype: object

In [25]:
# Display the columns from position 14 onward: the remaining Orbit dummies
# followed by the LaunchSite, LandingPad and Serial dummies.
features_one_hot.iloc[:, 14:]

Unnamed: 0,Orbit_PO,Orbit_SO,Orbit_SSO,Orbit_VLEO,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3032383ecb761634e7cb,LandingPad_5e9e3033383ecbb9e534e7cc,...,Serial_B1048,Serial_B1049,Serial_B1050,Serial_B1051,Serial_B1054,Serial_B1056,Serial_B1058,Serial_B1059,Serial_B1060,Serial_B1062
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
86,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
87,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# Persist the encoded feature matrix to 'features.csv' in the working
# directory; index=False leaves the row index out of the file.
features_one_hot.to_csv('features.csv', index = False)

In [3]:
# Fetch the dashboard dataset over HTTPS and store a local copy as
# 'dash_data.csv' (without the row index).
dashData = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/spacex_launch_dash.csv"
data = pd.read_csv(filepath_or_buffer=dashData)
data.to_csv(path_or_buf='dash_data.csv', index=False)