In [1]:
# Load Data Libraries for Data Wrangling

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the CSV produced by notebook 1 (1_data_collection.csv) into a DataFrame
collection_csv = '1_data_collection.csv'
df = pd.read_csv(collection_csv)

# Display the first 10 rows to confirm the dataset loaded correctly
df.head(10)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,6,2010-06-04T18:45:00.000Z,Falcon 9,,LEO,CCSFS SLC 40,,1,False,False,False,,1,0,B0003,-80.577366,28.561857
1,7,2010-12-08T15:43:00.000Z,Falcon 9,,LEO,CCSFS SLC 40,,1,False,False,False,,1,0,B0004,-80.577366,28.561857
2,8,2012-05-22T07:44:00.000Z,Falcon 9,525.0,LEO,CCSFS SLC 40,,1,False,False,False,,1,0,B0005,-80.577366,28.561857
3,9,2012-10-08T00:35:00.000Z,Falcon 9,400.0,ISS,CCSFS SLC 40,,1,False,False,False,,1,0,B0006,-80.577366,28.561857
4,10,2013-03-01T19:10:00.000Z,Falcon 9,677.0,ISS,CCSFS SLC 40,,1,False,False,False,,1,0,B0007,-80.577366,28.561857
5,11,2013-09-29T16:00:00.000Z,Falcon 9,500.0,PO,VAFB SLC 4E,False,1,False,False,False,,1,0,B1003,-120.610829,34.632093
6,12,2013-12-03T22:41:00.000Z,Falcon 9,3170.0,GTO,CCSFS SLC 40,,1,False,False,False,,1,0,B1004,-80.577366,28.561857
7,13,2014-01-06T18:06:00.000Z,Falcon 9,3325.0,GTO,CCSFS SLC 40,,1,False,False,False,,1,0,B1005,-80.577366,28.561857
8,14,2014-04-18T19:25:00.000Z,Falcon 9,2296.0,ISS,CCSFS SLC 40,True,1,False,False,True,,1,0,B1006,-80.577366,28.561857
9,15,2014-07-14T15:15:00.000Z,Falcon 9,1316.0,LEO,CCSFS SLC 40,True,1,False,False,True,,1,0,B1007,-80.577366,28.561857


In [4]:
# Percentage of missing values per column.
# (A high share of missing LandingPad values is normal according to the data sources.)
missing_pct = df.isna().sum() / len(df) * 100
print(missing_pct)

FlightNumber       0.000000
Date               0.000000
BoosterVersion     0.000000
PayloadMass       12.921348
Orbit              0.561798
LaunchSite         0.000000
Outcome           14.606742
Flights            0.000000
GridFins           0.000000
Reused             0.000000
Legs               0.000000
LandingPad        17.415730
Block              0.000000
ReusedCount        0.000000
Serial             0.000000
Longitude          0.000000
Latitude           0.000000
dtype: float64


In [5]:
# Inspect the raw 'Outcome' values before encoding them as Success (1) / Failure (0).
# value_counts() leaves NaN out by default, so rows with a missing outcome are not tallied here.
outcome_counts = df['Outcome'].value_counts()
print(outcome_counts)

Outcome
True     141
False     11
Name: count, dtype: int64


In [6]:
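# ------- CODE DIDN'T WORK CORRECTLY - KEPT AS EVIDENCE ---------
# Why it failed: 'Outcome' holds True / False / NaN, not strings such as 'False ASDS',
# so no value ever matched bad_outcomes and every row was labelled Class 1.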

# Create Class Column using the converting criteria and filter out 'bad' outcomes (1 = Landed Successfully, 0 = Failed)
# bad_outcomes = {
#    'False ASDS', 
#    'False Ocean', 
#    'False RTLS', 
#    'None ASDS', 
#    'None None'
# }

# List for new filtered column
# landing_class = []

# for outcome in df['Outcome']:
#    if outcome in bad_outcomes:
#        landing_class.append(0) # Mark as Fail
#    else:
#        landing_class.append(1) # Mark as Success

# Add to the DataFrame
# df['Class'] = landing_class

# ---- SAMPLE OF FILTERED DATA FROM ABOVE CODE ----

#      Outcome  Class
# 0      NaN      1
# 1      NaN      1
# 2      NaN      1
# 3      NaN      1
# 4      NaN      1
# 5    False      1 <--- 
# 6      NaN      1
# 7      NaN      1
# 8     True      1 <---


In [12]:
# Fix - encode the landing outcome as a binary target column.
# The Outcome column contains True, False, or NaN; only an explicit True
# counts as a success:
#   True         -> 1 (Success)
#   False / NaN  -> 0 (Fail)
# Series.eq compares element-wise and NaN never equals True, so missing
# outcomes land in the 0 class without a row-by-row Python loop.
# .tolist() keeps landing_class a plain list of ints, as before.
landing_class = df['Outcome'].eq(True).astype(int).tolist()

# Add to the dataframe
df['Class'] = landing_class

In [13]:
# Show the first 10 rows of 'Outcome' next to the derived 'Class' to verify the encoding
outcome_vs_class = df[['Outcome', 'Class']]
print(outcome_vs_class.head(10))

  Outcome  Class
0     NaN      0
1     NaN      0
2     NaN      0
3     NaN      0
4     NaN      0
5   False      0
6     NaN      0
7     NaN      0
8    True      1
9    True      1


In [14]:
# Success rate = mean of the 0/1 'Class' column, printed as a percentage.
# Rows whose 'Outcome' is NaN were encoded as 0 when 'Class' was built,
# so they count as failures in this figure.
class_labels = df['Class']
success_rate = class_labels.mean()
print(f"Success Rate: {success_rate:.2%}")

Success Rate: 79.21%


In [None]:
# Write the wrangled DataFrame (including the new 'Class' column) to 2_data_wrangling.csv.
# index=False leaves the row index out of the file.
wrangled_csv = '2_data_wrangling.csv'
df.to_csv(wrangled_csv, index=False)
print("Data wrangled and saved to 2_data_wrangling.csv")