# Data Wrangling

In [1]:
import pandas as pd
import numpy as np
# Emit a short confirmation so the cell output shows the imports completed.
print("Imported Libraries")

Imported Libraries


## Read Data

In [2]:
# Source CSV for this notebook; kept in a named variable so the location is easy to spot.
dataset_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv"
df = pd.read_csv(dataset_url)
# Report the (rows, columns) tuple of the freshly loaded frame.
print(f"Read Data ({df.shape})")

Read Data ((90, 17))


In [3]:
# Fraction of missing values per column, rounded to three decimals.
null_share = (df.isnull().sum() / len(df)).round(3)
# Only report the columns that actually contain missing values.
for col_name, share in null_share[null_share > 0].items():
    print(f'{col_name} is {share*100}% null.')

LandingPad is 28.9% null.


In [4]:
# Drop the LandingPad column (the null check above reports it as 28.9% null).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="LandingPad", errors="ignore")

## TASK 1: Calculate the number of launches on each site

In [5]:
# Count launches per site. Series.value_counts() does this directly; grouping a
# one-column frame by its own column is unnecessary. sort_index() keeps the rows
# ordered by site name, matching the ordering the groupby form produced.
df["LaunchSite"].value_counts().sort_index()

LaunchSite
CCAFS SLC 40    55
KSC LC 39A      22
VAFB SLC 4E     13
Name: count, dtype: int64

## TASK 2: Calculate the number and occurrence of each orbit

In [6]:
# Count launches per orbit. Series.value_counts() does this directly; grouping a
# one-column frame by its own column is unnecessary. sort_index() keeps the rows
# ordered by orbit name, matching the ordering the groupby form produced.
print(df["Orbit"].value_counts().sort_index())

Orbit
ES-L1     1
GEO       1
GTO      27
HEO       1
ISS      21
LEO       7
MEO       3
PO        9
SO        1
SSO       5
VLEO     14
Name: count, dtype: int64


In [7]:
# Count each (Orbit, Outcome) pair, grouped by orbit.
orbit_outcome_counts = df[["Orbit", "Outcome"]].groupby(["Orbit"]).value_counts()
# Flatten the grouped index back into columns and list the distinct outcome
# strings in the order they appear in the grouped result.
orbit_outcome_table = orbit_outcome_counts.to_frame().reset_index()
print(orbit_outcome_table["Outcome"].unique())

['True Ocean' 'True ASDS' 'None None' 'False ASDS' 'None ASDS' 'True RTLS'
 'False Ocean' 'False RTLS']


In [8]:
def _outcome_to_class(outcome):
    """Return 1 when the outcome string contains 'True', otherwise 0."""
    return 1 if 'True' in outcome else 0


# Attach the 0/1 label as a new "Class" column.
df = df.assign(Class=df["Outcome"].map(_outcome_to_class))

## TASK 3: Calculate the number and occurrence of mission outcomes of the orbits

In [9]:
# Split each Outcome string at its first space into two helper columns:
# the text before the space ("Success") and the text after it ("Target").
# str.partition leaves a value with no space intact (Success = whole string,
# Target = ""), whereas slicing with find() == -1 silently drops the last
# character of such a value.
outcome_parts = df["Outcome"].str.partition(" ")

# Remove any helper columns left over from a previous run so the inserts
# below never collide with existing columns.
df = df.drop(columns=["Success", "Target"], errors="ignore")

# Place the helpers immediately after Outcome instead of at hard-coded positions.
outcome_pos = df.columns.get_loc("Outcome")
df.insert(outcome_pos + 1, "Success", outcome_parts[0])
df.insert(outcome_pos + 2, "Target", outcome_parts[2])

In [10]:
# Count rows for every (Success, Target) group, broken down by Class value.
outcome_breakdown = (
    df.loc[:, ["Success", "Target", "Class"]]
    .groupby(["Success", "Target"])
    .value_counts()
)
print(outcome_breakdown)

Success  Target  Class
False    ASDS    0         6
         Ocean   0         2
         RTLS    0         1
None     ASDS    0         2
         None    0        19
True     ASDS    1        41
         Ocean   1         5
         RTLS    1        14
Name: count, dtype: int64


In [11]:
# Remove the temporary Success/Target helper columns now that the breakdown
# has been printed. Rebinding instead of inplace=True, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
df = df.drop(columns=["Success", "Target"], errors="ignore")

## TASK 4: Create a landing outcome label from Outcome column

In [12]:
# Rows whose Outcome text contains "False" or "None"; show the 22nd-24th of them.
has_false = df["Outcome"].str.contains("False")
has_none = df["Outcome"].str.contains("None")
df[has_false | has_none].iloc[21:24]

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
47,48,2018-04-02,Falcon 9,2760.0,ISS,CCAFS SLC 40,None None,2,True,True,True,4.0,1,B1039,-80.577366,28.561857,0
50,51,2018-06-04,Falcon 9,5383.85,GTO,CCAFS SLC 40,None None,2,False,True,False,4.0,1,B1040,-80.577366,28.561857,0
51,52,2018-06-29,Falcon 9,2410.0,ISS,CCAFS SLC 40,None None,2,False,True,False,4.0,1,B1045,-80.577366,28.561857,0


In [13]:
# Rows whose Outcome text contains "True"; show the 27th-29th of them.
has_true = df["Outcome"].str.contains("True")
df[has_true].iloc[26:29]

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
48,49,2018-04-18,Falcon 9,350.0,HEO,CCAFS SLC 40,True ASDS,1,True,False,True,4.0,1,B1045,-80.577366,28.561857,1
49,50,2018-05-11,Falcon 9,3750.0,GTO,KSC LC 39A,True ASDS,1,True,False,True,5.0,3,B1046,-80.603956,28.608058,1
52,53,2018-07-22,Falcon 9,7076.0,GTO,CCAFS SLC 40,True ASDS,1,True,False,True,5.0,2,B1047,-80.577366,28.561857,1


In [14]:
# Recompute Class as a boolean flag: True when Outcome contains "True".
outcome_has_true = df["Outcome"].str.contains("True")
df = df.assign(Class=outcome_has_true)
# Display rows 47-52 (positional) to spot-check the new column.
df.iloc[47:53]

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
47,48,2018-04-02,Falcon 9,2760.0,ISS,CCAFS SLC 40,None None,2,True,True,True,4.0,1,B1039,-80.577366,28.561857,False
48,49,2018-04-18,Falcon 9,350.0,HEO,CCAFS SLC 40,True ASDS,1,True,False,True,4.0,1,B1045,-80.577366,28.561857,True
49,50,2018-05-11,Falcon 9,3750.0,GTO,KSC LC 39A,True ASDS,1,True,False,True,5.0,3,B1046,-80.603956,28.608058,True
50,51,2018-06-04,Falcon 9,5383.85,GTO,CCAFS SLC 40,None None,2,False,True,False,4.0,1,B1040,-80.577366,28.561857,False
51,52,2018-06-29,Falcon 9,2410.0,ISS,CCAFS SLC 40,None None,2,False,True,False,4.0,1,B1045,-80.577366,28.561857,False
52,53,2018-07-22,Falcon 9,7076.0,GTO,CCAFS SLC 40,True ASDS,1,True,False,True,5.0,2,B1047,-80.577366,28.561857,True


In [15]:
# Convert the boolean Class flag into a 0/1 label via an explicit lookup table.
bool_to_label = {True: 1, False: 0}
df['Class'] = df['Class'].map(bool_to_label)
# Display the same positional rows 47-52 again to confirm the conversion.
df.iloc[47:53]

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
47,48,2018-04-02,Falcon 9,2760.0,ISS,CCAFS SLC 40,None None,2,True,True,True,4.0,1,B1039,-80.577366,28.561857,0
48,49,2018-04-18,Falcon 9,350.0,HEO,CCAFS SLC 40,True ASDS,1,True,False,True,4.0,1,B1045,-80.577366,28.561857,1
49,50,2018-05-11,Falcon 9,3750.0,GTO,KSC LC 39A,True ASDS,1,True,False,True,5.0,3,B1046,-80.603956,28.608058,1
50,51,2018-06-04,Falcon 9,5383.85,GTO,CCAFS SLC 40,None None,2,False,True,False,4.0,1,B1040,-80.577366,28.561857,0
51,52,2018-06-29,Falcon 9,2410.0,ISS,CCAFS SLC 40,None None,2,False,True,False,4.0,1,B1045,-80.577366,28.561857,0
52,53,2018-07-22,Falcon 9,7076.0,GTO,CCAFS SLC 40,True ASDS,1,True,False,True,5.0,2,B1047,-80.577366,28.561857,1


In [16]:
# Keep a handle on the Class label column under its own name.
landing_class = df.loc[:, "Class"]

## Save Data to CSV File

In [17]:
# Preview the first five rows of the final frame before it is written out.
df.head(5)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,Block,ReusedCount,Serial,Longitude,Latitude,Class
0,1,2010-06-04,Falcon 9,6104.959412,LEO,CCAFS SLC 40,None None,1,False,False,False,1.0,0,B0003,-80.577366,28.561857,0
1,2,2012-05-22,Falcon 9,525.0,LEO,CCAFS SLC 40,None None,1,False,False,False,1.0,0,B0005,-80.577366,28.561857,0
2,3,2013-03-01,Falcon 9,677.0,ISS,CCAFS SLC 40,None None,1,False,False,False,1.0,0,B0007,-80.577366,28.561857,0
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,1.0,0,B1003,-120.610829,34.632093,0
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCAFS SLC 40,None None,1,False,False,False,1.0,0,B1004,-80.577366,28.561857,0


In [18]:
# Write the frame to disk without the row index.
output_csv = "dataset_part_2.csv"
df.to_csv(output_csv, index=False)