In [1]:
%cd "E:\AIML\clg"

# Introduction

This dataset was scraped from https://nextspaceflight.com/launches/past/?page=1 and includes all space missions since the beginning of the Space Race (1957).

# 1.Import Libraries

In [2]:
import warnings
# Suppress all warnings so library warnings do not clutter the cell outputs.
# NOTE(review): this also hides deprecation warnings from the plotting and data libraries.
warnings.filterwarnings('ignore')

In [3]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install cufflinks

Note: you may need to restart the kernel to use updated packages.


In [5]:
# numpy was previously imported under the alias `pd`, which the pandas import
# on the following line immediately overwrote, leaving numpy unreachable.
# Use the conventional `np` alias so both libraries are available.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import cufflinks
# Put cufflinks in offline mode; the `.iplot()` calls on Series/DataFrames
# later in the notebook rely on it.
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme = 'pearl')
import plotly.graph_objs as go
import plotly
from plotly import tools
import plotly.express as px
from scipy.stats import boxcox
# Enable plotly rendering inside the notebook.
init_notebook_mode(connected =True)
# Show up to 100 columns when displaying wide DataFrames.
pd.set_option('display.max_columns',100)

# 2. Load the data

In [6]:
# Load the scraped launch records. The relative path is resolved against the
# current working directory, which the %cd magic at the top of the notebook sets.
space = pd.read_csv("Space.csv")

In [7]:
space.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Company Name,Location,Datum,Detail,Status Rocket,Rocket,Status Mission
0,0,0,SpaceX,"LC-39A, Kennedy Space Center, Florida, USA","Fri Aug 07, 2020 05:12 UTC",Falcon 9 Block 5 | Starlink V1 L9 & BlackSky,StatusActive,50.0,Success
1,1,1,CASC,"Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...","Thu Aug 06, 2020 04:01 UTC",Long March 2D | Gaofen-9 04 & Q-SAT,StatusActive,29.75,Success
2,2,2,SpaceX,"Pad A, Boca Chica, Texas, USA","Tue Aug 04, 2020 23:57 UTC",Starship Prototype | 150 Meter Hop,StatusActive,,Success
3,3,3,Roscosmos,"Site 200/39, Baikonur Cosmodrome, Kazakhstan","Thu Jul 30, 2020 21:25 UTC",Proton-M/Briz-M | Ekspress-80 & Ekspress-103,StatusActive,65.0,Success
4,4,4,ULA,"SLC-41, Cape Canaveral AFS, Florida, USA","Thu Jul 30, 2020 11:50 UTC",Atlas V 541 | Perseverance,StatusActive,145.0,Success
5,5,5,CASC,"LC-9, Taiyuan Satellite Launch Center, China","Sat Jul 25, 2020 03:13 UTC","Long March 4B | Ziyuan-3 03, Apocalypse-10 & N...",StatusActive,64.68,Success
6,6,6,Roscosmos,"Site 31/6, Baikonur Cosmodrome, Kazakhstan","Thu Jul 23, 2020 14:26 UTC",Soyuz 2.1a | Progress MS-15,StatusActive,48.5,Success
7,7,7,CASC,"LC-101, Wenchang Satellite Launch Center, China","Thu Jul 23, 2020 04:41 UTC",Long March 5 | Tianwen-1,StatusActive,,Success
8,8,8,SpaceX,"SLC-40, Cape Canaveral AFS, Florida, USA","Mon Jul 20, 2020 21:30 UTC",Falcon 9 Block 5 | ANASIS-II,StatusActive,50.0,Success
9,9,9,JAXA,"LA-Y1, Tanegashima Space Center, Japan","Sun Jul 19, 2020 21:58 UTC",H-IIA 202 | Hope Mars Mission,StatusActive,90.0,Success


# 3. Inspect the data

In [8]:
space.shape


(4324, 9)

In [9]:
space.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4324 entries, 0 to 4323
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      4324 non-null   int64 
 1   Unnamed: 0.1    4324 non-null   int64 
 2   Company Name    4324 non-null   object
 3   Location        4324 non-null   object
 4   Datum           4324 non-null   object
 5   Detail          4324 non-null   object
 6   Status Rocket   4324 non-null   object
 7    Rocket         964 non-null    object
 8   Status Mission  4324 non-null   object
dtypes: int64(2), object(7)
memory usage: 304.2+ KB


In [10]:
space.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1
count,4324.0,4324.0
mean,2161.5,2161.5
std,1248.375611,1248.375611
min,0.0,0.0
25%,1080.75,1080.75
50%,2161.5,2161.5
75%,3242.25,3242.25
max,4323.0,4323.0


3.1. Finding null values

In [11]:
space.isnull().sum()

Unnamed: 0           0
Unnamed: 0.1         0
Company Name         0
Location             0
Datum                0
Detail               0
Status Rocket        0
 Rocket           3360
Status Mission       0
dtype: int64

In [12]:
# Percentage of missing values in each column.
null_perc = space.isnull().sum().div(len(space)).mul(100)

In [13]:
null_perc.sort_values(ascending=False).head(9)

 Rocket           77.705828
Status Mission     0.000000
Status Rocket      0.000000
Detail             0.000000
Datum              0.000000
Location           0.000000
Company Name       0.000000
Unnamed: 0.1       0.000000
Unnamed: 0         0.000000
dtype: float64

It is evident that 'Rocket' is the only column with null values: about 77.7% of its entries are missing. We need to handle this before moving on to the analysis.

We will drop the column 'Rocket' because of the high percentage of null values present. This will help in the analysis process.

In [14]:
def remove(dataframe,percent = 0.50):
    """Return a copy of ``dataframe`` without its mostly-empty columns.

    A column is dropped when its fraction of null values is greater than or
    equal to ``percent``. The input frame itself is left unmodified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    percent : float, default 0.50
        Null-fraction threshold (a fraction of the row count, not 0-100).

    Returns
    -------
    pandas.DataFrame
        New frame containing only the columns below the threshold.

    Prints the number of dropped columns and the shape before and after.
    """
    shape_before = dataframe.shape
    # Share of null entries per column, as a fraction of the row count.
    null_fraction = dataframe.isnull().sum() / len(dataframe)
    sparse_columns = null_fraction.index[null_fraction.values >= percent].tolist()
    # `drop` without `inplace` returns a new frame, so the caller's data is untouched.
    trimmed = dataframe.drop(columns=sparse_columns)
    print("Number of Columns Dropped :", len(sparse_columns))
    print(
        "\nNo of rows and columns  before dropping", shape_before,
        "\nNo of rows and columns after dropping", trimmed.shape,
    )
    return trimmed

In [15]:
space = remove(space,percent=0.50)

Number of Columns Dropped : 1

No of rows and columns  before dropping (4324, 9) 
No of rows and columns after dropping (4324, 8)


In [16]:
space

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Company Name,Location,Datum,Detail,Status Rocket,Status Mission
0,0,0,SpaceX,"LC-39A, Kennedy Space Center, Florida, USA","Fri Aug 07, 2020 05:12 UTC",Falcon 9 Block 5 | Starlink V1 L9 & BlackSky,StatusActive,Success
1,1,1,CASC,"Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...","Thu Aug 06, 2020 04:01 UTC",Long March 2D | Gaofen-9 04 & Q-SAT,StatusActive,Success
2,2,2,SpaceX,"Pad A, Boca Chica, Texas, USA","Tue Aug 04, 2020 23:57 UTC",Starship Prototype | 150 Meter Hop,StatusActive,Success
3,3,3,Roscosmos,"Site 200/39, Baikonur Cosmodrome, Kazakhstan","Thu Jul 30, 2020 21:25 UTC",Proton-M/Briz-M | Ekspress-80 & Ekspress-103,StatusActive,Success
4,4,4,ULA,"SLC-41, Cape Canaveral AFS, Florida, USA","Thu Jul 30, 2020 11:50 UTC",Atlas V 541 | Perseverance,StatusActive,Success
...,...,...,...,...,...,...,...,...
4319,4319,4319,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA","Wed Feb 05, 1958 07:33 UTC",Vanguard | Vanguard TV3BU,StatusRetired,Failure
4320,4320,4320,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA","Sat Feb 01, 1958 03:48 UTC",Juno I | Explorer 1,StatusRetired,Success
4321,4321,4321,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA","Fri Dec 06, 1957 16:44 UTC",Vanguard | Vanguard TV3,StatusRetired,Failure
4322,4322,4322,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan","Sun Nov 03, 1957 02:30 UTC",Sputnik 8K71PS | Sputnik-2,StatusRetired,Success


3.2. Data Types

We will parse the column 'Datum' into a datetime and extract the launch year from it.

In [17]:
# Parse the raw 'Datum' strings into timestamps.
space['DateTime'] = pd.to_datetime(space['Datum'])

# Take the calendar year from each parsed timestamp.
space['Year'] = space['DateTime'].map(lambda stamp: stamp.year)
space.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Company Name,Location,Datum,Detail,Status Rocket,Status Mission,DateTime,Year
0,0,0,SpaceX,"LC-39A, Kennedy Space Center, Florida, USA","Fri Aug 07, 2020 05:12 UTC",Falcon 9 Block 5 | Starlink V1 L9 & BlackSky,StatusActive,Success,2020-08-07 05:12:00+00:00,2020
1,1,1,CASC,"Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...","Thu Aug 06, 2020 04:01 UTC",Long March 2D | Gaofen-9 04 & Q-SAT,StatusActive,Success,2020-08-06 04:01:00+00:00,2020
2,2,2,SpaceX,"Pad A, Boca Chica, Texas, USA","Tue Aug 04, 2020 23:57 UTC",Starship Prototype | 150 Meter Hop,StatusActive,Success,2020-08-04 23:57:00+00:00,2020
3,3,3,Roscosmos,"Site 200/39, Baikonur Cosmodrome, Kazakhstan","Thu Jul 30, 2020 21:25 UTC",Proton-M/Briz-M | Ekspress-80 & Ekspress-103,StatusActive,Success,2020-07-30 21:25:00+00:00,2020
4,4,4,ULA,"SLC-41, Cape Canaveral AFS, Florida, USA","Thu Jul 30, 2020 11:50 UTC",Atlas V 541 | Perseverance,StatusActive,Success,2020-07-30 11:50:00+00:00,2020
5,5,5,CASC,"LC-9, Taiyuan Satellite Launch Center, China","Sat Jul 25, 2020 03:13 UTC","Long March 4B | Ziyuan-3 03, Apocalypse-10 & N...",StatusActive,Success,2020-07-25 03:13:00+00:00,2020
6,6,6,Roscosmos,"Site 31/6, Baikonur Cosmodrome, Kazakhstan","Thu Jul 23, 2020 14:26 UTC",Soyuz 2.1a | Progress MS-15,StatusActive,Success,2020-07-23 14:26:00+00:00,2020
7,7,7,CASC,"LC-101, Wenchang Satellite Launch Center, China","Thu Jul 23, 2020 04:41 UTC",Long March 5 | Tianwen-1,StatusActive,Success,2020-07-23 04:41:00+00:00,2020
8,8,8,SpaceX,"SLC-40, Cape Canaveral AFS, Florida, USA","Mon Jul 20, 2020 21:30 UTC",Falcon 9 Block 5 | ANASIS-II,StatusActive,Success,2020-07-20 21:30:00+00:00,2020
9,9,9,JAXA,"LA-Y1, Tanegashima Space Center, Japan","Sun Jul 19, 2020 21:58 UTC",H-IIA 202 | Hope Mars Mission,StatusActive,Success,2020-07-19 21:58:00+00:00,2020


Now we will extract the country from the location.

In [18]:
# The country is the last ", "-separated component of the launch location.
space["Country"] = space["Location"].str.split(", ").str[-1]
space.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Company Name,Location,Datum,Detail,Status Rocket,Status Mission,DateTime,Year,Country
0,0,0,SpaceX,"LC-39A, Kennedy Space Center, Florida, USA","Fri Aug 07, 2020 05:12 UTC",Falcon 9 Block 5 | Starlink V1 L9 & BlackSky,StatusActive,Success,2020-08-07 05:12:00+00:00,2020,USA
1,1,1,CASC,"Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...","Thu Aug 06, 2020 04:01 UTC",Long March 2D | Gaofen-9 04 & Q-SAT,StatusActive,Success,2020-08-06 04:01:00+00:00,2020,China
2,2,2,SpaceX,"Pad A, Boca Chica, Texas, USA","Tue Aug 04, 2020 23:57 UTC",Starship Prototype | 150 Meter Hop,StatusActive,Success,2020-08-04 23:57:00+00:00,2020,USA
3,3,3,Roscosmos,"Site 200/39, Baikonur Cosmodrome, Kazakhstan","Thu Jul 30, 2020 21:25 UTC",Proton-M/Briz-M | Ekspress-80 & Ekspress-103,StatusActive,Success,2020-07-30 21:25:00+00:00,2020,Kazakhstan
4,4,4,ULA,"SLC-41, Cape Canaveral AFS, Florida, USA","Thu Jul 30, 2020 11:50 UTC",Atlas V 541 | Perseverance,StatusActive,Success,2020-07-30 11:50:00+00:00,2020,USA
5,5,5,CASC,"LC-9, Taiyuan Satellite Launch Center, China","Sat Jul 25, 2020 03:13 UTC","Long March 4B | Ziyuan-3 03, Apocalypse-10 & N...",StatusActive,Success,2020-07-25 03:13:00+00:00,2020,China
6,6,6,Roscosmos,"Site 31/6, Baikonur Cosmodrome, Kazakhstan","Thu Jul 23, 2020 14:26 UTC",Soyuz 2.1a | Progress MS-15,StatusActive,Success,2020-07-23 14:26:00+00:00,2020,Kazakhstan
7,7,7,CASC,"LC-101, Wenchang Satellite Launch Center, China","Thu Jul 23, 2020 04:41 UTC",Long March 5 | Tianwen-1,StatusActive,Success,2020-07-23 04:41:00+00:00,2020,China
8,8,8,SpaceX,"SLC-40, Cape Canaveral AFS, Florida, USA","Mon Jul 20, 2020 21:30 UTC",Falcon 9 Block 5 | ANASIS-II,StatusActive,Success,2020-07-20 21:30:00+00:00,2020,USA
9,9,9,JAXA,"LA-Y1, Tanegashima Space Center, Japan","Sun Jul 19, 2020 21:58 UTC",H-IIA 202 | Hope Mars Mission,StatusActive,Success,2020-07-19 21:58:00+00:00,2020,Japan


# 4. Analysis

4.1. Country wise launches

In [19]:
# Launch counts per country, largest first.
temp = space["Country"].value_counts()
temp.iplot(
    kind='bar',
    xTitle='Country',
    yTitle='Count',
    title='Number of launches according to country',
    color='orange',
)

We observe that Russia has more launches than the USA. Kazakhstan has the third-highest number of launches, ahead of France, China, Japan and India.

4.2.  Company wise launches

In [20]:
# Launch counts per company, largest first.
temp = space["Company Name"].value_counts()
# Title typos and the stray leading space in the y-axis label are corrected.
temp.iplot(
    kind='bar',
    xTitle='Company',
    yTitle='Count',
    title='Number of launches according to company',
    color='blue',
)

We can observe that RVSN USSR has the highest number of launches by a huge margin, with a count of 1777. NASA is 5th, SpaceX is 11th and ISRO is 15th.

4.3. Year wise launches

In [21]:
# Launch counts per year.
temp = space["Year"].value_counts()
temp.iplot(
    kind='bar',
    xTitle='Year',
    yTitle='Count',
    title='Number of launches acc to year',
    color='cyan',
)

The year with the highest number of launches is 1971, with 119 launches. 2018 was also a good year for space launches, with 117 launches.

4.4. Rocket Status

In [22]:
# Count rockets per status and reshape into the two-column
# (labels, values) frame that the pie chart reads from.
im = space["Status Rocket"].value_counts()
df = im.rename_axis('labels').reset_index(name='values')
df.iplot(
    kind='pie',
    labels='labels',
    values='values',
    title='Rocket Status Distribution',
    hole=0.4,
)

Only 18.3% of the rockets are active; the remaining 81.7% have been retired.

4.5. Mission Status

In [23]:
# Count missions per outcome and build the (labels, values) frame for the pie chart.
temp = space["Status Mission"].value_counts()
df = pd.DataFrame({'labels' : temp.index, 'values' : temp.values})
# The previous title ('Status of previous applications') did not describe this
# chart; it now names the quantity actually plotted.
df.iplot(kind = 'pie',labels = 'labels',values = 'values', title = 'Mission Status Distribution', hole = 0.4)

The majority of the missions were successful, comprising 89.7% of the data.

4.6. Top 30 Launch Sites

In [24]:
# The 30 most frequently used launch sites, largest first.
top_loc = space['Location'].value_counts().head(30)
top_loc.iplot(
    kind='bar',
    xTitle='Launch Site',
    yTitle='Count',
    title='Top 30 Launch Sites',
    color='magenta',
)

'Site 31/6, Baikonur Cosmodrome, Kazakhstan' is the most used launch site with 235 launches.

# 5. Last decade metrics (after 2010)

5.1. Most active countries last decade

In [None]:
# Keep only launches whose year is strictly after 2010.
space_decade = space.query('Year > 2010')
# The title previously read 'before 2010', contradicting the filter above.
space_decade["Country"].value_counts().iplot(
    kind='bar',
    xTitle='Country',
    yTitle='Count',
    title='Launches by country after 2010',
    color='fuchsia',
)

USA was the most active country in the last decade whereas Russia dropped 5 places.

5.2. Most active companies last decade

In [None]:
# Launch counts per company for the post-2010 subset held in `space_decade`.
# The title previously read 'Lunches ... before 2010': a typo, and the wrong
# side of the year filter used to build `space_decade`.
space_decade["Company Name"].value_counts().iplot(
    kind='bar',
    xTitle='Company',
    yTitle='Count',
    title='Launches by Companies after 2010',
    color='#9bf708',
)

CASC, ULA and SpaceX were the top 3 most active companies, whereas NASA launched only 3 rockets in the last decade. ISRO was also very active in the last decade, with 44 launches placing it in 6th position.

# 6. Conclusions

* Russia was the world leader in space exploration, but in the last decade the USA led the space race.
* In the last decade the China Aerospace Science and Technology Corporation (CASC) launched the largest number of rockets, making it the leading company of the last decade, but RVSN USSR still holds the title for the largest number of launches overall.
* The space race boomed in the 1970s, and it is booming again in this decade.
* Kazakhstan has the world's most used launch sites.
* Of all the missions, 18.3% missions are still ac