In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the launch records and ask pandas to parse the 'Datum' column as dates.
s = pd.read_csv(
    '/kaggle/input/all-space-missions-from-1957/Space_Corrected.csv',
    parse_dates=['Datum'],
)
s.head()

* The first two columns of the dataset, 'Unnamed: 0' and 'Unnamed: 0.1', are irrelevant for EDA, so they should be dropped.
* Many columns have misspelled or poorly worded names, so they should be corrected.

In [None]:
# Remove the two leftover 'Unnamed' columns by rebinding `s` to the reduced frame.
s = s.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [None]:
# Map the raw column headers to the names used by the rest of the notebook.
# Note that the raw cost column header is ' Rocket' with a leading space.
column_names = {
    'Datum': 'Date',
    'Status Rocket': 'Rocket_status',
    ' Rocket': 'Mission_Cost',
    'Status Mission': 'Mission_Status',
}
s = s.rename(columns=column_names)

In [None]:
# Preview the first rows after dropping and renaming columns.
s.head()

In [None]:
# Summarise the frame: column dtypes and non-null counts.
s.info()

* Dataset contains 7 columns and 4324 rows.
* All columns are of object type. 
* The 'Date' and 'Mission_Cost' columns have incorrect datatypes, so they should be corrected.
* Index is of Int64 type.

In [None]:
# Visualise where values are missing in each column of the frame.
import missingno as msno
msno.matrix(s)

The visualization shows that only the Mission_Cost column has missing values in the dataset.

In [None]:
# Count the missing values in every column.
s.isna().sum()

3360 values are missing in the Mission_Cost column in the dataset.

## VARIABLE EXPLORATION

In [None]:
# Percentage share of launches for the five most frequent companies.
s['Company Name'].value_counts(normalize=True).head().mul(100)

Among the top 5 companies, the majority of missions were conducted by RVSN USSR (41.09%).

In [None]:
# Percentage share of launches for the five most frequent launch locations.
s['Location'].value_counts(normalize=True).head().mul(100)

It seems that majority of space programs are conducted in Kazakhstan and Russia.

In [None]:
#converting date column into datetime format
# utc=True asks pandas for timezone-aware UTC timestamps.
# NOTE(review): this column was already requested as a parsed date at load time
# (parse_dates=['Datum']); confirm that the explicit format "%Y-%m-%d" matches the
# values actually present, since it matters if any of them are still strings.
s['Date'] = pd.to_datetime(s['Date'],format ="%Y-%m-%d",utc=True)

In [None]:
# Inspect the first values of the converted Date column.
s['Date'].head()

In [None]:
# Percentage share of launches for the five most frequent 'Detail' values.
s['Detail'].value_counts(normalize=True).head().mul(100)

The 'Cosmos-3MRB' (65MRB) and 'BOR-5 Shuttle' rockets are the most frequently launched across the space programs (1.3%).

In [None]:
# Inspect the distinct labels of Rocket_status and how often each occurs.
s['Rocket_status'].value_counts()

As we can see, the values of the Rocket_status column carry a redundant 'Status' prefix, so they should be cleaned up.

In [None]:
# Normalise the raw Rocket_status labels to plain 'Retired' / 'Active'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection s['Rocket_status']: the in-place
# form mutates an intermediate object and is not guaranteed to update `s`
# (it does not under pandas Copy-on-Write).
s['Rocket_status'] = s['Rocket_status'].replace(
    {'StatusRetired': 'Retired', 'StatusActive': 'Active'}
)

In [None]:
# Percentage share of each rocket status.
s['Rocket_status'].value_counts(normalize=True).mul(100)

Approximately 82% of the rockets launched in space missions from 1957 to date are retired. The rest (approx. 18%) are still being launched in space missions.

In [None]:
# Preview the frame after the Rocket_status labels were changed.
s.head()

In [None]:
# Fill the missing entries of Mission_Cost with 0.0; other columns are left untouched.
s = s.fillna({'Mission_Cost': 0.0})

As most of the values in Mission_Cost are missing (3360 of them), we fill the missing entries with 0.

In [None]:
# Re-check the missing-value counts after filling Mission_Cost.
s.isna().sum()

In [None]:
# Percentage share of each mission outcome.
s['Mission_Status'].value_counts(normalize=True).mul(100)

Most of the space missions conducted are successful (Approx 90%).

In [None]:
# Derive Year, Month and Day columns from the datetime accessor of Date.
date_parts = s['Date'].dt
s = s.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
)

In [None]:
# Convert Mission_Cost to float64.
# The column may hold text with thousands separators (commas) and/or numeric
# values, so every value is first rendered as text, commas and surrounding
# whitespace are removed, and the text is parsed with pd.to_numeric.
# Unlike a bare `.str.replace(...)` on the column, this also works when the
# column is already numeric, so the cell can be re-run safely.
# Missing values (rendered as 'nan') parse to NaN and are then set to 0.0;
# genuinely malformed text still raises instead of being silently dropped.
cost_text = (
    s['Mission_Cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)
s['Mission_Cost'] = pd.to_numeric(cost_text).fillna(0.0).astype(np.float64)

In [None]:
# Preview the frame with the Year/Month/Day columns and the converted Mission_Cost.
s.head()

## DATA VISUALIZATION

In [None]:
# Plotting libraries for the visualisation section.
import matplotlib.pyplot as plt
import plotly.express as px
import plotly
# Initialise plotly's offline notebook mode.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather than
# embedding it in the notebook — confirm against the plotly documentation.
plotly.offline.init_notebook_mode(connected = True)
import seaborn as sns
import plotly.graph_objs as go

## **Which Company has the most number of launches?**

In [None]:
# Launch totals per company, drawn as a horizontal bar chart.
ds = (
    s['Company Name']
    .value_counts()
    .rename_axis('Company')
    .reset_index(name='Number of launches')
)
px.bar(
    ds,
    x='Number of launches',
    y='Company',
    orientation='h',
    title='Number of Space Missions Launched By Every Company',
    height=1000,
    width=1000,
    color='Company',
)

RVSN USSR leads the way with the most launches (1777), followed by Arianespace (279).

## **How many rockets are currently working?**

In [None]:
# Pie chart of the share of rows per rocket status.
px.pie(s, names='Rocket_status')

Around (18.3%) rockets are currently working,rest are retired.

## **What is the fraction of successful missions?**

In [None]:
# Pie chart of the share of rows per mission outcome.
px.pie(s, names='Mission_Status')

Around 89.7% of missions are successful.

## **Which period of year has the highest number of launches?**

In [None]:
# Number of launches in each calendar year, as a bar chart.
y = (
    s['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Number of launches')
)
px.bar(y, x='Year', y='Number of launches', color='Year')

Most launches took place during 1965-1977.

## **What company dominates which period?**

In [None]:
# Launch counts per (Year, Company), ordered by year and then count, both
# descending, and shown as a bubble chart sized by the count.
ds = (
    s.groupby(['Year', 'Company Name'])['Mission_Status']
    .count()
    .reset_index(name='Number of Launches')
    .rename(columns={'Company Name': 'Company'})
    .sort_values(['Year', 'Number of Launches'], ascending=False)
)
px.scatter(
    ds,
    x='Year',
    y='Number of Launches',
    color='Company',
    size='Number of Launches',
    title='Distribution of launches over the year by companies',
    height=1000,
)

* 1970-1980 USSR dominated the world in terms of launches.
* From 2016-2020 CASC is dominating the world in terms of highest number of launches.

## **Which is the most successful Company?**

In [None]:
# Count successful missions per company.
# NOTE(review): only missions with Mission_Cost > 0 are counted, so successes
# whose cost is unknown (filled with 0 earlier) are excluded — confirm that this
# restriction is intended for a "most successful company" ranking.
su = s[(s['Mission_Status'] == 'Success') & (s['Mission_Cost'] > 0.0)]
t = (
    su.groupby('Company Name')['Mission_Cost']
    .count()
    .reset_index(name='Successful_missions_count')
)
px.bar(
    t,
    y='Company Name',
    x='Successful_missions_count',
    title='Distribution of Success Rate Over Companies',
    color='Successful_missions_count',
    height=1000,
)

NASA and CASC are the most successful companies having 146 and 152 successful missions respectively.

## **How much did companies invest in space programs per year?**

In [None]:
# Total recorded cost per (company, year); combinations with no recorded cost are dropped.
e = s.groupby(['Company Name', 'Year'], as_index=False)['Mission_Cost'].sum()
e = e.loc[e['Mission_Cost'] > 0.0]
px.scatter(
    e,
    x='Year',
    y='Mission_Cost',
    color='Company Name',
    size='Mission_Cost',
    height=1000,
    title='Yearly distribution of Cost by Companies',
)

* In 1969,NASA invested their highest budget of 4640 US million dollars.
* During 1970-1980 minimum number of space missions are executed.
* During 1987-1988 again RVSN USSR launched its two highest budget missions.(around 5000 US million dollars).
* Since the beginning of the 21st century there has been a decrease in the cost of space programs because of better technology leading to cost optimizations, and an increase in the number of space programs by other countries.
* During 1970-2000,a large number of space programs initiated were expensive.

## **Does a higher cost always guarantee the success of a space mission?**

In [None]:
# Bars of mission cost per company, coloured by mission outcome; only missions
# with a positive recorded cost are included.
m = s.loc[s['Mission_Cost'] > 0.0]
px.bar(
    m,
    y='Company Name',
    x='Mission_Cost',
    color='Mission_Status',
    height=1000,
    width=1000,
    title='Distribution of Mission Status and Cost Over Companies',
)

* NASA has the highest budget among all the companies.
* Higher budgets don't necessarily deliver successful missions; cost doesn't guarantee a higher success rate.

## **How much did each company invest per month?**

In [None]:
# Total recorded cost per (company, calendar month); zero totals are dropped.
me = s.groupby(['Company Name', 'Month'], as_index=False)['Mission_Cost'].sum()
me = me.loc[me['Mission_Cost'] > 0.0]
px.bar(
    me,
    x='Month',
    y='Mission_Cost',
    color='Company Name',
    height=500,
    title='Monthly distribution of Cost by Companies',
)

* Most of the high budget space missions happened around May and November.
* Arianespace has the lowest budget space programs around each month.

## **Which company has the most active rockets?**

In [None]:
# Number of rows with an 'Active' rocket status per company.
a = s.loc[s['Rocket_status'] == 'Active']
r = (
    a.groupby(['Company Name'])['Rocket_status']
    .count()
    .reset_index(name='Active_Rockets')
)
px.bar(
    r,
    y='Company Name',
    x='Active_Rockets',
    title='Distribution of Active Rockets Over Companies',
    color='Active_Rockets',
    height=1000,
)

CASC has the most number of active rockets working currently(around 211).

## **Which company has the most number of retired rockets?**

In [None]:
# Number of rows with a 'Retired' rocket status per company.
r = s.loc[s['Rocket_status'] == 'Retired']
f = (
    r.groupby(['Company Name'])['Rocket_status']
    .count()
    .reset_index(name='Retired_Rockets')
)
px.bar(
    f,
    y='Company Name',
    x='Retired_Rockets',
    title='Distribution of Retired Rockets Over Companies',
    color='Retired_Rockets',
    height=1000,
)

RVSN USSR has most number of retired rockets (1777).

In [None]:
# Take the last ', '-separated component of Location as the country.
location_parts = s['Location'].str.split(', ')
s['Country'] = location_parts.str.get(-1)

In [None]:
# Preview the frame with the new Country column.
s.head()

In [None]:
# Inspect the extracted Country values and how often each occurs.
s['Country'].value_counts()

Some launch sites and regions appear instead of countries, so they should be mapped to their respective countries.

In [None]:
# Map launch sites / regions (and the short name 'Russia') to country names,
# grouped here by the country each one is assigned to.
countries_dict = {
    **dict.fromkeys(['Russia', 'Barents Sea'], 'Russian Federation'),
    **dict.fromkeys(
        ['New Mexico', 'Pacific Missile Range Facility', 'Gran Canaria'], 'USA'
    ),
    'Yellow Sea': 'China',
    'Shahrud Missile Test Site': 'Iran',
}
s['Country'] = s['Country'].replace(countries_dict)

In [None]:
# Re-check the Country values after applying the mapping.
s['Country'].value_counts()

USA and the Russian Federation, rivals during the Cold War era (which ended in 1991), are among the top countries in the space race.

## **Which country has maximum number of launches?**

In [None]:
# Launch totals per country, drawn as a horizontal bar chart.
ds = (
    s['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Number of launches')
)
px.bar(
    ds,
    x='Number of launches',
    y='Country',
    orientation='h',
    title='Number of Space Missions Launched by the countries',
    height=1000,
    width=1000,
    color='Country',
)

Russian Federation has the most number of launches(1398) followed by USA(1351).

## **Which country has the higher average budget?**

In [None]:
# Mean recorded mission cost per country, using only missions with a positive cost.
b = s.loc[s['Mission_Cost'] > 0.0]
av = (
    b.groupby(['Country'])['Mission_Cost']
    .mean()
    .reset_index(name='Average_Budget')
)
px.bar(av, x='Average_Budget', y='Country', title='Average Budget of Each Country')

Kazakhstan has the highest average budget of 264.14 US million dollars.

## **How much money do the countries spend on average per year and per month?**

In [None]:
# Mean recorded mission cost per (year, country), one facet row per country.
mo = (
    s.loc[s['Mission_Cost'] > 0]
    .groupby(['Year', 'Country'], as_index=False)['Mission_Cost']
    .mean()
)
px.line(
    mo,
    x='Year',
    y='Mission_Cost',
    facet_row='Country',
    height=1500,
    width=1000,
    title='Average Money Spent By Countries per Year',
    color='Country',
)

In [None]:
# Mean recorded mission cost per (calendar month, country), one facet row per country.
mo = (
    s.loc[s['Mission_Cost'] > 0]
    .groupby(['Month', 'Country'], as_index=False)['Mission_Cost']
    .mean()
)
px.line(
    mo,
    x='Month',
    y='Mission_Cost',
    facet_row='Country',
    height=1500,
    width=1000,
    title='Average Money Spent By Countries per Month',
    color='Country',
)

## **Which country has the most successful launches?**

In [None]:
# Count successful missions per (Year, Country).
# The grouping must run on the success-filtered frame: grouping the full frame
# `s` would discard the filter and count every launch regardless of outcome,
# contradicting the chart title.
successes = s[s['Mission_Status'] == 'Success']
ds = (
    successes.groupby(['Year', 'Country'])['Mission_Status']
    .count()
    .reset_index()
    .sort_values(['Year', 'Mission_Status'], ascending=False)
)
ds.columns = ['Year', 'Country', 'Number of launches']
px.scatter(
    ds,
    x='Year',
    y="Number of launches",
    title='Number of Successful Space Missions Launched by the countries',
    color='Country',
    size="Number of launches",
)

* From 1970-1990 Russian Federation has the most successful number of launches.(20 years)
* From 1990-2017 NASA dominated the world.(27 years)
* From 2017 to the present, China is starting to show growth, as are other Asian countries such as Kazakhstan.

**But the big picture is that the USA is the most successful country in terms of its space expeditions, as well as the experience of NASA.**

If you like this notebook do upvote it.

Do provide your valuable feedback.

Do checkout my other notebooks at https://www.kaggle.com/tmchls