In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [None]:
# Load the space-missions CSV into the working frame used by every later cell.
# NOTE(review): the relative '../input/...' path presumably matches a Kaggle-style
# input directory -- adjust it when running elsewhere.
space_df=pd.read_csv('../input/all-space-missions-from-1957/Space_Corrected.csv')

# Let's take a quick peek at the dataset

In [None]:
# Summarize the frame: column names, non-null counts and dtypes.
space_df.info()

In [None]:
# Preview the first 10 rows.
space_df.head(10)

In [None]:
# The unnamed columns are not needed, so remove them before proceeding.
# They are dropped by name rather than by position: a positional slice
# (iloc[:, 2:]) removes two more columns every time the cell is re-run,
# whereas this selection is idempotent.
# NOTE(review): assumes the unwanted columns are labelled 'Unnamed...' (the
# label pandas gives header-less CSV columns) -- confirm against the file.
unnamed_cols = [col for col in space_df.columns if str(col).startswith('Unnamed')]
space_df = space_df.drop(columns=unnamed_cols)

# Preview the first rows instead of rendering the whole frame.
space_df.head()

# In total we have 7 variables, which give a basic description of the rocket launches!

### Visualizing the missing values with the missingno library!

In [None]:
# Visualize where values are missing using missingno's matrix plot.
import missingno as mno
# Print the (rows, columns) shape alongside the plot for reference.
print(space_df.shape)
mno.matrix(space_df)

### Variables with their number of missing values and percentage!!

In [None]:
# One row per variable: number of missing values and their share of all rows.
missing = (
    space_df.isna()
    .sum()
    .rename_axis('Variables')
    .reset_index(name='Missing')
)
missing['Percentage'] = missing['Missing'] / len(space_df) * 100
missing

* ### The Rocket variable records the cost of the mission, in millions of dollars
* ### The Rocket variable has nearly 77.7 percent missing values, which makes it quite unreliable!
* ### If fewer than 5 percent of the values were missing, it would be fine to drop those records! But since the share is so high (nearly 77.7 percent), we have to impute the missing values!
* ### Before imputing the values with either the mean or the median, we must know the distribution of the Rocket variable

In [None]:
# The cost values are stored as strings, so the gaps are filled with the most
# frequent value (the mode) rather than a mean or median.
rocket_mode = space_df[' Rocket'].mode()
print(rocket_mode)

# Fill with the computed mode instead of a hard-coded literal, so the imputed
# value always matches the exact string present in the column (the original
# analysis observed a mode of 450 $M).
if not rocket_mode.empty:
    # mode() can return several ties; take the first one.
    space_df[' Rocket'] = space_df[' Rocket'].fillna(rocket_mode.iloc[0])


# Let's look into some statistics of each variable!!

In [None]:
# Per-variable summary (count / unique / top / freq), one row per variable.
Stat = space_df.describe().transpose()
# Share of rows taken by the most frequent value of each variable.
Stat = Stat.assign(Percent=Stat['freq'] / Stat['count'] * 100)
Stat

# Space launches - country wise!

In [None]:
# The country is the last comma-separated token of the launch location. Strip
# it so the labels carry no surrounding whitespace (e.g. from ", " separators),
# which would otherwise break exact comparisons such as Country == 'USA'.
space_df['Country'] = space_df['Location'].str.split(',').str[-1].str.strip()

# Number of launches per country, largest first.
country = (
    space_df.groupby('Country')['Detail']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={"Detail": "No of Launches"})
)

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0. Prefer
# Styler.hide(axis='index') when it exists and fall back on older versions.
country_style = country.head(10).style.background_gradient(cmap='Oranges')
if hasattr(country_style, 'hide'):
    country_style = country_style.hide(axis='index')
else:
    country_style = country_style.hide_index()
country_style


# Key takeaways
1. ### 41% of the rocket launches are by RVSN USSR
2. ### 5% of the launches take place from Kazakhstan
3. ### 81.7% of the rockets have been retired 
4. ### 89% of the rockets have been successfully launched into the sky!

# Since RVSN USSR PLAYS A MAJOR ROLE IN ROCKET LAUNCHES! Let's see what their success rate is

In [None]:
# Every launch whose operating company is RVSN USSR.
space_df.loc[space_df['Company Name'].eq('RVSN USSR')]

# Rocket status and mission status of USSR!

### Most of the rockets launched by RVSN USSR are retired, which is expected as the USSR no longer exists! It's interesting that nearly 90% of their launches were successful!

In [None]:
# Restrict to RVSN USSR launches once, then reuse the subset for both the
# counts and the denominator of the percentage.
ussr_launches = space_df.loc[space_df['Company Name'] == 'RVSN USSR']

# Count each (rocket status, mission status) combination.
ussr = ussr_launches[['Status Rocket', 'Status Mission']].value_counts().to_frame()
ussr.columns = ['Count']
ussr['Percentage'] = ussr['Count'] / len(ussr_launches) * 100
ussr

# Let's try to explore the other areas of interest as well!

### Visualization of the companies and their rocket status!

In [None]:
# Launches whose rocket status is active, counted per company (largest first).
df_active = space_df[space_df['Status Rocket'] == "StatusActive"]
df_active = df_active.groupby('Company Name')['Detail'].count().sort_values(ascending=False).reset_index()

# Total launches per company, largest first.
companies = space_df.groupby('Company Name')['Detail'].count().sort_values(ascending=False).reset_index()

# NOTE(review): despite its name, `top_20` holds ranks 2-40 (39 companies); the
# slice skips the company with the most launches. The selection is kept as-is
# so the rendered chart is unchanged -- confirm whether this is intentional.
top_20 = companies[1:40]

# Launch counts per (company, rocket status), restricted to the selected companies.
cmp = space_df.groupby(['Company Name', 'Status Rocket'])['Detail'].count().reset_index()
cmp = cmp[cmp['Company Name'].isin(top_20['Company Name'])]
active = cmp[cmp['Status Rocket'] == "StatusActive"].sort_values('Detail')
retired = cmp[cmp['Status Rocket'] != "StatusActive"]

# Stacked bars: active vs. non-active rocket status per company.
fig = go.Figure()
fig.add_bar(y=active['Detail'], x=active['Company Name'], name='Status Active')
fig.add_bar(y=retired['Detail'], x=retired['Company Name'], name='Status Retired')
# The bars are split by 'Status Rocket', so the title names the rocket status
# (it previously said "Mission Status", which is a different column).
fig.update_layout(barmode="stack", title="Companies and Rocket Status", yaxis_title="No of Missions")
fig.show()

### CASC has a very high percentage of active rockets compared to the other companies!!

# Companies with active rocket status

In [None]:
# Distinct company names that have at least one row with an active rocket status.
activers = cmp.loc[cmp['Status Rocket'].eq('StatusActive'), 'Company Name'].unique()
activers

# Let's see how the launches are distributed geographically!

In [None]:
# Single choropleth trace: regions are matched by country name and shaded by
# the number of launches.
launch_trace = go.Choropleth(
    locations=country['Country'],
    locationmode='country names',
    z=country["No of Launches"],
    text=country['Country'],
    colorbar=dict(title='No of Launches'),
    colorscale='purples',
)
map_data = [launch_trace]

# Frameless equirectangular world map.
geo_options = dict(showframe=False, projection=dict(type='equirectangular'))
layout = dict(title='Countries wise Rocket Launches', geo=geo_options)

world_map = go.Figure(data=map_data, layout=layout)
iplot(world_map)
print("The more darker the region denotes more number of launches from that respective country!!")

# Now let's look into the data from time perspective!!

#### Split the Datum column into day, month and year! Alternatively, you could parse it with the datetime library

In [None]:

# Split each Datum string on whitespace once and reuse the tokens, instead of
# re-splitting the same string in three separate apply() passes. The .str
# accessor also yields NaN for missing values rather than raising.
# NOTE(review): presumably Datum looks like '<weekday> <month> <dd>, <yyyy> ...',
# so tokens 0, 1 and 3 are the weekday, month and year -- confirm against the data.
datum_parts = space_df['Datum'].str.split()
space_df['day'] = datum_parts.str[0]
space_df['Month'] = datum_parts.str[1]
space_df['year'] = datum_parts.str[3]


# Number of launches - Month wise!

In [None]:
# Calendar order for the x axis (the default order would follow the data).
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig, ax = plt.subplots(figsize=(20, 10))
# Draw on the explicit axes, then label it.
sns.countplot(x='Month', data=space_df, order=month_order, ax=ax)
ax.set_title('No. of Launches by Month', fontsize=20)
ax.set_xlabel('Month', fontsize=15)
ax.set_ylabel('No. of Launches', fontsize=15)
plt.show()

### December has the highest number of launches! Looks like a Christmas present!!

# Number of launches - Days basis!

In [None]:
# Weekday order for the x axis, Monday first.
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Launch count per weekday, sorted by weekday order rather than alphabetically.
df_days = (
    space_df.groupby('day')['Detail']
    .count()
    .reset_index()
    .assign(day=lambda frame: pd.Categorical(frame['day'], categories=days, ordered=True))
    .sort_values('day')
)

plt.figure(figsize=(11, 4))
sns.barplot(x='day', y='Detail', data=df_days)
plt.ylabel('No of launches')
# Assigning the return value keeps the Text repr out of the cell output.
b = plt.title(' Day vs No of launches')

### Looks like most of the launches are between Tuesday and Friday!

# Number of launches - yearly basis

In [None]:
# Launch count per year.
date = space_df.groupby('year')['Detail'].count().reset_index()

plt.figure(figsize=(20, 6))
b = sns.barplot(x='year', y='Detail', data=date)
# Label through the returned axes object instead of the pyplot state machine.
b.set_ylabel('no of launches')
b.set_title(' No of launches per year')
# Rotate the year labels so they do not overlap; `_` swallows the returned list.
_ = b.set_xticklabels(b.get_xticklabels(), rotation=90, horizontalalignment='right')

### The highest numbers of launches occurred in the years 2018 and 1971

## Kindly upvote! If you find it useful!!

Thanks to,
References: 
1. https://www.kaggle.com/rude009/space-missions-simple-eda-and-visualizations
2. https://www.kaggle.com/sshuri/space-missions-eda-plotly