In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# Location of the space-missions CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/all-space-missions-from-1957/Space_Corrected.csv'

# Read the CSV into the working frame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

In [None]:
# (number of rows, number of columns) of the frame as loaded
df.shape

# **Quick look into the columns and datatypes**

In [None]:
# Per-column summary: names, non-null counts and dtypes
df.info()

# Look at 5 random rows from the dataset

In [None]:
# Five random rows; no random_state is passed, so the selection differs on every run
df.sample(5)

# **The output above shows that the unnamed columns are redundant, so we will remove them**

In [None]:
# Drop the redundant columns that carry an auto-generated "Unnamed…" label.
# Selecting them by name instead of by position keeps this cell idempotent:
# running it a second time leaves the frame untouched rather than stripping
# two more (real) columns.
# NOTE(review): assumes the redundant columns are exactly those whose label
# starts with "Unnamed" — confirm against the df.info() output.
df = df.loc[:, ~df.columns.str.startswith('Unnamed')].copy()


In [None]:
# Lift the cap on displayed columns so wide frames are shown in full,
# then preview five random rows.
pd.options.display.max_columns = None
df.sample(5)

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
import missingno as mno
# Print the frame's dimensions, then draw missingno's matrix plot of the frame
# to visualise where values are missing.
print(df.shape)
mno.matrix(df)

In [None]:
# One row per column: its name, how many values are missing, and what share
# of all rows that represents.
missing = (
    df.isna()
    .sum()
    .rename_axis('Variables')
    .reset_index(name='Missing')
)
missing['Percentage'] = missing['Missing'].div(df.shape[0]).mul(100)
missing

# The Rocket column has many missing values. Because its datatype is string, let's fill them with the mode (most frequent) value

In [None]:
# Fill the gaps in the ' Rocket' column with its most frequent value.
# The fill value is taken from the data instead of a hard-coded literal, so it
# always matches the mode printed below and is formatted exactly like the
# existing entries.
# (The original author observed the mode to be 450, in $M.)
rocket_mode = df[' Rocket'].mode()
print(rocket_mode)

# mode() is empty when the column holds no values at all; leave it untouched then.
if not rocket_mode.empty:
    df[' Rocket'] = df[' Rocket'].fillna(rocket_mode.iloc[0])

# Statistics

In [None]:
# Summary statistics, transposed so that each original column becomes a row
stats = df.describe().transpose()
stats

# **Let us create a country column from the location column**

In [None]:
# The country is the text after the last comma of the location string.
# Surrounding whitespace is stripped so that values read "USA" rather than
# " USA"; locations without a comma are kept whole. The vectorised .str
# accessor also passes missing locations through instead of raising.
df['Country'] = df['Location'].str.rsplit(',', n=1).str[-1].str.strip()
df.sample(5)

In [None]:
# Launch counts per country, most frequent first, limited to the first ten
country_df = df['Country'].value_counts().iloc[:10]

# **Top 10 countries chart**

In [None]:
# Horizontal bar chart of the ten countries with the most launches.
# A title and axis labels are set explicitly so the figure can be read on its
# own; without them the count axis is unlabeled.
sns.set_theme(style="darkgrid")
ax = sns.barplot(x=country_df.values, y=country_df.index)
ax.set(
    title='Top 10 countries by number of launches',
    xlabel='No. of launches',
    ylabel='Country',
)
plt.show()

1. Russia and the USA launched almost the same number of space vehicles


# **Fewer rockets are active than retired**

In [None]:
# One bar per rocket status, bar height = number of rows with that status
sns.countplot(data=df, x='Status Rocket')

In [None]:
# Exact row counts behind the rocket-status bars
df['Status Rocket'].value_counts()

In [None]:
# Number of distinct values in each column
df.nunique()

# **Visualising the Success Rates**

In [None]:
# One bar per mission outcome, bar height = number of rows with that outcome
sns.countplot(data=df, x='Status Mission')

# **Take-aways**
1. Success Rate is more than 90%
2. Prelaunch failures are negligible or non-existent
 

# **Explore which companies have a high success rate**

In [None]:
# Frequency table (not a plot): the ten companies with the most rows in the frame
df['Company Name'].value_counts().head(10)

In [None]:
# Rows belonging to RVSN USSR, filtered once and reused below.
ussr_launches = df[df['Company Name'] == 'RVSN USSR']

# Count every (rocket status, mission status) combination for that company
# and express each count as a percentage of the company's rows.
ussr = (
    ussr_launches[['Status Rocket', 'Status Mission']]
    .value_counts()
    .to_frame('Count')
)
ussr['Percentage'] = ussr['Count'] / len(ussr_launches) * 100
ussr

# Interestingly, 90.8% of RVSN USSR's missions were successful, but all of its rockets are retired

# Status of the other companies

In [None]:
# Per company: number of rows whose rocket status is "StatusActive", largest first.
df_active = df[df['Status Rocket'] == "StatusActive"]
df_active = df_active.groupby('Company Name').count()['Detail'].sort_values(ascending=False).reset_index()

# Per company: total number of rows, largest first.
companies = df.groupby(['Company Name'])['Detail'].count().sort_values(ascending=False).reset_index()

# A bare `len(...)` in the middle of a cell is evaluated but never displayed,
# so the two sizes are printed explicitly.
print(f"Companies with active rockets: {len(df_active)}")
print(f"Companies overall: {len(companies)}")

# Rows 1-39 of the ranking: the company with the most rows (row 0) is skipped.
# NOTE(review): the name says "top 20" but the slice keeps up to 39 companies,
# and skipping row 0 presumably excludes the dominant company — confirm intent.
top_20 = companies[1:40]

# Row counts per (company, rocket status), restricted to the selected companies.
cmp = df.groupby(['Company Name', 'Status Rocket']).count()['Detail'].reset_index()
cmp = cmp[cmp['Company Name'].isin(top_20['Company Name'])]
active = cmp[cmp['Status Rocket'] == "StatusActive"].sort_values('Detail')
retired = cmp[cmp['Status Rocket'] != "StatusActive"]

# Stacked bars: active vs. non-active rocket counts for each company.
# The title names the rocket status, which is what the bars actually show
# (the data comes from the 'Status Rocket' column, not 'Status Mission').
fig = go.Figure()
fig.add_bar(y=active['Detail'], x=active['Company Name'], name='Status Active')
fig.add_bar(y=retired['Detail'], x=retired['Company Name'], name='Status Retired')
fig.update_layout(barmode="stack", title="Companies and Rocket Status", yaxis_title="No of Missions")
fig.show()

# **Time to split the date string into weekday, month and year features**

In [None]:
# Split every Datum string on whitespace once and reuse the tokens, instead of
# re-splitting the same string for each new column.
# Token 0 is the weekday abbreviation and token 1 the month abbreviation (both
# are matched against 'Mon'…/'Jan'… lists further down); token 3 is stored as
# the year.
# NOTE(review): with the .str accessor a Datum that is missing or has fewer
# than four tokens yields NaN instead of raising — confirm that is acceptable.
_datum_tokens = df['Datum'].str.split()
df['day'] = _datum_tokens.str[0]
df['Month'] = _datum_tokens.str[1]
df['year'] = _datum_tokens.str[3]
df.head()

# Monthwise Space Launches

In [None]:
# Calendar order for the x-axis (the default would follow order of appearance).
order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Draw the per-month launch counts on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(data=df, x='Month', order=order, ax=ax)

# Title and axis labels.
ax.set_title('No. of Launches by Month', fontsize=20)
ax.set_xlabel('Month', fontsize=10)
ax.set_ylabel('No. of Launches', fontsize=10)
plt.show()

# Daywise Space Launches

In [None]:
# Weekday abbreviations in calendar order.
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Launch count per weekday; the weekday column becomes an ordered categorical
# so that sorting follows the calendar rather than the alphabet.
df_days = (
    df.groupby('day')['Detail']
    .count()
    .reset_index()
    .astype({'day': pd.CategoricalDtype(categories=days, ordered=True)})
    .sort_values('day')
)

plt.figure(figsize=(11, 4))
sns.barplot(data=df_days, x='day', y='Detail')
plt.ylabel('No of launches')
# Assigning the return value keeps the Text repr out of the cell output.
b = plt.title(' Day vs No of launches')

# Yearwise Space Launches

In [None]:
# Launch count per year.
date = df.groupby('year')['Detail'].count().reset_index()

# Bar chart of the yearly counts, labelled through the returned axes object.
plt.figure(figsize=(16, 6))
b = sns.barplot(data=date, x='year', y='Detail')
b.set_ylabel('no of launches')
b.set_title(' No of launches per year')

# Rotate the year labels so they do not overlap; `_ =` hides the returned list.
_ = b.set_xticklabels(b.get_xticklabels(), rotation=90, horizontalalignment='right')