In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

In [None]:
## load the space-missions CSV from the Kaggle input directory into a DataFrame

sd = pd.read_csv(
    filepath_or_buffer='/kaggle/input/all-space-missions-from-1957/Space_Corrected.csv',
)

In [None]:
## preview the first five rows of the freshly loaded data

sd.head(n=5)

In [None]:
## parse the Datum column into timezone-aware (UTC) timestamps

sd = sd.assign(Datum=pd.to_datetime(sd['Datum'], utc=True))

In [None]:
## give the columns used below underscore-style names
## (note the price column's raw header is ' Rocket', with a leading space)

column_renames = {
    ' Rocket': 'Rocket_Price',
    'Company Name': 'Company_Name',
    'Status Rocket': 'Status_Rocket',
    'Status Mission': 'Status_Mission',
}
sd = sd.rename(columns=column_renames)

In [None]:
## count the missing entries in each column

sd.isna().sum()

In [None]:
## fill missing values for Rocket_Price with 0
## Only the price column is filled: a bare sd.fillna(0) would also write 0 into
## every other column that happens to have gaps, which is not what is intended here.

sd = sd.fillna({'Rocket_Price': 0})

In [None]:
## change datatype for rocket_price
## errors='coerce' silently turns anything it cannot parse into NaN, so strip
## whitespace and thousands separators first (a price written like '1,160.0'
## would otherwise be lost). Whatever still fails to parse is mapped to 0 so the
## "0 means no recorded price" convention used by the later cells keeps holding.

price_text = sd['Rocket_Price'].astype(str).str.strip().str.replace(',', '', regex=False)
sd['Rocket_Price'] = pd.to_numeric(price_text, errors='coerce').fillna(0)

In [None]:
## remove the columns at positions 0, 1 and 5 (selected by position, not by name)

unwanted_columns = sd.columns[[0, 1, 5]]
sd = sd.drop(columns=unwanted_columns)

In [None]:
## add new columns

# calendar parts of the launch timestamp
sd['Day'] = sd['Datum'].dt.day_name()
sd['Month'] = sd['Datum'].dt.month_name()
sd['Year'] = sd['Datum'].dt.year

# last comma-separated piece of Location; strip it so the label carries no
# surrounding whitespace (splitting 'Site, USA' on ',' would otherwise leave ' USA')
sd['Exact_Location'] = sd['Location'].str.rsplit(',', n=1).str[-1].str.strip()

In [None]:
## renumber the rows so the index runs from 1 to len(sd) instead of starting at 0

row_count = len(sd)
sd.index = np.arange(row_count) + 1

In [None]:
## inspect the first 20 rows of the cleaned frame

sd.iloc[:20]

In [None]:
## split the frame by price: rows whose Rocket_Price equals 0 (the fill value
## for a missing price) go to sd_null, every other row goes to sd_valid

price_is_zero = sd['Rocket_Price'] == 0
sd_null = sd[price_is_zero]
sd_valid = sd[~price_is_zero]



#  Data Visualization for Null and Non-Null Entries


In [None]:
## horizontal bar chart of launch counts per location, most frequent first

plt.figure(figsize=(17, 17))
location_order = sd['Exact_Location'].value_counts().index
figure_country_null = sns.countplot(
    data=sd,
    y='Exact_Location',
    order=location_order,
    palette='Blues_r',
)
figure_country_null.set_title('Number of Launches Per Country', fontsize=35)
figure_country_null.set_xlabel('Frequency', fontsize=30)
figure_country_null.set_ylabel('Location', fontsize=30)
plt.show()

In [None]:
## ratio of valid rocket price records to total records 

country_full = sd['Exact_Location'].value_counts()
country_null = sd_null['Exact_Location'].value_counts()
country_valid = sd_valid['Exact_Location'].value_counts()
# A location with no valid records is absent from country_valid; fill_value=0
# makes its share 0.0 instead of NaN after index alignment.
print(country_valid.div(country_full, fill_value=0))

In [None]:
## ratio of null rocket price to total records 
## A location with no null records is absent from country_null; fill_value=0
## makes its share 0.0 instead of NaN after index alignment.

print(country_null.div(country_full, fill_value=0))

In [None]:
## launches per location among the rows that have no rocket price, most frequent first

plt.figure(figsize=(17, 17))
null_location_order = sd_null['Exact_Location'].value_counts().index
figure_country_null = sns.countplot(
    data=sd_null,
    y='Exact_Location',
    order=null_location_order,
    palette='Blues_r',
)
figure_country_null.set_title('Number of Launches Without Rocket Price Record', fontsize=35)
figure_country_null.set_ylabel('Location', fontsize=30)
figure_country_null.set_xlabel('Frequency', fontsize=30)
plt.show()

In [None]:
## recorded launches with valid rocket price information 

plt.figure(figsize = (15,17))
figure_country_nonnull = sns.countplot(y='Exact_Location', data = sd_valid, order = sd_valid['Exact_Location'].value_counts().index, 
                                 palette= 'BuGn_r')
plt.title('Number of Launches With Rocket Price Record', fontsize = 35)
# The bars are horizontal (y='Exact_Location'): locations sit on the y axis and
# counts on the x axis, so the labels must follow that orientation.
plt.ylabel('Location', fontsize = 30)
plt.xlabel('Frequency', fontsize = 30)
plt.show()

Things to take away: 


1) Both Kazakhstan and Russia have approximately 94% and 96% missing values for rocket price, respectively.  

2) France and Japan both have approximately 68% missing values for Rocket Price.

3) USA has over 63% missing values for Rocket Price. 

4) India has the highest percentage of valid Rocket Price entries with 88%.

5) USA has the highest frequency of launches with recorded Rocket Price. 

In [None]:
## launch counts per year among rows without a rocket price, most frequent year first

plt.figure(figsize=(16, 24))
null_year_order = sd_null['Year'].value_counts().index
figure_year_null = sns.countplot(
    data=sd_null,
    y='Year',
    order=null_year_order,
    palette='BuGn_r',
)
figure_year_null.set_title('Years With Most Null Rocket Price Values', fontsize=30)
figure_year_null.set_ylabel('Year', fontsize=30)
figure_year_null.set_xlabel('Frequency', fontsize=30)
plt.show()

In [None]:
## launch counts per year among rows with a rocket price, most frequent year first

plt.figure(figsize=(20, 24))
valid_year_order = sd_valid['Year'].value_counts().index
figure_year_nonnull = sns.countplot(
    data=sd_valid,
    y='Year',
    order=valid_year_order,
    palette='Blues_r',
)
figure_year_nonnull.set_title('Years With Most Non-Null Rocket Price Values', fontsize=30)
figure_year_nonnull.set_ylabel('Year', fontsize=30)
figure_year_nonnull.set_xlabel('Frequency', fontsize=30)
plt.show()

In [None]:
## finding ratios 

# launches per year in the null subset, the valid subset and the full frame
year_null = sd_null['Year'].value_counts()
year_valid = sd_valid['Year'].value_counts()
year_origional = sd['Year'].value_counts()

# Null-to-valid ratio per year. A plain `/` aligns on the year index and yields
# NaN for any year present in only one of the two counts, which then drops out
# of BOTH filters below. With fill_value=0 a year with no valid records becomes
# inf (so it lands in the "greater" group) and a year with no null records
# becomes 0 (so it lands in the "less" group).
ratio = year_null.div(year_valid, fill_value=0)
null_less_than_valid = ratio[ratio < 1]
null_greater_than_valid = ratio[ratio > 1]


In [None]:
## year ratio of null to valid rocket price where null is less than valid (top 10)
## `[:10]` only takes the first ten entries in index order; rank by the ratio
## instead so the ten years where null records are rarest relative to valid
## ones are shown.

null_less_than_valid = ratio[ratio < 1]
null_less_than_valid.nsmallest(10)

In [None]:
## year ratio of null to valid rocket price where null is greater than valid (top 10)
## `[:10]` only takes the first ten entries in index order; rank by the ratio
## instead so the ten years where null records most outnumber valid ones are shown.

null_greater_than_valid = ratio[ratio > 1]
null_greater_than_valid.nlargest(10)

In [None]:
## launch counts per year over the whole frame, busiest year first

plt.figure(figsize=(20, 20))
year_order = sd['Year'].value_counts().index
year_launches = sns.countplot(
    data=sd,
    y='Year',
    order=year_order,
    palette='Blues_r',
)
year_launches.set_title('Launches per Year', fontsize=30)
year_launches.set_ylabel('Year', fontsize=30)
year_launches.set_xlabel('Number of Launches', fontsize=30)
plt.show()

In [None]:
# ten years with the most launches (value_counts sorts by count, descending)
sd['Year'].value_counts().head(10)

Things to take away: 

1) Years between 1964 and 1973 had the highest frequency of null Rocket Price values.

2) Excluding 2018 and 2019, the frequency of rocket launches was highest between 1968 and 1977. 

3) More modern years, 1990 to 2020, have the highest recorded Rocket Price entries. 

In [None]:
## launches per company among rows without a rocket price, most frequent first

plt.figure(figsize=(15, 25))
null_company_order = sd_null['Company_Name'].value_counts().index
figure_companyname_null = sns.countplot(
    data=sd_null,
    y='Company_Name',
    order=null_company_order,
    palette='Blues_r',
)
figure_companyname_null.set_title('Company Launches Without Rocket Price Record', fontsize=35)
figure_companyname_null.set_ylabel('Company', fontsize=30)
figure_companyname_null.set_xlabel('Frequency', fontsize=30)
plt.show()

In [None]:
## launches per company among rows with a rocket price, most frequent first

plt.figure(figsize=(15, 20))
valid_company_order = sd_valid['Company_Name'].value_counts().index
figure_companyname_null = sns.countplot(
    data=sd_valid,
    y='Company_Name',
    order=valid_company_order,
    palette='Blues_r',
)
figure_companyname_null.set_title('Company Launches With Rocket Price Record', fontsize=35)
figure_companyname_null.set_ylabel('Company', fontsize=30)
figure_companyname_null.set_xlabel('Frequency', fontsize=30)
plt.show()

Things to take away: 

1) RVSN USSR had significantly more null Rocket Price entries than any other company. 

2) Most of USA's null values were spread between several companies. 

3) CASC has the highest valid Rocket Price frequency, followed closely by NASA. 

In [None]:
# launches per calendar month among rows without a rocket price, busiest month first
plt.figure(figsize=(20, 12))
null_month_order = sd_null['Month'].value_counts().index
launches_by_day = sns.countplot(
    data=sd_null,
    x='Month',
    order=null_month_order,
    palette='Blues_r',
)
launches_by_day.set_title('Launch Month With Null Record', fontsize=25)
launches_by_day.set_xlabel('Month', fontsize=25)
launches_by_day.set_ylabel('Frequency', fontsize=25)
plt.show()

In [None]:
# launches per calendar month among rows with a rocket price, busiest month first
plt.figure(figsize=(20, 11))
valid_month_order = sd_valid['Month'].value_counts().index
launches_by_day = sns.countplot(
    data=sd_valid,
    x='Month',
    order=valid_month_order,
    palette='Blues_r',
)
launches_by_day.set_title('Launch Month With Valid Record', fontsize=25)
launches_by_day.set_ylabel('Frequency', fontsize=25)
launches_by_day.set_xlabel('Month', fontsize=25)
plt.show()

In [None]:
# launches per weekday among rows with a rocket price, busiest day first
plt.figure(figsize=(10, 10))
valid_day_order = sd_valid['Day'].value_counts().index
launches_by_day = sns.countplot(
    data=sd_valid,
    x='Day',
    order=valid_day_order,
    palette='Blues_r',
)
launches_by_day.set_title('Launch Day With Valid Rocket Price', fontsize=25)
launches_by_day.set_xlabel('Day', fontsize=25)
launches_by_day.set_ylabel('Frequency', fontsize=25)
plt.show()

In [None]:
# launches per weekday among rows without a rocket price, busiest day first
plt.figure(figsize=(10, 10))
null_day_order = sd_null['Day'].value_counts().index
launches_by_day = sns.countplot(
    data=sd_null,
    x='Day',
    order=null_day_order,
    palette='Blues_r',
)
launches_by_day.set_title('Launch Day With Null Rocket Price', fontsize=25)
launches_by_day.set_xlabel('Day', fontsize=25)
launches_by_day.set_ylabel('Frequency', fontsize=25)
plt.show()

Things to take away: 

1) December had the most launches, regardless of whether the Rocket Price value was valid or null. 

2) All other months are quite similar in frequency with January and February having the least frequencies in general for both the valid and null datasets. 

3) All the days are relatively similar in distribution except for Sunday which has the lowest frequency for both the valid and null datasets. 

# Conclusion

By analyzing the null values of the original dataset, a couple of key insights were found. The null values all occurred within the variable called 'Rocket Price', which displays the amount of money used to make and launch the rocket itself. This is a useful value, as it can show how advanced the rocket is (the more expensive, the more advanced), which can then be used to predict which countries were making better progress than others. From the analysis, the key insights were: 

1) Russia had the most missing Rocket Price data with over 96% missing, then Kazakhstan 94%, Japan 68%, France 68%, and USA 63%. 

2) The years of most frequent missing values was between 1964 and 1973 (Middle of the space race). 

3) The years of most frequent valid values came from 1990-2020. 

4) RVSN USSR was the company with by far the most null values. 

5) December was the month with the most launches. 

From this information we can see that there was definitely some competition between nations during the late 60s and 70s, with the USSR coming in with the most launches as well as the most missing values. By observing the frequency of null values for different variables, we can see trends that support the suspicion that the Space Race was fraught with competition and secrecy compared to more recent years.  


Feedback is appreciated! Thanks. 
