> ### Import Library

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

> ### Do analyst on the dataset

In [118]:
df = pd.read_csv("space_missions.csv", encoding='latin1')
df.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,Price,MissionStatus
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,,Success
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,,Success
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,16:44:00,Vanguard,Vanguard TV3,Retired,,Failure
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,03:48:00,Juno I,Explorer 1,Retired,,Success
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,07:33:00,Vanguard,Vanguard TV3BU,Retired,,Failure


> ### Let's handle the dataset

- #### Descriptive statistics for numerical columns

In [119]:
print("\nDescriptive Statistics:")
df.describe()


Descriptive Statistics:


Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,Price,MissionStatus
count,4630,4630,4630,4503,4630,4630,4630,1265,4630
unique,62,158,4180,1300,370,4556,2,65,4
top,RVSN USSR,"Site 31/6, Baikonur Cosmodrome, Kazakhstan",1962-04-26,12:00:00,Cosmos-3M (11K65M),DSP,Retired,450,Success
freq,1777,251,4,52,446,8,3620,136,4162


- #### Dataset information including data types and missing values

In [120]:
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4630 entries, 0 to 4629
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company        4630 non-null   object
 1   Location       4630 non-null   object
 2   Date           4630 non-null   object
 3   Time           4503 non-null   object
 4   Rocket         4630 non-null   object
 5   Mission        4630 non-null   object
 6   RocketStatus   4630 non-null   object
 7   Price          1265 non-null   object
 8   MissionStatus  4630 non-null   object
dtypes: object(9)
memory usage: 325.7+ KB


- #### Change the release and last update format from object to datetime

In [121]:
# Convert 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Convert 'Time' to time
df['Time'] = pd.to_datetime(df['Time']).dt.time

  df['Time'] = pd.to_datetime(df['Time']).dt.time


- #### Check Duplicated dataset

In [122]:
len(df.drop_duplicates()) / len(df)

0.9997840172786178

There are duplicated data!

In [123]:
df_cleaned = df.drop_duplicates()

In [124]:
len(df_cleaned.drop_duplicates()) / len(df_cleaned)

1.0

it's safe now

- #### Check Missing Values

In [145]:
df.describe()

Unnamed: 0,Date
count,4630
mean,1990-02-19 21:28:50.799136
min,1957-10-04 00:00:00
25%,1973-02-18 06:00:00
50%,1987-03-08 00:00:00
75%,2007-09-25 00:00:00
max,2022-07-29 00:00:00


In [126]:
print("\nMissing Values per Column:")
df_cleaned.isnull().sum()


Missing Values per Column:


Company             0
Location            0
Date                0
Time              127
Rocket              0
Mission             0
RocketStatus        0
Price            3365
MissionStatus       0
dtype: int64

In [127]:
# 1. Handle missing values in 'Time' column using forward-fill
df_cleaned.loc[:, 'Time'] = df_cleaned['Time'].fillna(method='ffill')

  df_cleaned.loc[:, 'Time'] = df_cleaned['Time'].fillna(method='ffill')


In [128]:
# drop price because we are not using it
df_cleaned = df_cleaned.drop(columns=['Price'])

In [129]:
print("\nMissing Values per Column (After Fixing):")
print(df_cleaned.isnull().sum())


Missing Values per Column (After Fixing):
Company          0
Location         0
Date             0
Time             0
Rocket           0
Mission          0
RocketStatus     0
MissionStatus    0
dtype: int64


In [130]:
df_cleaned.head( )

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,MissionStatus
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,Success
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,Success
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,16:44:00,Vanguard,Vanguard TV3,Retired,Failure
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,03:48:00,Juno I,Explorer 1,Retired,Success
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,07:33:00,Vanguard,Vanguard TV3BU,Retired,Failure


In [131]:
print("\nData Types After Fixing:")
print(df_cleaned.dtypes)


Data Types After Fixing:
Company                  object
Location                 object
Date             datetime64[ns]
Time                     object
Rocket                   object
Mission                  object
RocketStatus             object
MissionStatus            object
dtype: object


In [132]:
df_cleaned['Time'] = pd.to_datetime(df_cleaned['Time'], format='%H:%M:%S', errors='coerce').dt.time

In [133]:
print("\nData Types After Fixing:")
print(df_cleaned.dtypes)


Data Types After Fixing:
Company                  object
Location                 object
Date             datetime64[ns]
Time                     object
Rocket                   object
Mission                  object
RocketStatus             object
MissionStatus            object
dtype: object


In [134]:
df_cleaned.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,MissionStatus
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,Success
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,Success
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,16:44:00,Vanguard,Vanguard TV3,Retired,Failure
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,03:48:00,Juno I,Explorer 1,Retired,Success
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,07:33:00,Vanguard,Vanguard TV3BU,Retired,Failure


In [135]:
df_cleaned.to_csv("space.csv")

In [136]:
for col in df_cleaned.columns:
  print(f"==== {col} ====")
  print(df_cleaned[col].value_counts(), '\n')

==== Company ====
Company
RVSN USSR           1777
CASC                 337
Arianespace          293
General Dynamics     251
VKS RF               216
NASA                 203
SpaceX               182
US Air Force         161
ULA                  151
Boeing               136
Martin Marietta      114
Northrop              89
MHI                   87
ISRO                  82
Lockheed              79
Roscosmos             69
ILS                   46
Sea Launch            36
ISAS                  30
Rocket Lab            28
Kosmotras             22
Blue Origin           21
US Navy               17
ExPace                15
ISA                   15
ESA                   14
Eurockot              13
IAI                   11
Starsem               10
ASI                    9
AMBA                   8
CNES                   8
JAXA                   8
Land Launch            7
MITT                   7
Astra                  7
CASIC                  6
KCST                   5
UT                     5

In [137]:
print("==== Location ====")
print(df_cleaned['Location'].value_counts(), '\n')

==== Location ====
Location
Site 31/6, Baikonur Cosmodrome, Kazakhstan                                                 251
Site 132/1, Plesetsk Cosmodrome, Russia                                                    216
Site 43/4, Plesetsk Cosmodrome, Russia                                                     211
Site 41/1, Plesetsk Cosmodrome, Russia                                                     198
Site 1/5, Baikonur Cosmodrome, Kazakhstan                                                  193
Site 132/2, Plesetsk Cosmodrome, Russia                                                    174
Site 133/3, Plesetsk Cosmodrome, Russia                                                    158
LC-39A, Kennedy Space Center, Florida, USA                                                 149
Site 43/3, Plesetsk Cosmodrome, Russia                                                     141
ELA-2, Guiana Space Centre, French Guiana, France                                          118
ELA-3, Guiana Space Ce

> ### Feature Engineering, Country

In [138]:
# List of post-USSR independent countries
independent_states = {
    "Kazakhstan",
    "Ukraine",
    "Belarus",
    "Uzbekistan",
    "Armenia",
    "Azerbaijan",
    "Georgia",
    "Moldova",
    "Kyrgyzstan",
    "Tajikistan",
    "Turkmenistan",
    "Lithuania",
    "Latvia",
    "Estonia"
}

def get_country(location):
    parts = [p.strip() for p in location.split(",")]
    for part in reversed(parts):
        if part in independent_states or part == "Russia":
            return part
    # Default: last non-empty part
    return parts[-1].strip()

# Apply country mapping
def assign_country(row):
    country = get_country(row["Location"])
    
    if row["Date"] <= pd.Timestamp("1991-12-26"):
        if country in independent_states:
            return "USSR"
    return country

# Apply function to create new column
df_cleaned["Country"] = df_cleaned.apply(assign_country, axis=1)

In [139]:
df_cleaned.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,MissionStatus,Country
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,Success,USSR
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,Success,USSR
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,16:44:00,Vanguard,Vanguard TV3,Retired,Failure,USA
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,03:48:00,Juno I,Explorer 1,Retired,Success,USA
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,07:33:00,Vanguard,Vanguard TV3BU,Retired,Failure,USA


In [140]:
filtered_data = df_cleaned[(df_cleaned['Country'] == "Russia") & (df_cleaned['Date'] <= '1991-12-26')]
filtered_data.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,MissionStatus,Country
130,RVSN USSR,"Mayak-2, Kapustin Yar, Russia",1961-10-27,16:30:00,Cosmos-2I (63S1),DS-1 1,Retired,Failure,Russia
140,RVSN USSR,"Mayak-2, Kapustin Yar, Russia",1961-12-21,12:30:00,Cosmos-2I (63S1),DS-1 2,Retired,Failure,Russia
151,RVSN USSR,"Mayak-2, Kapustin Yar, Russia",1962-03-16,11:59:00,Cosmos-2I (63S1),Cosmos 1,Retired,Success,Russia
152,RVSN USSR,"Mayak-2, Kapustin Yar, Russia",1962-04-06,17:15:00,Cosmos-2I (63S1),Cosmos 2,Retired,Success,Russia
156,RVSN USSR,"Mayak-2, Kapustin Yar, Russia",1962-04-24,04:00:00,Cosmos-2I (63S1),Cosmos 3,Retired,Success,Russia


In [141]:
# Pastikan kolom Date dalam format datetime
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], errors='coerce')

# Filter data berdasarkan tanggal setelah bubarnya USSR
start_date = '1991-12-27'
end_date = '2024-12-31'
filtered_df = df_cleaned[(df_cleaned['Date'] >= start_date) & (df_cleaned['Date'] <= end_date)]

# Simpan ke CSV
filtered_df.to_csv('space_with_country.csv', index=False)

In [142]:
# # Optional: save to new CSV
# df_cleaned.to_csv("space_with_country.csv", index=False)

In [143]:
data = pd.read_csv("space_with_country.csv")
data.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,MissionStatus,Country
0,CASC,"LC-3, Xichang Satellite Launch Center, China",1991-12-28,12:00:00,Long March 3,DFH-2A-T5,Retired,Partial Failure,China
1,NASA,"LC-39A, Kennedy Space Center, Florida, USA",1992-01-22,14:52:00,Space Shuttle Discovery,STS-42,Retired,Success,USA
2,RVSN USSR,"Site 43/3, Plesetsk Cosmodrome, Russia",1992-01-24,01:18:00,Molniya-M /Block 2BL,Cosmos 2176,Retired,Success,Russia
3,RVSN USSR,"Site 45/1, Baikonur Cosmodrome, Kazakhstan",1992-02-05,18:14:00,Zenit-2,Tselina-2 nâ Â­10,Retired,Failure,Kazakhstan
4,General Dynamics,"SLC-36A, Cape Canaveral AFS, Florida, USA",1992-02-11,00:41:00,Atlas II,USA-78 (DSCS IIIB-14),Retired,Success,USA


In [144]:
filtered = data[(data['Country'] == "Russia") & (data['Date'] <= '1991-12-26')]
filtered.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,MissionStatus,Country
