In [None]:
import pandas as pd

In [None]:
# Load the raw UCS satellite database export into the working frame.
raw_csv_path = '../data/original/UCS-Satellite-Database 5-1-2023.csv'
ucs_sats_messy = pd.read_csv(raw_csv_path)

# Preview the first ten rows.
ucs_sats_messy.head(10)

In [None]:
# (row count, column count) of the frame as loaded, before any cleaning.
ucs_sats_messy.shape

In [None]:
# Per-column dtype and non-null counts of the frame as loaded.
ucs_sats_messy.info()

Strip leading/trailing whitespace from the object-dtype (text) columns using a lambda. I moved this closer to the top because I want it done before any further cleaning.

In [None]:
# Distinct values of the Users column at this point in the notebook.
user_values = ucs_sats_messy['Users'].unique()
print(f"Users: \n\r{user_values}\n\r")

In [None]:
# Every object-dtype column is treated as text and has its values trimmed of
# leading/trailing whitespace, one column at a time.
text_cols = ucs_sats_messy.select_dtypes(['object']).columns
for text_col in text_cols:
    ucs_sats_messy[text_col] = ucs_sats_messy[text_col].str.strip()

We need to drop useless columns (unnamed, etc.). Start with the unnamed columns, which are almost all empty.  I assume these are placeholders for future data, but they are useless to us at this point. The same cell also drops the dry mass and power columns.

In [None]:
# The filler columns are 'Unnamed: 28' plus 'Unnamed: 37' through 'Unnamed: 67'.
unnamed_cols = ['Unnamed: 28'] + [f'Unnamed: {n}' for n in range(37, 68)]

# Dry mass and power are dropped together with the filler columns.
# Note the header ' Dry Mass (kg.) ' carries surrounding spaces in the raw file.
unused_cols = unnamed_cols + [' Dry Mass (kg.) ', 'Power (watts)']

ucs_sats_messy.drop(columns=unused_cols, inplace=True)

In [None]:
# Snapshot of the frame after the column drop: null counts, dtypes, column names.
null_counts = ucs_sats_messy.isnull().sum()
column_dtypes = ucs_sats_messy.dtypes
column_names = ucs_sats_messy.columns

print(f"IsNull: \n\r{null_counts}\n\r")
print(f"Dtypes: \n\r{column_dtypes}\n\r")
print(f"Columns: \n\r{column_names}\n\r")

Need to clean up Perigee and Apogee. Strip ',' and convert the object dtype to a numeric dtype (ends up being float64). Make sure we drop NaN and invalid rows.  There appears to be at least 1 row that has an invalid apogee of less than 100 km (not possible, given that all satellite perigees are greater than 150 km and a satellite's apogee cannot be lower than its perigee).

In [None]:
# Perigee/Apogee arrive as text containing ',' separators: strip the commas,
# then convert to numbers. Anything that still fails to parse becomes NaN.
altitude_cols = ['Perigee (km)', 'Apogee (km)']
for altitude_col in altitude_cols:
    altitude_text = ucs_sats_messy[altitude_col].astype(str).str.replace(',', '', regex=False)
    ucs_sats_messy[altitude_col] = pd.to_numeric(altitude_text, errors='coerce')

# Rows without a usable perigee or apogee are removed.
ucs_sats_messy.dropna(subset=altitude_cols, inplace=True)

# An apogee below the perigee is invalid, so keep only rows where apogee >= perigee.
# .copy() makes the filtered result an independent frame; without it, the column
# assignments in later cells write to a slice and trigger SettingWithCopyWarning.
valid_orbit = ucs_sats_messy['Apogee (km)'] >= ucs_sats_messy['Perigee (km)']
ucs_sats_messy = ucs_sats_messy[valid_orbit].copy()

Government/Commercial may 'seem' to be the same thing as Commercial/Government, but it is not.  The order of the listing matters. Duplicates in the original data that were caused by leading/trailing whitespace have already been cleaned up.

Primary Users/Secondary Users/Tertiary Users

In [None]:
# Series.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print(f"...") would add a stray "None" line.
ucs_sats_messy['Users'].info()

In [None]:
# Distinct values of the Users column at this point in the notebook.
user_values = ucs_sats_messy['Users'].unique()
print(f"Users: \n\r{user_values}\n\r")

In [None]:
# Rows whose Users value is exactly 'Commercial' (combined values such as
# 'Commercial/Government' do not match).
is_commercial = ucs_sats_messy['Users'].eq('Commercial')
ucs_sats_messy.loc[is_commercial]

I would like to drop the source columns from the main CSV, but I want to keep a usable copy of this data in case I need it in the future. Output the source data to a new CSV, with the NORAD number added as a primary key for later comparison/cross-referencing.

In [None]:
# Provenance columns: the orbital-data source, 'Source', 'Source.1' through
# 'Source.6', and the free-text comments.
source_cols = (
    ['Source Used for Orbital Data', 'Source']
    + [f'Source.{n}' for n in range(1, 7)]
    + ['Comments']
)
sources = ucs_sats_messy[source_cols]

sources

In [None]:
norad_data = ucs_sats_messy['NORAD Number']
purpose_data = ucs_sats_messy['Detailed Purpose']

# NORAD Number becomes the first column (the lookup key). Detailed Purpose goes
# in at position 10, which is the end of the frame after the first insert.
sources.insert(loc=0, column='NORAD Number', value=norad_data)
sources.insert(loc=10, column='Detailed Purpose', value=purpose_data)

# Write the side table ordered by NORAD Number, without the pandas index.
sources = sources.sort_values(by='NORAD Number')
sources.to_csv('./../data/clean/ucs_dropped.csv', index=False)

In [None]:
# These columns now live in `sources`, so remove them from the main frame.
exported_cols = (
    ['Source Used for Orbital Data', 'Source']
    + [f'Source.{n}' for n in range(1, 7)]
    + ['Comments', 'Detailed Purpose']
)
ucs_sats_messy.drop(columns=exported_cols, inplace=True)

# Summary of the side table.
sources.info()

In [None]:
# Preview the main frame after the source columns were removed.
ucs_sats_messy.head()

In [None]:
# Column names remaining in the main frame.
ucs_sats_messy.columns

Columns of Importance - Orbital Clutter
  Location (Apogee, Perigee, Inclination),
  Orbit Class (Class of Orbit, Type of Orbit),
  Time (Date of Launch, Expected Lifetime, Mass),
  Ownership( Country of Operator, Operator, Users)


Data: Compare LEO/GEO/MEO

In [None]:
# Distinct 'Class of Orbit' labels before case normalisation.
ucs_sats_messy['Class of Orbit'].unique()

In [None]:
# Upper-case the orbit class so labels differing only by case collapse together.
orbit_class_upper = ucs_sats_messy['Class of Orbit'].str.upper()
ucs_sats_messy['Class of Orbit'] = orbit_class_upper

# Distinct labels after normalisation.
ucs_sats_messy['Class of Orbit'].unique()

In [None]:
# Satellites whose orbit class is ELLIPTICAL.
is_elliptical = ucs_sats_messy['Class of Orbit'].eq('ELLIPTICAL')
heo = ucs_sats_messy.loc[is_elliptical]

# NOTE(review): DataFrame.value_counts() counts distinct *whole rows* and by
# default skips rows containing any NaN - confirm this is the intended summary
# rather than a per-column count.
heo.value_counts()

In [None]:
# Parse launch dates; values that cannot be parsed become NaT instead of raising.
launch_dates = pd.to_datetime(ucs_sats_messy['Date of Launch'], errors='coerce')
ucs_sats_messy['Date of Launch'] = launch_dates

# Number of rows left without a usable launch date.
print(launch_dates.isnull().sum())

In [None]:
# Rows whose launch date is missing after parsing, for manual inspection.
missing_launch_date = ucs_sats_messy['Date of Launch'].isna()
null_dates = ucs_sats_messy.loc[missing_launch_date]
null_dates

In [None]:
# Distinct raw launch-mass values, inspected before numeric conversion.
ucs_sats_messy['Launch Mass (kg.)'].unique()

In [None]:
# Clean launch mass the same way as Perigee/Apogee: remove ',' separators before
# converting. Without this, a comma-formatted value would be coerced to NaN and
# then overwritten by the per-orbit-class median in the imputation step.
launch_mass_text = ucs_sats_messy['Launch Mass (kg.)'].astype(str).str.replace(',', '', regex=False)
ucs_sats_messy['Launch Mass (kg.)'] = pd.to_numeric(launch_mass_text, errors='coerce')

# How many launch masses are still missing (True) versus present (False).
ucs_sats_messy['Launch Mass (kg.)'].isnull().value_counts()

In [None]:
mass_col = 'Launch Mass (kg.)'

# Median launch mass of each row's orbit class, aligned to the frame's index.
medians = ucs_sats_messy.groupby('Class of Orbit')[mass_col].transform('median')

# Fill only the missing masses with their class median; known masses are kept.
ucs_sats_messy[mass_col] = ucs_sats_messy[mass_col].fillna(medians)

# Remaining missing (True) versus present (False) counts after imputation.
ucs_sats_messy[mass_col].isnull().value_counts()

In [None]:
# Frequency of each launch-mass value after imputation.
ucs_sats_messy['Launch Mass (kg.)'].value_counts()

Save the cleaned data to a new csv for use after cleanup.

In [None]:
# Persist the cleaned frame, without the pandas index, for downstream notebooks.
cleaned_csv_path = './../data/clean/ucs_cleaned.csv'
ucs_sats_messy.to_csv(cleaned_csv_path, index=False)