In [1]:
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the near-Earth-object records from the JSON resource file.
raw_records = pd.read_json("../Resources/neo_data.json")

# Key the rows by object name, then drop every repeated name so that only the
# first row seen for each name survives.
records_by_name = raw_records.set_index("name")
is_repeated_name = records_by_name.index.duplicated(keep="first")
neo_data = records_by_name[~is_repeated_name]

neo_data.head()

Unnamed: 0_level_0,id,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,is_potentially_hazardous,close_approach_date,relative_velocity_kph,miss_distance_km,orbiting_body
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
(2011 HS60),3564720,21.29,0.146742,0.328125,False,2020-01-01,63987.979663,29751750.0,Earth
(2011 YE40),3591759,25.2,0.024241,0.054205,False,2020-01-01,46009.033071,9249996.0,Earth
(2013 EC20),3630817,29.0,0.004213,0.00942,False,2020-01-01,10057.324955,24237650.0,Earth
(2016 EF195),3747497,25.5,0.021113,0.047211,False,2020-01-01,63174.405279,41337820.0,Earth
(2019 WE5),3893737,23.3,0.058151,0.130029,False,2020-01-01,18010.17068,20135490.0,Earth


In [3]:
# Columns fed to the clustering; each is standardized independently by StandardScaler.
feature_columns = [
    "absolute_magnitude_h",
    "estimated_diameter_km_min",
    "estimated_diameter_km_max",
    "relative_velocity_kph",
]

# fit_transform returns a plain NumPy array (row order follows neo_data).
# NOTE(review): in the displayed rows the two diameter columns come out identical
# after scaling — confirm whether both are needed as separate features.
scaler = StandardScaler()
neo_data_scaled = scaler.fit_transform(neo_data[feature_columns])

neo_data_scaled[:5]

array([[-0.99931038,  0.22693256,  0.22693256,  0.73259666],
       [ 0.4271559 , -0.32407998, -0.32407998,  0.00950302],
       [ 1.81349142, -0.41416917, -0.41416917, -1.43643549],
       [ 0.53660344, -0.33814982, -0.33814982,  0.69987558],
       [-0.26601186, -0.17155432, -0.17155432, -1.1165807 ]])

In [None]:
# Wrap the scaled array back into a labelled frame: restore the feature column
# names, attach the object names from neo_data's index as a "Name" column, and
# promote that column to the index.
df_neo_scaled = (
    pd.DataFrame(
        neo_data_scaled,
        columns=[
            "absolute_magnitude_h",
            "estimated_diameter_km_min",
            "estimated_diameter_km_max",
            "relative_velocity_kph",
        ],
    )
    .assign(Name=neo_data.index)
    .set_index("Name")
)

df_neo_scaled.head()

Unnamed: 0_level_0,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,relative_velocity_kph
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(2011 HS60),-0.99931,0.226933,0.226933,0.732597
(2011 YE40),0.427156,-0.32408,-0.32408,0.009503
(2013 EC20),1.813491,-0.414169,-0.414169,-1.436435
(2016 EF195),0.536603,-0.33815,-0.33815,0.699876
(2019 WE5),-0.266012,-0.171554,-0.171554,-1.116581


In [5]:
# Candidate cluster counts 1..10 for the elbow analysis.
k = list(range(1, 11))

# Fit one seeded KMeans per candidate count and record its inertia
# (fit() returns the fitted estimator, so the attribute can be read directly).
inertia = [
    KMeans(n_clusters=cluster_count, random_state=0).fit(df_neo_scaled).inertia_
    for cluster_count in k
]

In [6]:
# Pair each candidate cluster count with the inertia recorded for it.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)

# Line chart of inertia against k, with one x tick per candidate value.
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Chart", xticks=k)

In [7]:
# Fit the final clustering on the standardized features.
# NOTE(review): n_clusters = 5 is presumably read off the elbow chart — confirm.
# random_state is pinned (same seed as the elbow loop) so the cluster numbering
# and assignments are reproducible after Restart Kernel -> Run All; without a
# seed, KMeans initialization is random and results can change on every run.
model = KMeans(n_clusters = 5, random_state = 0)

model.fit(df_neo_scaled)

# Label every row with the index of its closest fitted cluster centre.
neo_clusters = model.predict(df_neo_scaled)

In [None]:
# New frame: the scaled features plus a "cluster" column holding the KMeans label
# for each row. assign() returns a copy, so df_neo_scaled itself is left untouched.
df_neo_predictions = df_neo_scaled.assign(cluster=neo_clusters)

df_neo_predictions.head()

Unnamed: 0_level_0,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,relative_velocity_kph,cluster
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(2011 HS60),-0.99931,0.226933,0.226933,0.732597,4
(2011 YE40),0.427156,-0.32408,-0.32408,0.009503,0
(2013 EC20),1.813491,-0.414169,-0.414169,-1.436435,0
(2016 EF195),0.536603,-0.33815,-0.33815,0.699876,0
(2019 WE5),-0.266012,-0.171554,-0.171554,-1.116581,3


In [9]:
# Scatter of scaled minimum diameter against scaled relative velocity, one
# colour group per cluster label, with the object name shown on hover.
scatter_options = {
    "x": "estimated_diameter_km_min",
    "y": "relative_velocity_kph",
    "by": "cluster",
    "hover_cols": "Name",
}

df_neo_predictions.hvplot.scatter(**scatter_options)

In [None]:
# Short alias -> source column in neo_data. The unscaled values are appended
# next to the scaled features, in this order.
raw_value_columns = {
    "mag": "absolute_magnitude_h",
    "dm_min": "estimated_diameter_km_min",
    "dm_max": "estimated_diameter_km_max",
    "vel": "relative_velocity_kph",
    "miss": "miss_distance_km",
}

# Assigning a Series aligns on index labels, so each object name in
# df_neo_predictions picks up the matching row's value from neo_data.
for alias, source_column in raw_value_columns.items():
    df_neo_predictions[alias] = neo_data[source_column]

df_neo_predictions.head()

Unnamed: 0_level_0,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,relative_velocity_kph,cluster,mag,dm_min,dm_max,vel,miss
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
(2011 HS60),-0.99931,0.226933,0.226933,0.732597,4,21.29,0.146742,0.328125,63987.979663,29751750.0
(2011 YE40),0.427156,-0.32408,-0.32408,0.009503,0,25.2,0.024241,0.054205,46009.033071,9249996.0
(2013 EC20),1.813491,-0.414169,-0.414169,-1.436435,0,29.0,0.004213,0.00942,10057.324955,24237650.0
(2016 EF195),0.536603,-0.33815,-0.33815,0.699876,0,25.5,0.021113,0.047211,63174.405279,41337820.0
(2019 WE5),-0.266012,-0.171554,-0.171554,-1.116581,3,23.3,0.058151,0.130029,18010.17068,20135490.0


In [12]:
# Write the predictions frame (index included) to a CSV file in the working directory.
df_neo_predictions.to_csv(path_or_buf="neo_predictions.csv")