In [1]:
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the near-Earth-object records from the JSON export into a DataFrame.
# NOTE(review): the path is anchored at the filesystem root ("/Resources/...");
# confirm this is intended rather than a path relative to the notebook.
neo_data = pd.read_json(path_or_buf="/Resources/neo_data.json")

# Show the first five rows as a quick sanity check of the parsed columns.
neo_data.head(n=5)

Unnamed: 0,id,name,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,is_potentially_hazardous,close_approach_date,relative_velocity_kph,miss_distance_km,orbiting_body
0,3564720,(2011 HS60),21.29,0.146742,0.328125,False,2020-01-01,63987.979663,29751750.0,Earth
1,3591759,(2011 YE40),25.2,0.024241,0.054205,False,2020-01-01,46009.033071,9249996.0,Earth
2,3630817,(2013 EC20),29.0,0.004213,0.00942,False,2020-01-01,10057.324955,24237650.0,Earth
3,3747497,(2016 EF195),25.5,0.021113,0.047211,False,2020-01-01,63174.405279,41337820.0,Earth
4,3893737,(2019 WE5),23.3,0.058151,0.130029,False,2020-01-01,18010.17068,20135490.0,Earth


In [4]:
# Standardise the four numeric features so each column contributes on a
# comparable scale; the result is a NumPy array without column labels.
scaler = StandardScaler()
neo_data_scaled = scaler.fit_transform(
    neo_data[
        [
            "absolute_magnitude_h",
            "estimated_diameter_km_min",
            "estimated_diameter_km_max",
            "relative_velocity_kph",
        ]
    ]
)

# Peek at the first five scaled rows.
neo_data_scaled[:5]

array([[-1.12303720e+00,  3.23032126e-01,  3.23032126e-01,
         7.40956576e-01],
       [ 3.28865141e-01, -2.97765765e-01, -2.97765765e-01,
         1.72982784e-03],
       [ 1.73992112e+00, -3.99264695e-01, -3.99264695e-01,
        -1.47646936e+00],
       [ 4.40264297e-01, -3.13617546e-01, -3.13617546e-01,
         7.07505453e-01],
       [-3.76662849e-01, -1.25922843e-01, -1.25922843e-01,
        -1.14947822e+00]])

In [5]:
# Wrap the scaled array back into a labelled DataFrame, attach each object's
# name (aligned on the row index of neo_data), and use it as the index.
df_neo_scaled = (
    pd.DataFrame(
        neo_data_scaled,
        columns=[
            "absolute_magnitude_h",
            "estimated_diameter_km_min",
            "estimated_diameter_km_max",
            "relative_velocity_kph",
        ],
    )
    .assign(Name=neo_data["name"])
    .set_index("Name")
)

df_neo_scaled.head()

Unnamed: 0_level_0,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,relative_velocity_kph
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(2011 HS60),-1.123037,0.323032,0.323032,0.740957
(2011 YE40),0.328865,-0.297766,-0.297766,0.00173
(2013 EC20),1.739921,-0.399265,-0.399265,-1.476469
(2016 EF195),0.440264,-0.313618,-0.313618,0.707505
(2019 WE5),-0.376663,-0.125923,-0.125923,-1.149478


In [6]:
# Candidate cluster counts for the elbow search: 1 through 10.
k = list(range(1, 11))

# Fit one seeded K-Means model per candidate and record its inertia.
# KMeans.fit returns the fitted estimator, so the attribute can be read inline.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(df_neo_scaled).inertia_
    for n_clusters in k
]

In [7]:
# Pair every candidate k with the inertia measured for it.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)

# Line chart of inertia against k, with one tick per candidate value.
df_elbow.hvplot(
    kind="line",
    x="k",
    y="inertia",
    title="Elbow Chart",
    xticks=k,
)

In [8]:
# Final K-Means model with five clusters (presumably the value read off the
# elbow chart above -- confirm).
# random_state is pinned to the same seed the elbow-search loop uses; without
# it the centroid initialisation is random, so cluster labels (and everything
# derived from them: the table, the scatter plot, the exported CSV) could
# change on every Restart & Run All.
model = KMeans(n_clusters=5, random_state=0)

# Fit the model on the scaled features and obtain the cluster label assigned
# to each row in a single call; `model` is left fitted for later use.
neo_clusters = model.fit_predict(df_neo_scaled)

In [9]:
# Build the prediction frame: scaled features plus the assigned cluster label,
# keeping only the first row for any name that appears more than once in the
# index. `assign` returns a new frame, so df_neo_scaled is left unmodified.
df_neo_predictions = (
    df_neo_scaled
    .assign(cluster=neo_clusters)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)

df_neo_predictions.head()

Unnamed: 0_level_0,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,relative_velocity_kph,cluster
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(2011 HS60),-1.123037,0.323032,0.323032,0.740957,4
(2011 YE40),0.328865,-0.297766,-0.297766,0.00173,1
(2013 EC20),1.739921,-0.399265,-0.399265,-1.476469,1
(2016 EF195),0.440264,-0.313618,-0.313618,0.707505,0
(2019 WE5),-0.376663,-0.125923,-0.125923,-1.149478,1


In [10]:
# Scatter of scaled minimum diameter against scaled relative velocity, one
# colour per cluster, with the object name requested as a hover column.
df_neo_predictions.hvplot(
    kind="scatter",
    x="estimated_diameter_km_min",
    y="relative_velocity_kph",
    by="cluster",
    hover_cols="Name",
)

In [11]:
# Persist the clustered frame (index included) to a CSV file in the working directory.
df_neo_predictions.to_csv(path_or_buf="neo_predictions.csv")

In [12]:
# Disabled alternative export (not executed): the three commented-out
# statements below would copy the index into a "name" column, reset the index,
# and write the frame to "neows_clusters.json".
# df_neo_predictions["name"] = df_neo_predictions.index

# df_neo_predictions = df_neo_predictions.reset_index()

# df_json = df_neo_predictions.to_json("neows_clusters.json")

# Display the first rows of the prediction frame.
df_neo_predictions.head()

Unnamed: 0_level_0,absolute_magnitude_h,estimated_diameter_km_min,estimated_diameter_km_max,relative_velocity_kph,cluster
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(2011 HS60),-1.123037,0.323032,0.323032,0.740957,4
(2011 YE40),0.328865,-0.297766,-0.297766,0.00173,1
(2013 EC20),1.739921,-0.399265,-0.399265,-1.476469,1
(2016 EF195),0.440264,-0.313618,-0.313618,0.707505,0
(2019 WE5),-0.376663,-0.125923,-0.125923,-1.149478,1
