In [1]:
import pandas as pd
import json

In [2]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Create a DataFrame from the JSON file
path = 'resources/neo_data.json'

df = pd.read_json(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8218 entries, 0 to 8217
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        8218 non-null   int64  
 1   name                      8218 non-null   object 
 2   absolute_magnitude_h      8218 non-null   float64
 3   est_diameter_min          8218 non-null   float64
 4   est_diameter_max          8218 non-null   float64
 5   relative_velocity         8218 non-null   float64
 6   miss_distance             8218 non-null   float64
 7   orbiting_body             8218 non-null   object 
 8   sentry_object             8218 non-null   int64  
 9   is_potentially_hazardous  8218 non-null   int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 642.2+ KB


In [4]:
df.head(5)


Unnamed: 0,id,name,absolute_magnitude_h,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,is_potentially_hazardous
0,2478784,478784 (2012 UV136),25.6,0.020163,0.045086,41072.828534,38689550.0,Earth,0,0
1,3092479,(2001 VH5),21.16,0.155796,0.348369,51223.885782,47715840.0,Earth,0,0
2,3472549,(2009 UK20),26.3,0.014607,0.032662,12712.104884,23883790.0,Earth,0,0
3,3827311,(2018 RY1),24.44,0.0344,0.07692,25376.064524,48449170.0,Earth,0,0
4,3835821,(2018 UA3),23.6,0.050647,0.11325,88941.661927,62884750.0,Earth,0,0


In [10]:
# Convert object columns to string
df = df.astype(str)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8218 entries, 0 to 8217
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        8218 non-null   object
 1   absolute_magnitude_h      8218 non-null   object
 2   est_diameter_min          8218 non-null   object
 3   est_diameter_max          8218 non-null   object
 4   relative_velocity         8218 non-null   object
 5   miss_distance             8218 non-null   object
 6   orbiting_body             8218 non-null   object
 7   sentry_object             8218 non-null   object
 8   is_potentially_hazardous  8218 non-null   object
dtypes: object(9)
memory usage: 578.0+ KB


In [14]:
# Drop the label to creat the X data
X = df.drop(columns='orbiting_body')
X.head()
X


Unnamed: 0,id,absolute_magnitude_h,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,sentry_object,is_potentially_hazardous
0,2478784,25.6,0.0201629919,0.0450858206,41072.8285341367,38689550.52261779,0,0
1,3092479,21.16,0.1557955241,0.3483693825,51223.8857821999,47715839.82277051,0,0
2,3472549,26.3,0.0146067964,0.0326617897,12712.1048840399,23883789.100615326,0,0
3,3827311,24.44,0.0343997255,0.0769201245,25376.0645244742,48449174.47566659,0,0
4,3835821,23.6,0.0506471459,0.1132504611,88941.6619267972,62884754.50229493,0,0
...,...,...,...,...,...,...,...,...
8213,54358910,25.12,0.0251509837,0.0562393094,57872.0543866938,2300614.166102933,0,0
8214,54360087,28.23,0.0060055803000000005,0.0134288857,40056.875728794,476299.183010958,0,0
8215,54373237,23.68,0.048815189200000005,0.1091540813,69360.5244234438,9934365.05121523,0,0
8216,54375413,20.48,0.2130860292,0.4764748465,52310.3878303614,39802176.168623984,0,1


In [23]:
# Creat the y set from the "is_potentially_hazardous" column
y = df['is_potentially_hazardous']
y.head()
y


0       0
1       0
2       0
3       0
4       0
       ..
8213    0
8214    0
8215    0
8216    1
8217    0
Name: is_potentially_hazardous, Length: 8218, dtype: object

In [24]:
# Split the data into training and testing sets using random_state=42       
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [25]:
# Scale the x data by using StandardScaler ()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled 

array([[-1.32513718, -1.25171334,  0.58861383, ...,  0.9163374 ,
        -0.24238975, -0.27180962],
       [ 0.77891767,  0.82882888, -0.45134329, ..., -1.29398422,
        -0.24238975, -0.27180962],
       [ 0.76972862, -0.03347698, -0.29048218, ...,  0.75787679,
        -0.24238975, -0.27180962],
       ...,
       [-1.26967722, -1.39115003,  0.80326364, ...,  0.01338941,
        -0.24238975, -0.27180962],
       [-1.26984317,  1.17742062, -0.48055618, ..., -0.78338735,
        -0.24238975, -0.27180962],
       [-1.27100959,  0.06926585, -0.31991635, ..., -0.3335471 ,
        -0.24238975, -0.27180962]])

Model and Fit to a Logistic Regression Classifier

In [26]:
# Create a logistic regression modelclassifier model with a random state of 42
model = LogisticRegression(random_state=42, max_iter=1000)

# Fit the model to the scaled training data
model.fit(X_train_scaled, y_train)


In [28]:
# Validate the model  by checking the model accuracy with model.score
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")



Training Data Score: 1.0
Testing Data Score: 1.0


Model and Fit to a Support Vector Machine

In [29]:
# Create the support vector machine model with a "rbf" kernel
svm = SVC(kernel='rbf')
svm.fit(X_train_scaled, y_train)


In [30]:
# Validate the model by checking the model accuracy with model.score
print(f"Training Data Score: {svm.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {svm.score(X_test_scaled, y_test)}")


Training Data Score: 1.0
Testing Data Score: 1.0


Model and Fit to a KNN Model

In [31]:
# Create the KNN model with a k of 3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)


In [32]:
# Validate the model by checking the model accuracy with model.score
print(f"Training Data Score: {knn.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {knn.score(X_test_scaled, y_test)}")


Training Data Score: 1.0
Testing Data Score: 1.0


Model and Fit to a Decision Tree Classifier

In [33]:
# Create a decision tree model
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)


In [34]:
# Validate the model by checking the model accuracy with model.score
print(f"Training Data Score: {dtree.score(X_train, y_train)}")
print(f"Testing Data Score: {dtree.score(X_test, y_test)}")


Training Data Score: 1.0
Testing Data Score: 1.0


In [35]:
# Validate the model by checking the model accuracy with model.score
print(f"Training Data Score: {dtree.score(X_train, y_train)}")
print(f"Testing Data Score: {dtree.score(X_test, y_test)}")


Training Data Score: 1.0
Testing Data Score: 1.0
