In [17]:
import pandas as pd
import json

# Numerical tools, data splitting, metrics, and encoders
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Load the raw Near Earth Object records from the JSON file into a DataFrame.
path = 'resources/neo_data.json'
df_original = pd.read_json(path)

# Summarize the columns, their dtypes, and non-null counts.
df_original.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8218 entries, 0 to 8217
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        8218 non-null   int64  
 1   name                      8218 non-null   object 
 2   absolute_magnitude_h      8218 non-null   float64
 3   est_diameter_min          8218 non-null   float64
 4   est_diameter_max          8218 non-null   float64
 5   relative_velocity         8218 non-null   float64
 6   miss_distance             8218 non-null   float64
 7   orbiting_body             8218 non-null   object 
 8   sentry_object             8218 non-null   int64  
 9   is_potentially_hazardous  8218 non-null   int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 642.2+ KB


In [6]:
# Drop the columns that serve to identify the Near Earth Object ('name',
# 'orbiting_body', 'id'), which are assigned by NASA and not scientifically
# descriptive of the object itself.
#
# drop() returns a new DataFrame, so df_original is left unmodified and this
# cell can be re-run safely.
df_columns_removed = df_original.drop(columns=['name', 'orbiting_body', 'id'])

In [7]:
# Confirm which columns remain after dropping the identifier columns.
df_columns_removed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8218 entries, 0 to 8217
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   absolute_magnitude_h      8218 non-null   float64
 1   est_diameter_min          8218 non-null   float64
 2   est_diameter_max          8218 non-null   float64
 3   relative_velocity         8218 non-null   float64
 4   miss_distance             8218 non-null   float64
 5   sentry_object             8218 non-null   int64  
 6   is_potentially_hazardous  8218 non-null   int64  
dtypes: float64(5), int64(2)
memory usage: 449.5 KB


In [9]:
# Define the features set and drop the target column
X = df_columns_removed.copy()
X.drop('is_potentially_hazardous', axis=1, inplace=True)
X.head()

Unnamed: 0,absolute_magnitude_h,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,sentry_object
0,25.6,0.020163,0.045086,41072.828534,38689550.0,0
1,21.16,0.155796,0.348369,51223.885782,47715840.0,0
2,26.3,0.014607,0.032662,12712.104884,23883790.0,0
3,24.44,0.0344,0.07692,25376.064524,48449170.0,0
4,23.6,0.050647,0.11325,88941.661927,62884750.0,0


In [24]:
# Define the target vector
y = df_columns_removed['is_potentially_hazardous'].copy()
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: is_potentially_hazardous, dtype: int64

In [25]:
# Split the data into training and testing sets
# Split the features and target into training and testing sets.
# random_state is fixed so the split is the same on every run; test_size is
# not passed, so scikit-learn's default split proportion is used.
# NOTE(review): stratify=y is not passed. If the target classes are
# imbalanced, a stratified split may be preferable — confirm class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [26]:
# The features have not yet been tested for normality, so both StandardScaler
# and MinMaxScaler are tried to see which one suits this dataset better.
# Start with StandardScaler.
#
# The scaler is fitted on the training features only, and the same fitted
# scaler is reused for the test set in a later cell.
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)

# NOTE(review): in the displayed output the second and third columns
# (est_diameter_min, est_diameter_max) are identical after scaling, which
# suggests the two features are perfectly correlated — confirm whether both
# are needed.
X_train_scaled

array([[ 1.09778354, -0.47152658, -0.47152658, -0.79797486,  1.65207245,
        -0.245377  ],
       [ 0.43401193, -0.39557784, -0.39557784, -0.39874631,  0.45099066,
        -0.245377  ],
       [ 0.79171108, -0.44413588, -0.44413588, -0.77043553, -0.9726628 ,
        -0.245377  ],
       ...,
       [ 0.67739485, -0.4309042 , -0.4309042 , -0.63553589, -1.12959817,
        -0.245377  ],
       [-0.27401111, -0.20404563, -0.20404563, -0.55904761, -0.55003616,
        -0.245377  ],
       [ 1.06090734, -0.46875375, -0.46875375, -0.81100614,  1.16565367,
        -0.245377  ]])

In [27]:
# Scale the testing dataset
X_test_scaled = standard_scaler.transform(X_test)
X_test_scaled

array([[-1.51305144,  1.00297937,  1.00297937,  1.26557985,  0.83453882,
        -0.245377  ],
       [ 0.37132239, -0.38460183, -0.38460183, -0.90277169, -0.8850526 ,
        -0.245377  ],
       [-0.18182061, -0.23953223, -0.23953223, -0.35660015, -0.19164171,
        -0.245377  ],
       ...,
       [ 0.58889197, -0.41928123, -0.41928123, -1.32144064, -0.51966709,
        -0.245377  ],
       [ 0.03943659, -0.30974506, -0.30974506, -0.73727885,  1.1297422 ,
        -0.245377  ],
       [ 1.13097212, -0.47391526, -0.47391526, -1.24773121, -1.07862912,
        -0.245377  ]])

In [28]:
# Check the max and min of the scaled training and testing sets
print("Scaled data min/max (StandardScaler):")
print("Training data min:",X_train_scaled.min())
print("Training data max:",X_train_scaled.max())
print("Testing data min:",X_test_scaled.min())
print("Testing data max:",X_test_scaled.max())

Scaled data min/max (StandardScaler):
Training data min: -3.6297453248308886
Training data max: 21.029146296594423
Testing data min: -3.4564271840361647
Testing data max: 16.833201185394756


In [29]:
# Try the MinMaxScaler
minmax_scaler = MinMaxScaler().fit(X_train)
X_train_minmax = minmax_scaler.transform(X_train)
X_train_minmax

array([[0.70131291, 0.00250873, 0.00250873, 0.15494274, 0.93560181,
        0.        ],
       [0.60284464, 0.00603226, 0.00603226, 0.2156308 , 0.56394385,
        0.        ],
       [0.6559081 , 0.00377948, 0.00377948, 0.15912909, 0.12341421,
        0.        ],
       ...,
       [0.63894967, 0.00439335, 0.00439335, 0.17963563, 0.07485276,
        0.        ],
       [0.49781182, 0.01491811, 0.01491811, 0.19126287, 0.25419011,
        0.        ],
       [0.69584245, 0.00263737, 0.00263737, 0.15296181, 0.78508632,
        0.        ]])

In [30]:
# Scale the testing dataset with the MinMaxScaler that was fitted on the
# training data (transform only, no re-fitting on X_test).
X_test_minmax = minmax_scaler.transform(X_test)
X_test_minmax

array([[0.31400438, 0.07091621, 0.07091621, 0.46863054, 0.68262746,
        0.        ],
       [0.59354486, 0.00654147, 0.00654147, 0.13901223, 0.15052396,
        0.        ],
       [0.51148796, 0.01327176, 0.01327176, 0.22203758, 0.36509026,
        0.        ],
       ...,
       [0.62582057, 0.00493258, 0.00493258, 0.07536897, 0.2635874 ,
        0.        ],
       [0.54431072, 0.01001434, 0.01001434, 0.16416934, 0.77397402,
        0.        ],
       [0.70623632, 0.00239791, 0.00239791, 0.08657379, 0.09062442,
        0.        ]])

In [31]:
# Check the max and min of the scaled training and testing sets
print("Scaled data min/max (MinMaxScaler):")
print("Training data min:",X_train_minmax.min())
print("Training data max:",X_train_minmax.max())
print("Testing data min:",X_test_minmax.min())
print("Testing data max:",X_test_minmax.max())

Scaled data min/max (MinMaxScaler):
Training data min: 0.0
Training data max: 1.0000000000000002
Testing data min: 0.0
Testing data max: 1.0107488564339902


In [32]:
# Train a support vector classifier with a linear kernel on the
# StandardScaler-transformed training features.
svc_standard = SVC(kernel='linear').fit(X_train_scaled, y_train)

# Display the fitted estimator.
svc_standard

In [34]:
# Validate the model by comparing its accuracy (model.score) on the training
# and testing sets; a large gap between the two would indicate overfitting.
# NOTE(review): plain accuracy can be misleading if the target classes are
# imbalanced — confirm class balance and consider balanced_accuracy_score.
train_accuracy = svc_standard.score(X_train_scaled, y_train)
test_accuracy = svc_standard.score(X_test_scaled, y_test)

print('Train Accuracy: %.3f' % train_accuracy)
print('Test Accuracy: %.3f' % test_accuracy)

Train Accuracy: 0.935
Test Accuracy: 0.927
