# Classifying Celestial Objects
Based on information from the dataset neo_v2 from https://www.kaggle.com/datasets/sameepvani/nasa-nearest-earth-objects, this neural network classifies whether asteroids may pose a danger to earth based on a variety of features. 

### Imports

In [1]:
import tensorflow as tf 
from tensorflow import keras 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
import io

### Data preprocessing
The neo_v2 csv file is imported from the REHS github repo. Then the program splits the dataset 80/20 into training data and testing data respectively

In [10]:
csv_link = 'https://raw.githubusercontent.com/sdsc-hpc-students/REHS2022/main/Final-Project/Extra%20KGs/celesital-bodies/neo_v2.csv'
csv_file = requests.get(csv_link).content

In [11]:
dataset = pd.read_csv(io.StringIO(csv_file.decode('utf-8')))

In [12]:
dataset.head()

Unnamed: 0,404: Not Found


In [5]:
len(dataset)

0

convert boolean values to floats

In [6]:
print(dataset.dtypes.tolist())
print(type(dataset.dtypes.tolist()[0]))

change_to_float = {column:float for column, dtype in zip(dataset.columns.tolist(), dataset.dtypes.tolist()) if dtype == np.bool_}
dataset = dataset.astype(change_to_float)
dataset.head()

[dtype('O')]
<class 'numpy.dtype[object_]'>


Unnamed: 0,404: Not Found


get rid of orbiting body and sentry_object

In [7]:
dataset.drop('sentry_object', axis=1, inplace=True)

KeyError: "['sentry_object'] not found in axis"

In [None]:
dataset.drop('orbiting_body', axis=1, inplace=True)
dataset

KeyError: "['orbiting_body'] not found in axis"

In [None]:
columnns = list(dataset.columns)
columnns

['id',
 'name',
 'est_diameter_min',
 'est_diameter_max',
 'relative_velocity',
 'miss_distance',
 'absolute_magnitude',
 'hazardous']

scale everything into a range of between 0 and 1

In [None]:
dataset_raw = dataset

In [None]:
for column in dataset.columns.tolist()[2:7]:
    dataset[column] = dataset[column].map(lambda x: x / float(dataset[column].idxmax()))

columnize the data

In [None]:
columnized_data = {column:dataset[column].tolist() for column in list(dataset.columns)}
columnized_data

{'id': [2162635,
  2277475,
  2512244,
  3596030,
  3667127,
  54138696,
  54189957,
  54230078,
  2088213,
  3766065,
  54049873,
  54099949,
  54104555,
  54235433,
  2198752,
  3069224,
  3739154,
  3795026,
  3797456,
  3825138,
  3835974,
  3842597,
  54053939,
  2506491,
  3329370,
  3623582,
  3768024,
  3781344,
  54076362,
  54105263,
  54232321,
  2007480,
  2020425,
  2337345,
  3608427,
  3654336,
  54016465,
  54235525,
  2002100,
  2003362,
  2004688,
  2252558,
  2452334,
  3067492,
  3313974,
  3379718,
  3463175,
  3572706,
  3731680,
  3740825,
  3986682,
  2085182,
  3395956,
  3624232,
  3794971,
  54158069,
  3557536,
  3656918,
  54057195,
  54137509,
  2503880,
  3172329,
  3740044,
  3776011,
  54017304,
  54091635,
  2086450,
  2162854,
  2425755,
  3605793,
  3610526,
  3625335,
  3653190,
  3685814,
  3753287,
  54248008,
  2026663,
  2138911,
  2153219,
  2162117,
  2230111,
  3656413,
  3673866,
  3699442,
  54017323,
  54101674,
  54104542,
  54264233,
  2

In [8]:
id_and_names = list(zip(columnized_data['id'], columnized_data['name']))
id_and_names

NameError: name 'columnized_data' is not defined

In [109]:
classifications = dataset.hazardous.tolist()

In [110]:
features = list(zip(columnized_data['est_diameter_min'],
                    columnized_data['est_diameter_max'], 
                    columnized_data['relative_velocity'], 
                    columnized_data['miss_distance'], 
                    columnized_data['absolute_magnitude']))
features = list(map(lambda x: np.asarray(x), features))
features[0]

array([9.03488842e-10, 2.02026247e-09, 4.15468927e-06, 1.32433640e-02,
       6.70567959e-04])

In [111]:
threshold = int(len(features) * 0.8)

train_ids, train_features, train_classifications = id_and_names[:threshold], np.asarray(features[:threshold]), np.asarray(classifications[:threshold]) # 80% of data go to training
test_ids, test_features, test_classifications = id_and_names[threshold:], np.asarray(features[threshold:]), np.asarray(classifications[threshold:])

print(f"{test_ids[0]}, {test_features[0]}, {test_classifications[0]}")
print(threshold)


(3843111, '(2019 MY3)'), [2.00411571e-11 4.48133897e-11 7.37695631e-06 1.86304352e-03
 1.00204417e-03], 0.0
72668


## instantiate and compile the model

In [2]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(5,), activation='sigmoid', kernel_regularizer="l2"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(8, activation='tanh', kernel_regularizer='l2'),
    keras.layers.Dense(8, activation='relu', kernel_regularizer='l2'),
    keras.layers.Dense(6, activation='sigmoid', kernel_regularizer='l2'),
    keras.layers.Dense(4, activation='tanh', kernel_regularizer='l2'),
    keras.layers.Dense(1, activation='sigmoid', kernel_regularizer='l2')
])

NameError: name 'keras' is not defined

In [129]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=["binary_accuracy"])

## train the model

In [130]:
model.fit(train_features, train_classifications, epochs=20)

Epoch 1/20


In [126]:
test_loss, test_acc = model.evaluate(test_features, test_classifications, verbose=1)



In [127]:
test_loss

0.2651752531528473