# PREDICTING ASTEROID DIAMETER

>by Dr Juan H Klopper

- Research Fellow
- School for Data Science and Computational Thinking
- Stellenbosch University

## PACKAGES USED IN THIS NOTEBOOK

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.metrics import r2_score

In [None]:
import plotly.graph_objects as go
import plotly.io as pio

# Make 'plotly_white' the default template for figures created after this point
pio.templates.default = 'plotly_white'

In [None]:
!pip install tensorflow_decision_forests --upgrade -q

## DATA IMPORT

| VARIABLE NAME  | DESCRIPTION                                      |
| :--------------| :------------------------------------------------|
| a              | semi-major axis [au]                             |
| e              | eccentricity                                     |
| i              | inclination wrt x-y ecliptic plane [deg]         |
| om             | longitude of the ascending node                  |
| w              | argument of perihelion                           |
| q              | perihelion distance [au]                         |
| ad             | aphelion distance [au]                           |
| per_y          | orbital period [years]                           |
| data_arc       | data arc-span [d]                                |
| condition_code | orbit condition code                             |
| n_obs_use      | number of observations used                      |
| H              | absolute magnitude parameter                     |
| diameter       | diameter of asteroid [km]                        |
| extent         | object bi or tri-axial ellipsoid dimensions [km] |
| albedo         | geometric albedo                                 |
| rot_per        | rotation period [h]                              |
| GM             | standard gravitational parameter [$m \times G$]  |
| BV             | color index B-V magnitude difference             |
| UB             | color index U-B magnitude difference             |
| IR             | color index I-R magnitude difference             |
| spec_B         | spectral taxonomic type (SMASSII)                |
| spec_T         | spectral taxonomic type (Tholen)                 |
| neo            | near earth object                                |
| pha            | potentially hazardous asteroid                   |
| moid           | earth minimum orbit intersection distance [au]   |

In [None]:
# Load the asteroid table from the Kaggle input directory.
# low_memory=False parses the file in one pass so each column gets a single inferred dtype.
df = pd.read_csv('/kaggle/input/prediction-of-asteroid-diameter/Asteroid.csv', low_memory=False)

In [None]:
# Column names, dtypes and non-null counts of the table as loaded
df.info()

In [None]:
# Keep only the rows that have a diameter value (the quantity being predicted)
df = df.dropna(subset=['diameter'])

In [None]:
# Re-check row count and per-column non-null counts after the filter
df.info()

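NASA uses the absolute magnitude parameter `H` and the geometric albedo `albedo` in their model, so both columns are dropped below, together with several other columns that are not used as features.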

In [None]:
# Remove every column that is not used as a feature, including H and albedo,
# then confirm which columns remain.
df = df.drop(
    columns=[
        'full_name',
        'H',
        'albedo',
        'G',
        'extent',
        'rot_per',
        'GM',
        'BV',
        'UB',
        'IR',
        'spec_B',
        'spec_T',
    ]
)
df.info()

In [None]:
# Check which dtype pandas inferred for the diameter column
df.diameter.dtype

In [None]:
# Convert diameter to a numeric dtype; entries that cannot be parsed become NaN
df['diameter'] = pd.to_numeric(df['diameter'], errors='coerce')

In [None]:
# Number of missing diameter values
df['diameter'].isna().sum()

In [None]:
# Drop the rows whose diameter is missing, then re-check the table
df = df.dropna(subset=['diameter'])
df.info()

In [None]:
# Summary statistics of the diameter column
df.diameter.describe()

In [None]:
# Drop the rows that have no data_arc value, then re-check the table
df = df.dropna(subset=['data_arc'])
df.info()

In [None]:
# Frequency of each distinct condition_code value
df.condition_code.value_counts()

In [None]:
# Convert condition_code to a numeric dtype; entries that cannot be parsed become NaN
df['condition_code'] = pd.to_numeric(df['condition_code'], errors='coerce')

In [None]:
# Frequency of each distinct neo value
df.neo.value_counts()

In [None]:
# Encode the neo flag as integers: 'N' -> 0, 'Y' -> 1
df['neo'] = df['neo'].replace({'N': 0, 'Y': 1})

In [None]:
# Frequency of each distinct pha value
df.pha.value_counts()

In [None]:
# Encode the pha flag as integers: 'N' -> 0, 'Y' -> 1
df['pha'] = df['pha'].replace({'N': 0, 'Y': 1})

In [None]:
# Re-check the dtypes after the numeric conversions and flag encoding
df.info()

In [None]:
# Down-cast every float64 column to float32 in place; other dtypes are left untouched
for name, dtype in df.dtypes.items():
    if dtype == float:
        df[name] = df[name].astype('float32')

## TRAIN TEST SPLIT

In [None]:
# Separate the regression target from the predictor columns
target = df['diameter']
features = df.drop(columns='diameter')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=28
)

## TRAIN

In [None]:
# 50 trees, each limited to depth 32.
# random_state fixes the bootstrap sampling and split selection so the fitted forest,
# and therefore the R^2 and the plot below, are identical on every run
# (the same seed value is used for the train/test split).
forest = RandomForestRegressor(
    n_estimators=50,
    max_depth=32,
    random_state=28,
)

In [None]:
# Display the training feature table
X_train

In [None]:
# Print the number of missing values in each column
missing_per_column = df.isna().sum()
for name, n_missing in missing_per_column.items():
    print(name, n_missing)

In [None]:
# Fit the forest on the training split; the target is flattened to a 1-D array
forest.fit(X=X_train, y=np.ravel(y_train))

## PREDICTION

In [None]:
# Predict diameters for the held-out test features
y_pred = forest.predict(X_test)

## METRICS

In [None]:
# Coefficient of determination of the predictions on the test split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Scatter plot of predicted against actual diameters for the test split
fig = go.Figure(
    data=go.Scatter(x=y_test, y=y_pred, mode='markers')
)

# update_layout returns the figure, so as the last expression it renders the plot
fig.update_layout(
    title='Actual vs predicted diameters',
    xaxis_title='Actual diameters',
    yaxis_title='Predicted diameters',
)