In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, f1_score, confusion_matrix
from sklearn import tree
from xgboost import plot_tree

In [None]:
# Load the NASA asteroid dataset from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/nasa-asteroids-classification/nasa.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts, to spot non-numeric columns and missing data.
data.info()

In [None]:
# Correlation heatmap over the raw dataset.
# The raw frame still contains non-numeric columns at this point (they are dropped
# further down). Since pandas 2.0, DataFrame.corr() raises on such columns instead of
# silently skipping them, so restrict the frame to numeric/boolean columns first.
plt.figure(figsize=(15, 10))
numeric_data = data.select_dtypes(include=["number", "bool"])
corr = numeric_data.corr()
sns.heatmap(corr, annot=True)

<h2>As the heatmap above shows, many columns are highly correlated with each other (for example, the same quantity expressed in different units). We drop these redundant columns below.</h2>

In [None]:
# Number of missing values in each column.
data.isna().sum()

# Data Preparation

In [None]:
# Encode the "Hazardous" column as integer class ids and store them in a new
# "Label" column (appended as the last column of the frame).
label_encoder = LabelEncoder()
hazard_ids = label_encoder.fit_transform(data["Hazardous"])
data["Label"] = hazard_ids
data.head()

In [None]:
# Number of rows per encoded class, to check the class balance.
data["Label"].value_counts()

In [None]:
# Original class values behind the encoded labels 0 and 1, in that order;
# reused later as tick labels for the confusion-matrix plots.
encoded_ids = [0, 1]
categories = list(label_encoder.inverse_transform(encoded_ids))
categories

In [None]:
# Distinct raw target values, captured before the "Hazardous" column is removed.
classes = list(set(data["Hazardous"]))

# Columns removed before modelling: duplicate unit conversions, identifiers,
# date/constant descriptors, and the raw target (already encoded in "Label").
# NOTE: the drop is in place, so re-running this cell on the same frame raises KeyError.
columns_to_drop = [
    "Miss Dist.(Astronomical)",
    "Miss Dist.(lunar)",
    "Miss Dist.(miles)",
    "Relative Velocity km per sec",
    "Est Dia in M(max)",
    "Relative Velocity km per hr",
    "Est Dia in Feet(max)",
    "Est Dia in Feet(min)",
    "Est Dia in Miles(max)",
    "Est Dia in Miles(min)",
    "Est Dia in KM(max)",
    "Est Dia in KM(min)",
    "Neo Reference ID",
    "Orbit ID",
    "Name",
    "Close Approach Date",
    "Equinox",
    "Epoch Date Close Approach",
    "Orbiting Body",
    "Orbit Determination Date",
    "Hazardous",
]
data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Preview the frame after the redundant columns were dropped.
data.head()

In [None]:
# Correlation heatmap of the reduced column set (remaining features plus "Label").
plt.figure(figsize=(15, 10))
corr = data.corr()
sns.heatmap(data=corr, annot=True)

In [None]:
# Build the feature matrix and target by column name rather than by position,
# so the split does not silently break if the column order changes.
X = data.drop(columns="Label")
y = data["Label"]

# Hold out 30% of the rows for testing.
# random_state makes the split (and every metric below) reproducible across runs;
# stratify=y keeps the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply it to both parts,
# so no test-set statistics leak into training.
# After this step X_train / X_test are numpy arrays, not DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier and report mean accuracy on both splits.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_train_acc = nb_model.score(X_train, y_train)
nb_test_acc = nb_model.score(X_test, y_test)
print("Accuracy of train:", nb_train_acc)
print("Accuracy of test:", nb_test_acc)

In [None]:
# Predict the test set with Naive Bayes and print the per-class report.
nb_model_preds = nb_model.predict(X_test)
nb_report = classification_report(y_test, nb_model_preds)
print(nb_report)

In [None]:
# Test-set metrics for Naive Bayes; precision, recall and F1 are macro-averaged.
nb_precision = precision_score(y_test, nb_model_preds, average='macro')
nb_recall = recall_score(y_test, nb_model_preds, average='macro')
nb_accuracy = accuracy_score(y_test, nb_model_preds)
nb_f1 = f1_score(y_test, nb_model_preds, average='macro')

print("Precision = {}".format(nb_precision))
print("Recall = {}".format(nb_recall))
print("Accuracy = {}".format(nb_accuracy))
print("F1 Score = {}".format(nb_f1))

In [None]:
# Confusion matrix of the Naive Bayes test predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, nb_model_preds)
ax = sns.heatmap(cm, annot=True, fmt='g', square=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_xticklabels(categories, fontsize=12)
ax.set_yticklabels(categories, fontsize=12, rotation=0)
plt.show()

# Logistic Regression

In [None]:
# Fit a logistic regression classifier and report mean accuracy on both splits.
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)
log_reg_train_acc = log_reg_model.score(X_train, y_train)
log_reg_test_acc = log_reg_model.score(X_test, y_test)
print("Accuracy of train:", log_reg_train_acc)
print("Accuracy of test:", log_reg_test_acc)

In [None]:
# Predict the test set with logistic regression and print the per-class report.
log_reg_preds = log_reg_model.predict(X_test)
log_reg_report = classification_report(y_test, log_reg_preds)
print(log_reg_report)

In [None]:
# Test-set metrics for logistic regression; precision, recall and F1 are macro-averaged.
log_reg_precision = precision_score(y_test, log_reg_preds, average='macro')
log_reg_recall = recall_score(y_test, log_reg_preds, average='macro')
log_reg_accuracy = accuracy_score(y_test, log_reg_preds)
log_reg_f1 = f1_score(y_test, log_reg_preds, average='macro')

print("Precision = {}".format(log_reg_precision))
print("Recall = {}".format(log_reg_recall))
print("Accuracy = {}".format(log_reg_accuracy))
print("F1 Score = {}".format(log_reg_f1))

In [None]:
# Confusion matrix of the logistic regression test predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, log_reg_preds)
ax = sns.heatmap(cm, annot=True, fmt='g', square=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_xticklabels(categories, fontsize=12)
ax.set_yticklabels(categories, fontsize=12, rotation=0)
plt.show()

# Random Forest

In [None]:
# Fit a random forest and report mean accuracy on both splits.
# A random forest is stochastic (bootstrap samples and random feature subsets), so
# random_state is fixed to make the scores and feature importances reproducible.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("Accuracy of train:", rf_model.score(X_train, y_train))
print("Accuracy of test:", rf_model.score(X_test, y_test))

In [None]:
# Horizontal bar chart of the random forest's importance score for each input feature.
# X (the unscaled DataFrame) is used only for its column names.
rf_importances = rf_model.feature_importances_
feature_names = X.columns
plt.figure(figsize=(12, 8))
sns.barplot(x=rf_importances, y=feature_names)
plt.show()

In [None]:
# Predict the test set with the random forest and print the per-class report.
rf_model_preds = rf_model.predict(X_test)
rf_report = classification_report(y_test, rf_model_preds)
print(rf_report)

In [None]:
# Test-set metrics for the random forest; precision, recall and F1 are macro-averaged.
rf_precision = precision_score(y_test, rf_model_preds, average='macro')
rf_recall = recall_score(y_test, rf_model_preds, average='macro')
rf_accuracy = accuracy_score(y_test, rf_model_preds)
rf_f1 = f1_score(y_test, rf_model_preds, average='macro')

print("Precision = {}".format(rf_precision))
print("Recall = {}".format(rf_recall))
print("Accuracy = {}".format(rf_accuracy))
print("F1 Score = {}".format(rf_f1))

In [None]:
# Confusion matrix of the random forest test predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, rf_model_preds)
ax = sns.heatmap(cm, annot=True, fmt='g', square=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_xticklabels(categories, fontsize=12)
ax.set_yticklabels(categories, fontsize=12, rotation=0)
plt.show()

# Decision Tree

In [None]:
# Fit a single decision tree and report mean accuracy on both splits.
# random_state is fixed so that the same tree (and therefore the same plot, scores
# and feature importances) is produced on every run.
dec_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Accuracy of train:", dec_tree.score(X_train, y_train))
print("Accuracy of test:", dec_tree.score(X_test, y_test))

In [None]:
# Draw the fitted decision tree.
# The tree was trained on the scaled numpy array, which carries no column names, so
# the names are passed explicitly from X (the unscaled DataFrame); otherwise the
# nodes are labelled with anonymous feature indices. Split thresholds are shown in
# standardized units. class_names follows the encoded order 0, 1 stored in `categories`.
plt.figure(figsize=(15, 10))
tree.plot_tree(
    dec_tree,
    feature_names=list(X.columns),
    class_names=[str(category) for category in categories],
    filled=True,
)
# Render the figure and suppress the long list of annotation reprs.
plt.show()

In [None]:
# Horizontal bar chart of the decision tree's importance score for each input feature.
# X (the unscaled DataFrame) is used only for its column names.
dt_importances = dec_tree.feature_importances_
feature_names = X.columns
plt.figure(figsize=(12, 8))
sns.barplot(x=dt_importances, y=feature_names)
plt.show()

In [None]:
# Predict the test set with the decision tree and print the per-class report.
dec_tree_preds = dec_tree.predict(X_test)
dec_tree_report = classification_report(y_test, dec_tree_preds)
print(dec_tree_report)

In [None]:
# Test-set metrics for the decision tree; precision, recall and F1 are macro-averaged.
dec_tree_precision = precision_score(y_test, dec_tree_preds, average='macro')
dec_tree_recall = recall_score(y_test, dec_tree_preds, average='macro')
dec_tree_accuracy = accuracy_score(y_test, dec_tree_preds)
dec_tree_f1 = f1_score(y_test, dec_tree_preds, average='macro')

print("Precision = {}".format(dec_tree_precision))
print("Recall = {}".format(dec_tree_recall))
print("Accuracy = {}".format(dec_tree_accuracy))
print("F1 Score = {}".format(dec_tree_f1))

In [None]:
# Confusion matrix of the decision tree test predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, dec_tree_preds)
ax = sns.heatmap(cm, annot=True, fmt='g', square=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_xticklabels(categories, fontsize=12)
ax.set_yticklabels(categories, fontsize=12, rotation=0)
plt.show()

# Deep Learning

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers (20, 50, 50 units)
# followed by a single sigmoid output unit.
# input_shape describes ONE sample, without the batch axis, so it must be
# (n_features,). Passing the full X_train.shape, i.e. (n_samples, n_features),
# declares a 3-D input that does not match the 2-D batches given to fit()/predict().
n_features = X_train.shape[1]
dl_model = Sequential()
dl_model.add(Dense(20, activation='relu', input_shape=(n_features,)))
dl_model.add(Dense(50, activation='relu'))
dl_model.add(Dense(50, activation='relu'))
dl_model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
dl_model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked per epoch.
dl_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in mini-batches of 100 rows, keeping the per-epoch history.
# NOTE(review): the test split is also used as validation data here, so the
# validation curves and the final test metrics come from the same rows.
dl_model_history = dl_model.fit(
    x=X_train,
    y=y_train,
    batch_size=100,
    epochs=100,
    validation_data=(X_test, y_test),
)

In [None]:
# Training vs. validation accuracy per epoch.
for accuracy_key in ('accuracy', 'val_accuracy'):
    plt.plot(dl_model_history.history[accuracy_key])
plt.legend(["accuracy","val_accuracy"])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

In [None]:
# Training vs. validation loss per epoch.
for loss_key in ('loss', 'val_loss'):
    plt.plot(dl_model_history.history[loss_key])
plt.legend(["loss","val_loss"])
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Looks like we have an overfitting problem.

In [None]:
# Predict on the test set; the sigmoid outputs are rounded to 0/1 labels
# before being compared with the true labels.
dl_model_preds = dl_model.predict(X_test)
dl_pred_labels = dl_model_preds.round()
print(classification_report(y_test, dl_pred_labels))

In [None]:
# Test-set metrics for the neural network; precision, recall and F1 are macro-averaged.
# The raw sigmoid outputs are rounded to 0/1 labels for each metric.
dl_precision = precision_score(y_test, dl_model_preds.round(), average='macro')
dl_recall = recall_score(y_test, dl_model_preds.round(), average='macro')
dl_accuracy = accuracy_score(y_test, dl_model_preds.round())
dl_f1 = f1_score(y_test, dl_model_preds.round(), average='macro')

print("Precision = {}".format(dl_precision))
print("Recall = {}".format(dl_recall))
print("Accuracy = {}".format(dl_accuracy))
print("F1 Score = {}".format(dl_f1))

In [None]:
# Confusion matrix of the neural network's rounded test predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_test, dl_model_preds.round())
ax = sns.heatmap(cm, annot=True, fmt='g', square=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_xticklabels(categories, fontsize=12)
ax.set_yticklabels(categories, fontsize=12, rotation=0)
plt.show()