## Import Statements

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns

## Reading CSV Data

In [None]:
# Read the asteroid CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/nasa-asteroids-classification/nasa.csv',
)
df.head(5)

In [None]:
# Show the (row count, column count) of the loaded DataFrame.
df.shape

In [None]:
# Print a per-column summary (non-null counts and dtypes) of the raw data.
df.info()

## Dropping columns that are not useful for classification

In [None]:
# Remove the reference-ID, name, orbit-ID and date columns, which this
# notebook does not use as classification features, then preview the result.
df = df.drop(
    columns=[
        'Neo Reference ID',
        'Name',
        'Orbit ID',
        'Close Approach Date',
        'Epoch Date Close Approach',
        'Orbit Determination Date',
    ]
)
df.head(5)

## One Hot Encoding of Hazardous Column

In [None]:
# Build one indicator column per distinct value of 'Hazardous' and display it.
hazardous_labels = pd.get_dummies(data=df['Hazardous'])
hazardous_labels

In [None]:
# Append the indicator columns to the right of the existing columns.
df = pd.concat(
    [df, hazardous_labels],
    axis='columns',
)
df.head(5)

In [None]:
# The original 'Hazardous' column is now represented by its indicator
# columns, so remove it and preview the frame.
df = df.drop(columns='Hazardous')
df.head(5)

In [None]:
# Print the per-column summary again to inspect the columns after encoding.
df.info()

## Observing Unique Values in Orbiting Body and Equinox

In [None]:
# Count how often each distinct value occurs in 'Orbiting Body'.
df['Orbiting Body'].value_counts()

In [None]:
# Count how often each distinct value occurs in 'Equinox'.
df['Equinox'].value_counts()

## Each of these columns has only a single unique value, so both can be dropped

In [None]:
# Remove the 'Orbiting Body' and 'Equinox' columns from the frame.
df = df.drop(
    columns=[
        'Orbiting Body',
        'Equinox',
    ]
)

## Heatmap of Correlation

In [None]:
# Draw the pairwise column correlations as an annotated heatmap on a
# 20x20-inch figure.
plt.figure(figsize=(20, 20))
sns.heatmap(
    data=df.corr(),
    annot=True,
)

## Many columns have a correlation of 1 because they hold the same measurement expressed in a different unit, so the redundant ones can be dropped.

In [None]:
# Remove the estimated-diameter, velocity and miss-distance columns listed
# below, then preview the slimmed-down frame.
df = df.drop(
    columns=[
        'Est Dia in KM(max)',
        'Est Dia in M(min)',
        'Est Dia in M(max)',
        'Est Dia in Miles(min)',
        'Est Dia in Miles(max)',
        'Est Dia in Feet(min)',
        'Est Dia in Feet(max)',
        'Relative Velocity km per hr',
        'Miles per hour',
        'Miss Dist.(lunar)',
        'Miss Dist.(kilometers)',
        'Miss Dist.(miles)',
    ]
)
df.head(5)

## Final Heatmap

In [None]:
# Re-plot the annotated correlation heatmap for the columns that remain.
plt.figure(figsize=(20, 20))
sns.heatmap(
    data=df.corr(),
    annot=True,
)

## Drop the `False` column; the `True` column alone is enough for classification

In [None]:
# Remove the indicator column labelled `False` (a boolean column label, not
# a string). The result is reassigned instead of mutating in place so this
# cell follows the same `df = df.drop(...)` pattern as every other drop in
# the notebook.
df = df.drop(columns=[False])

In [None]:
# Preview the first five rows of the frame after all column drops.
df.head()

In [None]:
# Show descriptive statistics for the remaining columns.
df.describe()

## Model Building

In [None]:
# The target is the indicator column labelled `True`, cast to integers;
# every other column is used as a feature.
y = df[True].astype(int)
x = df.drop(columns=[True])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. `stratify=y` keeps the proportion
# of the two target classes the same in the train and test partitions, so the
# reported test score is not distorted by a chance class imbalance in the
# split. `random_state=0` keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

## XGBoost Classifier

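### XGBoost is a decision-tree-based ensemble machine learning algorithm that uses a gradient boosting framework. In prediction problems involving unstructured data (images, text, etc.), artificial neural networks tend to outperform other approaches, whereas for small-to-medium structured/tabular data, decision-tree-based algorithms such as XGBoost are generally a strong choice. It also has a wide range of applications: it can be used to solve regression, classification, ranking, and user-defined prediction problems.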

### The XGBoost library provides a built-in function to plot features ordered by their importance.

In [None]:
from matplotlib import pyplot
from xgboost import XGBClassifier, plot_importance

# Fit an XGBoost classifier with default settings on the training split;
# `fit` returns the estimator itself, so the fitted model is bound directly.
xbg_model = XGBClassifier().fit(x_train, y_train)

# Plot the fitted model's feature importances and render the figure.
plot_importance(booster=xbg_model)
pyplot.show()

### Final Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split and compare against the true labels.
predictions = xbg_model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=predictions)

# Report the accuracy as a percentage rounded to two decimals.
acc_percent = np.round(acc * 100, 2)
print(str(acc_percent) + '%')