Importing the required libraries and the csv file

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics 
import os
from sklearn.model_selection import GridSearchCV
!pip install pydotplus
!pip install graphviz
%matplotlib inline
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.metrics import mean_squared_error

# For a run outside Kaggle, point pd.read_csv at a local copy of dataset.csv instead
# (the author's local call also passed low_memory=False).
import os
# List every file under the Kaggle input directory so the dataset path used below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Load the asteroid dataset into the `data` DataFrame used by the rest of the notebook.
data = pd.read_csv('/kaggle/input/asteroid-dataset/dataset.csv')

Displaying part of the data

In [None]:
# Allow up to 100 columns to be rendered so wide frames are not elided when displayed.
pd.set_option('display.max_columns', 100)
# First five rows of the raw dataset
data.head()

Printing the columns of the dataset

In [None]:
# Keep the column index in `columnnames` and print it to list every field name.
columnnames=data.columns
print(columnnames)

Describing the Dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Printing the shape of the dataset

In [None]:
# (number of rows, number of columns) of the raw dataset
data.shape

In [None]:
# Draw one histogram per numeric column on a single 30x25-inch figure.
data.hist(figsize=(30,25))
plt.show()

Checking to see if there are unique names for the asteroids

In [None]:
# Count the distinct values of each candidate identifier column; a count equal
# to the number of rows means the column identifies every asteroid uniquely.
for id_column in ('id', 'spkid', 'full_name', 'pdes'):
    print(data[id_column].nunique())

Printing the share of potentially hazardous asteroids

In [None]:
# Class balance of the potentially-hazardous flag (value_counts ignores missing entries).
print(data['pha'].value_counts(normalize=True))
pha_counts = data['pha'].value_counts()
# Derive each wedge label from the category it belongs to instead of relying on
# the frequency ordering that value_counts happens to return.
pha_labels = {'N': 'Not a Potential Hazard', 'Y': 'Potential Hazard'}
name = [pha_labels.get(flag, str(flag)) for flag in pha_counts.index]
plt.title("Asteroid Potential Hazard")
plt.pie(pha_counts, labels=name, autopct='%0.4f%%', shadow=True, startangle=90)
plt.show()

Printing the count of near earth objects

In [None]:
# Relative frequency of the near-Earth-object flag (value_counts ignores missing entries).
neocount=data['neo'].value_counts(normalize=True)
# Derive each wedge label from the category it belongs to instead of relying on
# the frequency ordering that value_counts happens to return.
neo_labels = {'N': 'Not a Near Earth Object', 'Y': 'Near Earth Object'}
name = [neo_labels.get(flag, str(flag)) for flag in neocount.index]
plt.title("Near Earth Objects")
plt.pie(neocount, labels=name, autopct='%0.4f%%', shadow=True, startangle=90)
plt.show()

Deciding to drop the following columns as they contain just one value

In [None]:
# For each suspected constant column, show its distinct values and how many there are.
for constant_candidate in ('prefix', 'equinox'):
    print(data[constant_candidate].unique())
    print(data[constant_candidate].nunique())

Deleting the following columns as they are redundant

In [None]:
# Remove the redundant columns; drop() returns a new frame, so `data` itself is left untouched.
redundant_columns = ['id', 'pdes', 'name', 'prefix', 'equinox']
cleandata = data.drop(columns=redundant_columns)

Cleaning the dataset and handling the missing values by filling them with the column mean

In [None]:
# Fill missing diameters with the column mean. fillna() is used instead of
# replace(np.NaN, ...) because the `np.NaN` alias no longer exists in NumPy 2.0.
# NOTE(review): the mean is taken over all rows, i.e. before the train/test split.
cleandata["diameter"] = cleandata["diameter"].fillna(cleandata["diameter"].mean())

In [None]:
# Fill missing albedo values with the column mean. fillna() is used instead of
# replace(np.NaN, ...) because the `np.NaN` alias no longer exists in NumPy 2.0.
cleandata["albedo"] = cleandata["albedo"].fillna(cleandata["albedo"].mean())

In [None]:
# Fill missing diameter_sigma values with the column mean. fillna() is used instead of
# replace(np.NaN, ...) because the `np.NaN` alias no longer exists in NumPy 2.0.
cleandata["diameter_sigma"] = cleandata["diameter_sigma"].fillna(cleandata["diameter_sigma"].mean())

In [None]:
# Fill missing H values with the column mean. fillna() is used instead of
# replace(np.NaN, ...) because the `np.NaN` alias no longer exists in NumPy 2.0.
cleandata["H"] = cleandata["H"].fillna(cleandata["H"].mean())

Checking to see if all the columns have any remaining missing values

In [None]:
# The `count` row of describe() gives the non-null entries per numeric column,
# so a count below the row total marks a column that still has missing values.
cleandata.describe()

Removing the rows that have missing values in the following two columns

In [None]:
# Keep only the rows where both 'sigma_ad' and 'ma' are present.
cleandata = cleandata.dropna(subset=['sigma_ad', 'ma'])

Checking the shape of the dataset

In [None]:
# (number of rows, number of columns) left after cleaning
cleandata.shape

Checking the columns that remain after cleaning

In [None]:
# Column labels of the cleaned frame
cleandata.columns

Since the following columns do not hold numerical values, converting them to the category type

In [None]:
# astype() returns a new DataFrame, so the category conversion no longer writes
# into `cleandata` through an alias (assigning columns on that filtered frame
# can raise SettingWithCopyWarning).
dataframe_asteroid = cleandata.astype({
    'neo': 'category',
    'pha': 'category',
    'class': 'category',
    'orbit_id': 'category',
})

Checking for the count of neo and pha once again after cleaning

In [None]:
# Percentage of rows in each 'neo' category after cleaning
dataframe_asteroid['neo'].value_counts(normalize=True)*100

In [None]:
dataframe_asteroid[dataframe_asteroid['neo']=='Y']['pha'].value_counts(normalize=True)*100
# Percentage split of 'pha' among near-Earth objects (author's finding: not every near-Earth asteroid is potentially hazardous)

In [None]:
# Percentage of rows in each 'pha' category after cleaning
dataframe_asteroid['pha'].value_counts(normalize=True)*100

In [None]:
dataframe_asteroid[dataframe_asteroid['pha']=='Y']['neo'].value_counts(normalize=True)*100
# Percentage split of 'neo' among potentially hazardous asteroids (author's finding: all PHAs are near-Earth objects)

Checking for the counts of classes and orbit_ids

In [None]:
# Percentage of rows in each orbit class
dataframe_asteroid['class'].value_counts(normalize=True)*100

In [None]:
# Number of non-null 'pha' entries per class; after reset_index() the count
# lives in a column that is still named 'pha'.
counts = dataframe_asteroid.groupby('class')['pha'].count().reset_index()
counts

In [None]:
# Horizontal bar chart of the number of asteroids per class, smallest count first.
counts = counts.sort_values(by = "pha",ascending=True)
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(5,5))
sns.barplot(y="class", x="pha", data=counts, ax=ax)
ax.set(ylabel='Class', xlabel='Count of Asteroids')
ax.set_title("Asteroids and classes")
# Save through the figure handle BEFORE plt.show(): once the figure has been
# shown, plt.savefig() would write a new, empty figure to disk.
f.savefig("class.png")
plt.show()

In [None]:
# Number of distinct orbit_id values
dataframe_asteroid['orbit_id'].nunique()

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old index instead of keeping it as a column.
dataframe_asteroid = dataframe_asteroid.reset_index(drop=True)

Before performing min-max scaling, taking a subset of the dataset that contains just the numerical columns and keeping the rest aside

In [None]:
# Identifier, label and categorical columns that are kept out of the numeric subset.
held_out_columns = ['spkid', 'full_name', 'neo', 'pha', 'orbit_id', 'class']
is_kept = ~dataframe_asteroid.columns.isin(held_out_columns)
dataframe_subset = dataframe_asteroid[dataframe_asteroid.columns[is_kept]]

In [None]:
# First five rows of the full (unscaled) frame
dataframe_asteroid.head()

In [None]:
# Correlate the numeric columns only: the frame still holds text and category
# columns, which DataFrame.corr() refuses to process in pandas >= 2.0.
correlation_matrix = dataframe_asteroid.select_dtypes(include='number').corr()
fig1 = plt.figure(figsize=(30,25))
plt.title("Correlation matrix of Potential Hazard Asteroids data")
sns.heatmap(data=correlation_matrix,cmap='jet',vmin=-1,vmax=1,linewidth=1.5,annot=True)
# Write the image while the figure is still open, then display it.
fig1.savefig("correlationmatrix.png")
plt.show()

After looking at the correlation matrix, we are removing epoch_mjd and epoch_cal since they are highly correlated with epoch; sigma_w, sigma_a and a are removed as well because they are also highly correlated

In [None]:
# Features judged redundant from the correlation heatmap.
highly_correlated = ['epoch_mjd', 'epoch_cal', 'sigma_w', 'sigma_a', 'a']
dataframe_subset = dataframe_subset.drop(columns=highly_correlated)

After removing the highly correlated redundant features.

In [None]:
# Recompute the correlations on the reduced numeric subset, draw the heatmap
# and save it through the handle of the figure created here.
fig1 = plt.figure(figsize=(30,25))
correlation_matrix = dataframe_subset.corr()
plt.title("Correlation matrix of Potential Hazard Asteroids data")
sns.heatmap(
    data=correlation_matrix,
    cmap='jet',
    vmin=-1,
    vmax=1,
    linewidth=1.5,
    annot=True,
)
plt.show()
fig1.savefig("correlationmatrix1.png")

Performing minmax scaling so that it is easier for the models to predict.
Also performing concatenation of the non numeric columns back to get the entire dataset after scaling

In [None]:
from sklearn import preprocessing

# Rescale every numeric feature with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split — confirm this leakage is acceptable.
scaler = preprocessing.MinMaxScaler()
dataframe_scaled = scaler.fit_transform(dataframe_subset)
# Wrap the transformed values back into a DataFrame that carries the original column names.
dataframe_scaled = pd.DataFrame(dataframe_scaled, columns=dataframe_subset.columns)
# Column-wise concat aligns on the index; both frames are expected to share a
# 0..n-1 index thanks to the earlier reset_index(drop=True).
dataframe_asteroid = pd.concat([dataframe_asteroid[['spkid', 'full_name','neo', 'pha', 'orbit_id', 'class']],dataframe_scaled],axis=1)
# Only the last expression of a cell is rendered, so the two head() calls below produce no output.
dataframe_scaled.head()
dataframe_asteroid.head()
dataframe_asteroid.describe()

Applying get_dummies for the columns which had non numeric values

In [None]:
# One-hot encode the categorical feature columns; 'pha' is not encoded here
# because it is used as the prediction target later on.
dataframe_asteroid1 = pd.get_dummies(dataframe_asteroid, columns=['neo', 'class', 'orbit_id'])
dataframe_asteroid1.head()

Splitting into training and test data, using stratify so that the proportion of Y and N labels is the same in the training and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Features: every column except the two identifiers and the target.
X = dataframe_asteroid1.drop(columns=['spkid', 'full_name', 'pha'])
# Target: the potentially-hazardous flag.
y = dataframe_asteroid1['pha']
# 70/30 split with a fixed seed, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
    stratify=y,
)
print("Rows with label 'Y': {}".format(sum(y_train == 'Y')))
print("Rows with label 'N': {}".format(sum(y_train == 'N')))


Checking to see if any of the values are null

In [None]:
# Number of null entries in each feature column
X.isnull().sum()

Checking the dataset to see if all the scaling, one hot encoding and missing values are handled properly

In [None]:
# First five rows of the scaled and one-hot-encoded frame
dataframe_asteroid1.head()

Creating a function to calculate the metrics easily for precision,recall,accuracy and f1

In [None]:
def CalculationOfMetric(y_test, pred):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Precision, recall and F1 are macro-averaged; every score is rounded to
    four decimals before printing.

    Parameters
    ----------
    y_test : true labels.
    pred : predicted labels to compare against ``y_test``.

    Returns
    -------
    None. The four scores are written to standard output.
    """
    macro = {"average": "macro"}
    # All scores are computed first, so nothing is printed if any of them fails.
    report = [
        ('Precision metric:', metrics.precision_score(y_test, pred, **macro)),
        ('Recall Metric:', metrics.recall_score(y_test, pred, **macro)),
        ('Accuracy Metric:', metrics.accuracy_score(y_test, pred)),
        ('F1 score:', metrics.f1_score(y_test, pred, **macro)),
    ]
    for caption, score in report:
        print(caption, round(score, 4))

Trying SMOTE since we have a minority of potentially hazardous asteroids in order to get equal number of Y and N cases

In [None]:
#Synthetic Minority Oversampling Technique( you need to install it on your laptop)
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state = 12) 
x_train_res, y_train_res = sm.fit_sample(x_train, y_train.ravel()) 
  
print("After SMOTE, counts of label 'N': {}".format(sum(y_train_res == 'N'))) 
print("After SMOTE, counts of label 'Y': {}".format(sum(y_train_res == 'Y'))) 

Logistic regression model without using SMOTE, it has a lot of false negatives

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: fitted on the original training split with an iteration cap
# of 10000 (fit() returns the estimator itself).
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train, y_train)

# Class predictions for the held-out test split
lr_pred = logisticRegr.predict(x_test)

In [None]:
# Test-split scores of the logistic regression trained without SMOTE
CalculationOfMetric(y_test, lr_pred)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_test, lr_pred))

Using SMOTE dataset to check if the false negatives are eliminated

In [None]:
from sklearn.linear_model import LogisticRegression

# Same model settings, but fitted on the SMOTE-resampled training data.
# This rebinds `logisticRegr`, so the earlier baseline estimator is replaced.
logisticRegr = LogisticRegression(max_iter=10000).fit(x_train_res, y_train_res)

# Predictions are still made on the untouched test split
slr_pred = logisticRegr.predict(x_test)

In [None]:
# Test-split scores of the logistic regression trained on SMOTE-resampled data
CalculationOfMetric(y_test, slr_pred)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_test, slr_pred))

Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 150 trees with a fixed seed, fitted on the original training split
# (fit() returns the estimator itself).
rf = RandomForestClassifier(n_estimators=150, random_state=1551).fit(x_train, y_train)
# Class predictions for the held-out test split
rf_pred = rf.predict(x_test)

In [None]:
# Test-split scores of the random forest
CalculationOfMetric(y_test, rf_pred)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_test, rf_pred))

Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Feature labels reused when the fitted tree is exported for plotting.
FEATURE_NAMES=X.columns
# A fixed random_state makes the fitted tree, and every metric and plot derived
# from it, reproducible across runs (the split search shuffles features internally).
model = DecisionTreeClassifier(random_state=1)
model.fit(x_train,y_train)
model_pred=model.predict(x_test)

In [None]:
# Test-split scores of the decision tree
CalculationOfMetric(y_test, model_pred)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_test, model_pred))

Plotting the decision tree

In [None]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
# Export the fitted tree as DOT source (out_file=None returns it as a string).
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index here; class_names come from the fitted model so they
# always follow the order of model.classes_.
dot_data = tree.export_graphviz(model, out_file=None,
                                feature_names=list(FEATURE_NAMES),
                                class_names=[str(label) for label in model.classes_],
                                filled=True)

graph = graphviz.Source(dot_data, format="png")
# Last expression of the cell: renders the graph inline.
graph


Saving the decision tree as a png file

In [None]:
# io.StringIO from the standard library replaces six.StringIO, so the cell no
# longer needs the third-party `six` package.
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus

# Write the DOT description of the fitted tree into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                feature_names=list(FEATURE_NAMES),
                class_names=[str(label) for label in model.classes_])
# Parse the DOT text and render it to a PNG file in the working directory.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('my_decision_tree.png')

GaussianNB model performing hyper parameter tuning as the model was giving a lesser accuracy score than other models

In [None]:
from sklearn.naive_bayes import GaussianNB 
nb_classifier = GaussianNB()

# Candidate var_smoothing values: 20 points log-spaced from 1e0 down to 1e-9.
params_NB = {'var_smoothing': np.logspace(0,-9, num=20)}
# Cross-validated grid search over var_smoothing (cv is left at the GridSearchCV default).
# NOTE(review): 'accuracy' is dominated by the majority class on this imbalanced target — consider a macro-F1 scorer.
gs_NB = GridSearchCV(estimator=nb_classifier, 
                 param_grid=params_NB,
                 verbose=1, 
                 scoring='accuracy') 
gs_NB.fit(x_train, y_train)
# Parameter setting that obtained the best cross-validated score
gs_NB.best_params_

In [None]:
# Predict with the fitted grid search and score the result on the test split.
gs_NBbest = gs_NB.predict(x_test)
CalculationOfMetric(y_test, gs_NBbest)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_test, gs_NBbest))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Untuned Gaussian naive Bayes with default settings (fit() returns the estimator itself).
gnb = GaussianNB().fit(x_train, y_train)

# Class predictions for the held-out test split
gnb_pred = gnb.predict(x_test)

In [None]:
# Test-split scores of the untuned Gaussian naive Bayes model
CalculationOfMetric(y_test, gnb_pred)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_test, gnb_pred))

Printing the important features in order for random forest

In [None]:
# Tabulate the random forest's feature importances against the training column
# names, then order the table from most to least important.
importance_table = pd.DataFrame(
    rf.feature_importances_,
    index=x_train.columns,
    columns=['Importance'],
)
feature_imp = importance_table.sort_values(by='Importance', ascending=False)

In [None]:
# Ten highest-ranked features
feature_imp.head(10)

Printing the Metrics for all the models

In [None]:
# Side-by-side report of the test-split metrics of every model trained above.
print("logistic regression metrics")
CalculationOfMetric(y_test, lr_pred)
print("*******************************************")
print("logistic regression metrics with SMOTE")
# Previously commented out, which left this heading without any numbers even
# though slr_pred is used unconditionally by the comparison plots.
CalculationOfMetric(y_test, slr_pred)
print("*******************************************")
print("Random forest metrics")
CalculationOfMetric(y_test, rf_pred)
print("*******************************************")
print("Gaussian naive bayes after hyper parameter turning metrics")
CalculationOfMetric(y_test, gs_NBbest)
print("*******************************************")
print("Gaussian naive bayes metrics")
CalculationOfMetric(y_test, gnb_pred)
print("*******************************************")
print("Decision Tree metrics")
CalculationOfMetric(y_test, model_pred)

Plotting a bar graph to see the accuracy scores for all the models

In [None]:
# Accuracy of every model on the same held-out test split.
accuracy_scores = {
    'Logistic regression with Smote': metrics.accuracy_score(y_test, slr_pred),
    'Logistic regression without Smote': metrics.accuracy_score(y_test, lr_pred),
    'Random forest': metrics.accuracy_score(y_test, rf_pred),
    'decision trees': metrics.accuracy_score(y_test, model_pred),
    'GNB with Hyperparameter tuning': metrics.accuracy_score(y_test, gs_NBbest),
    'Gaussian naive bayes': metrics.accuracy_score(y_test, gnb_pred),
}
classifiers = list(accuracy_scores.keys())
accuracy = list(accuracy_scores.values())
plt.figure(figsize = (20, 5))
plt.bar(classifiers, accuracy, 0.4)
# axhline spans the full width of the axes, so the reference line no longer
# depends on a hand-sized x range tied to the number of classifiers.
plt.axhline(max(accuracy), color = 'red', label = 'Maximum Accuracy')
# Same margins as before (-0.4 .. 5.9 for six bars), now derived from the bar count.
plt.xlim(-0.4, len(classifiers) - 0.1)
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.legend(loc = 'right')
plt.title('Accuracy for Classifier models')
print("maximum accuracy is",max(accuracy))

Plotting a bar graph to see the F1 scores for all the models

In [None]:
# Macro-averaged F1 score of every model on the same held-out test split.
F1_scores = {
    'Logistic regression using SMOTE': metrics.f1_score(y_test, slr_pred, average = "macro"),
    'Logistic regression': metrics.f1_score(y_test, lr_pred, average = "macro"),
    'Random forest': metrics.f1_score(y_test, rf_pred, average = "macro"),
    'decision trees': metrics.f1_score(y_test, model_pred, average = "macro"),
    'GNB with hyperparameter tuning': metrics.f1_score(y_test, gs_NBbest, average = "macro"),
    'Gaussian naive bayes': metrics.f1_score(y_test, gnb_pred, average = "macro"),
}
classifiers = list(F1_scores.keys())
f1 = list(F1_scores.values())
plt.figure(figsize = (20, 5))
plt.bar(classifiers, f1, 0.4)
# axhline spans the full width of the axes, so the reference line no longer
# depends on a hand-sized x range tied to the number of classifiers.
plt.axhline(max(f1), color = 'red', label = 'Maximum F1 score')
# Same margins as before (-0.4 .. 5.9 for six bars), now derived from the bar count.
plt.xlim(-0.4, len(classifiers) - 0.1)
plt.xlabel('Classifiers')
plt.ylabel('F1 Score')
plt.legend(loc = 'right')
plt.title('F1 score for Classifier models')
print("maximum F1 score is",max(f1))