In [None]:
# Standard library
import json

# Third-party
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (balanced_accuracy_score,
                             classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

**Create the base dataframe from which the analysis will take place.**

In [None]:
# Create a DataFrame from the JSON file
# (the path is relative to the directory the notebook is run from)
path = 'resources/neo_data.json'

df_original = pd.read_json(path)
# Print the column names, dtypes and non-null counts of the raw frame
df_original.info()

def remove_unnecessary_columns(df):
    '''
        Strip the identifier columns ('name', 'id', 'orbiting_body') from a
        Near Earth Object dataframe. These are labels assigned by NASA rather
        than scientific descriptions of the object itself.

        The input dataframe is left untouched; a new, independent dataframe
        without those columns is returned. A KeyError is raised if any of the
        columns is missing from the input.
    '''
    identifier_columns = ['name', 'id', 'orbiting_body']
    # drop() hands back a new frame; the explicit copy keeps the result fully
    # independent of the caller's data
    return df.drop(columns=identifier_columns).copy()

# Base dataframe for the analysis: the raw data minus the identifier columns
base_df = remove_unnecessary_columns(df_original)

**Preliminary model experiments**

In [None]:
def get_features_set(df, target_column='is_potentially_hazardous'):
    '''
        Build the features set by removing the target column from a dataframe.

        df: dataframe holding both the feature columns and the target column.
        target_column: name of the column to remove; defaults to
            'is_potentially_hazardous'.

        Return a new dataframe with every column except the target. The input
        dataframe is not modified. A KeyError is raised if the target column
        is not present.
    '''
    # drop() returns a new frame; copy() keeps it independent of the input
    return df.drop(columns=[target_column]).copy()

# Define the features set: every column of base_df except the target column
X = get_features_set(base_df)
# Preview the first rows of the features
X.head()

In [None]:
# Target vector: an independent copy of the hazard flag column
y = base_df.loc[:, 'is_potentially_hazardous'].copy()
# Preview the first five labels
y.head()

In [None]:
# Check how many rows fall into each class of the target
y.value_counts()

In [None]:
# Split the data into training and testing sets.
# Stratify on the target so both splits keep the same class proportions;
# without it the rarer class can end up under-represented in the test set,
# which makes the per-class metrics below unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=78, stratify=y)

In [None]:
# No testing has been done yet to determine whether the data is distributed
# normally, so both the StandardScaler and the MinMaxScaler are tried to see
# which one suits this dataset better. Start with the StandardScaler, fitted
# on the training split only.
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)
X_train_scaled

In [None]:
# Scale the testing dataset with the scaler that was fitted on the training data
X_test_scaled = standard_scaler.transform(X_test)
X_test_scaled

In [None]:
# Print the .min() and .max() of the scaled training and testing sets
print("Scaled data min/max (StandardScaler):")
for split_name, split_values in (("Training", X_train_scaled),
                                 ("Testing", X_test_scaled)):
    print(f"{split_name} data min:", split_values.min())
    print(f"{split_name} data max:", split_values.max())

In [None]:
# Try the MinMaxScaler, again fitted on the training split only
minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train)
X_train_minmax

In [None]:
# Scale the testing dataset with the MinMaxScaler fitted on the training data
X_test_minmax = minmax_scaler.transform(X_test)
X_test_minmax

In [None]:
# Print the .min() and .max() of the scaled training and testing sets
print("Scaled data min/max (MinMaxScaler):")
for split_name, split_values in (("Training", X_train_minmax),
                                 ("Testing", X_test_minmax)):
    print(f"{split_name} data min:", split_values.min())
    print(f"{split_name} data max:", split_values.max())

KNN and DecisionTree models were created in a separate file and showed accuracy scores of 1.0, which suggests that those models are overfitting.

In [None]:
# Train an RBF-kernel support vector classifier on the standard-scaled features
svc_standard = SVC(kernel='rbf').fit(X_train_scaled, y_train)
svc_standard

In [None]:
# Validate the model by comparing its accuracy (model.score) on both splits
train_accuracy = svc_standard.score(X_train_scaled, y_train)
test_accuracy = svc_standard.score(X_test_scaled, y_test)
print(f'Train Accuracy: {train_accuracy:.3f}')
print(f'Test Accuracy: {test_accuracy:.3f}')

In [None]:
# Train the same RBF-kernel classifier on the min-max-scaled features
svc_minmax = SVC(kernel='rbf').fit(X_train_minmax, y_train)
svc_minmax

In [None]:
# Validate the model by comparing its accuracy (model.score) on both splits
train_accuracy = svc_minmax.score(X_train_minmax, y_train)
test_accuracy = svc_minmax.score(X_test_minmax, y_test)
print(f'Train Accuracy: {train_accuracy:.3f}')
print(f'Test Accuracy: {test_accuracy:.3f}')

In [None]:
# Vary the max_depth parameter of a RandomForestClassifier and record the
# train and test accuracy for each pass, to see where the model starts to
# overfit.

# Depths span from 1 up to 9 in steps of 1
depths = range(1, 10, 1)

# The scores dictionary holds depths and scores; it is turned into a
# dataframe below to make plotting easy
scores = {'train': [], 'test': [], 'depth': []}

# Loop through each depth (this will take time to run)
for depth in depths:
    # Seed the forest so that re-running the notebook reproduces the same
    # curve; otherwise each run draws different bootstrap samples
    clf = RandomForestClassifier(max_depth=depth, random_state=13)
    clf.fit(X_train_scaled, y_train)

    train_score = clf.score(X_train_scaled, y_train)
    test_score = clf.score(X_test_scaled, y_test)

    scores['depth'].append(depth)
    scores['train'].append(train_score)
    scores['test'].append(test_score)

# Create a dataframe from the scores dictionary and
# set the index to depth
scores_df = pd.DataFrame(scores).set_index('depth')

scores_df.head()

In [None]:
# Plot the scores dataframe with the plot method:
# one line per score column (train, test), with depth on the x-axis
scores_df.plot()

In [None]:
# Create a Random Forest model (seeded, so the result is reproducible)
classifier = RandomForestClassifier(random_state=13, max_depth=3, n_estimators=100)

# Fit (train) our model using the training data
classifier.fit(X_train_scaled, y_train)

# Calculate the accuracy of the model on the testing data
classifier.score(X_test_scaled, y_test)


In [None]:
# Calculate the accuracy of the model on the training data,
# for comparison with the testing accuracy computed in the previous cell
classifier.score(X_train_scaled, y_train)

In [None]:
# Check the model's balanced accuracy on the test set

# Predicted labels for the scaled test features
y_test_pred = classifier.predict(X_test_scaled)
print(balanced_accuracy_score(y_test, y_test_pred))

It looks like the model does well at identifying when an object is not dangerous, but it does relatively poorly when trying to identify dangerous objects. This is a problem! 

In [None]:
# Check the model's confusion matrix
# (true labels first, predicted labels second)
confusion_matrix(y_test, y_test_pred)

**Model Optimization**

In [None]:
# Rebalance the training classes with random oversampling.
# RandomOverSampler is imported with the other dependencies in the
# notebook's first cell.

# Instantiate the RandomOverSampler instance
random_oversampler = RandomOverSampler(random_state=1)

# Resample the scaled training data only; the test set is left untouched
X_resampled, y_resampled = random_oversampler.fit_resample(
    X_train_scaled, y_train)


In [None]:
# Count distinct values of the resampled target to check the class balance
y_resampled.value_counts()

In [None]:
# Train a forest with the same settings as `classifier`, but on the
# randomly oversampled training data
resampled_model = RandomForestClassifier(
    random_state=13, max_depth=3, n_estimators=100
).fit(X_resampled, y_resampled)
resampled_model

In [None]:
# Make predictions for testing features with both models:
# the original forest and the one trained on oversampled data
y_pred = classifier.predict(X_test_scaled)
y_pred_resampled = resampled_model.predict(X_test_scaled)

In [None]:
# Print the classification reports for the two models.
# The original model's report is kept in a variable because later cells
# print it again as the baseline for comparison.
original_classification_report = classification_report(y_test, y_pred)
print(original_classification_report)
print('----------')
print('Resampled using Random Oversampler:')
print(classification_report(y_test, y_pred_resampled))

In [None]:
# Instantiate the SMOTE instance
# (SMOTE is imported with the other dependencies in the notebook's first cell)
# Set the sampling_strategy parameter equal to auto
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [None]:
# Fit the training data to the smote_sampler model and resample it.
# Only the scaled training split is resampled.
X_resampled_smote, y_resampled_smote = smote_sampler.fit_resample(X_train_scaled, y_train)

In [None]:
# Count distinct values for the resampled target data to check the class balance
y_resampled_smote.value_counts()

In [None]:
# Instantiate a new RandomForestClassifier model.
# It is seeded so that re-running the notebook reproduces the same report.
# NOTE(review): unlike the other forests in this notebook, this one does not
# set max_depth=3, so the comparison with the original model mixes the
# effect of SMOTE with the effect of a different tree depth. Confirm this is
# intended.
smote_model = RandomForestClassifier(random_state=13)

# Fit the resampled data to the new model
smote_model.fit(X_resampled_smote, y_resampled_smote)

In [None]:
# Predict labels for the (un-resampled) scaled testing features
# using the model trained on SMOTE-resampled data
smote_y_pred = smote_model.predict(X_test_scaled)

In [None]:
# Show the baseline report next to the report of the SMOTE-trained model
print("Classification Report - Original Data")
print(original_classification_report)
print("---------")
print("Classification Report - Resampled Data - SMOTE")
smote_report = classification_report(y_test, smote_y_pred)
print(smote_report)

**Performance is improving, but not enough. Let's try adding more data from the NASA API**

In [None]:
# Import the additional data file from JSON into a dataframe
# (`path` is reused here and now points at the additional file)
path = 'resources/additional_neo_data.json'
df_additional = pd.read_json(path)

# Count the rows per class; is_potentially_hazardous == 1 is the class of interest
df_additional['is_potentially_hazardous'].value_counts()

In [None]:
# Keep only the rows flagged as hazardous (the minority class) from the
# additional data, then strip the identifier columns
is_hazardous = df_additional['is_potentially_hazardous'] == 1
minority_class_df = remove_unnecessary_columns(df_additional[is_hazardous])
minority_class_df.info()

In [None]:
# Combine the minority class dataframe with the base dataframe.
# ignore_index=True renumbers the rows so the combined index has no repeats.
# NOTE(review): if the additional file contains objects that are already in
# the base data, the same row could land in both the train and the test
# split. Confirm the two files do not overlap (or de-duplicate first).
combined_df = pd.concat([base_df, minority_class_df], ignore_index=True)
combined_df.info()

In [None]:
# Check the value counts of the target column after adding the extra rows
combined_df['is_potentially_hazardous'].value_counts()

In [None]:
# Define the new features set and drop the target column.
# This rebinds X; from here on it refers to the combined data.
X = get_features_set(combined_df)


In [None]:
# Define the target vector.
# This rebinds y; from here on it refers to the combined data.
y = combined_df['is_potentially_hazardous'].copy()

In [None]:
# Split the combined data into training and testing sets.
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=78, stratify=y)

In [None]:
# Re-fit a StandardScaler on the new training split, then apply that same
# fitted scaler to the new testing split
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)


In [None]:
# Vary the max_depth parameter of a RandomForestClassifier on the combined
# data and record the train and test accuracy for each pass.

# Depths span from 1 up to 9 in steps of 1
depths = range(1, 10, 1)

# The scores dictionary holds depths and scores; it is turned into a
# dataframe below to make plotting easy
scores = {'train': [], 'test': [], 'depth': []}

# Loop through each depth (this will take time to run)
for depth in depths:
    # Seed the forest so that re-running the notebook reproduces the same
    # curve; otherwise each run draws different bootstrap samples
    clf = RandomForestClassifier(max_depth=depth, random_state=13)
    clf.fit(X_train_scaled, y_train)

    train_score = clf.score(X_train_scaled, y_train)
    test_score = clf.score(X_test_scaled, y_test)

    scores['depth'].append(depth)
    scores['train'].append(train_score)
    scores['test'].append(test_score)

# Create a dataframe from the scores dictionary and
# set the index to depth
scores_df = pd.DataFrame(scores).set_index('depth')

scores_df.plot()

In [None]:
# Create a Random Forest model with the same settings as before,
# now trained on the combined data
classifier = RandomForestClassifier(random_state=13, max_depth=3, n_estimators=100)

# Fit (train) our model using the training data
classifier.fit(X_train_scaled, y_train)

# Calculate the accuracy of the model on the testing data
print(f"Testing data accuracy score: {classifier.score(X_test_scaled, y_test)}")

# Calculate the accuracy of the model on the training data
print(f"Training data accuracy score: {classifier.score(X_train_scaled, y_train)}")

# Check the model's balanced accuracy on the test set
y_test_pred = classifier.predict(X_test_scaled)
print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_test_pred)}")

In [None]:
# Make predictions for testing features with the model trained on the combined data
y_pred = classifier.predict(X_test_scaled)

In [None]:
# Print the classification reports for the two models.
# The "Original Data" report was computed earlier on the original test split,
# so the two reports are evaluated on different test sets.
print("Classification Report - Original Data")
print(original_classification_report)
print('----------------')
print('Classification Report: New Data Added')
print(classification_report(y_test, y_pred))

**Let's see if we can get precision and f1-score up a bit more by trying to use SMOTE with the new data set**

In [None]:
def generate_smote_models(X_train_scaled, y_train, X_test_scaled, y_test,
                          original_classification_report, random_state=13):
    '''
        Oversample the training data with SMOTE, train a
        RandomForestClassifier on the resampled data, and print how it
        performs next to a baseline classification report.

        X_train_scaled, y_train: scaled training features and their labels;
            only these are resampled.
        X_test_scaled, y_test: scaled testing features and their labels,
            used as-is for evaluation.
        original_classification_report: text of the baseline report, printed
            unchanged for comparison.
        random_state: seed for the random forest, so repeated calls give the
            same model and scores. Pass None for an unseeded forest.

        Return the fitted RandomForestClassifier.
    '''
    # Instantiate the SMOTE instance
    # Set the sampling_strategy parameter equal to auto
    smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')
    # Fit the training data to the smote_sampler model
    X_resampled_smote, y_resampled_smote = smote_sampler.fit_resample(X_train_scaled, y_train)

    # Instantiate a new RandomForestClassifier model and fit it to the
    # resampled data
    smote_model = RandomForestClassifier(random_state=random_state)
    smote_model.fit(X_resampled_smote, y_resampled_smote)

    # Predict labels for the (un-resampled) testing features
    smote_y_pred = smote_model.predict(X_test_scaled)

    # Calculate the accuracy of the model on the testing data
    print(f"Testing data accuracy score: {smote_model.score(X_test_scaled, y_test)}")

    # Calculate the accuracy of the model on the training data.
    # This is measured on the resampled training set the model was fitted on.
    print(f"Training data accuracy score: {smote_model.score(X_resampled_smote, y_resampled_smote)}")

    # Check the model's balanced accuracy on the test set
    print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, smote_y_pred)}")
    print('---------')
    print("Classification Report - Original Data")
    print(original_classification_report)
    print("---------")
    print("Classification Report - Resampled and Added Data - SMOTE")
    print(classification_report(y_test, smote_y_pred))
    return smote_model

# Run the SMOTE experiment on the combined dataset's splits; the returned
# model is the last expression of the cell, so it is displayed as the output
generate_smote_models(X_train_scaled, y_train, X_test_scaled, y_test, original_classification_report)