In [1]:
# Import our dependencies
import os
from urllib.parse import quote

import pandas as pd
import psycopg2 as psy
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

## Preprocessing

In [None]:
# We want to find out about the nativeness of species in the U.S. National Parks. Some national parks contain more than 50%
# non-native species.
# The data was first cleaned in Python and then merged in PostgreSQL. We first cleaned the table to remove
# unwanted columns that do not pertain to our machine learning model.

In [2]:
# Create the SQLAlchemy engine for the National_Parks PostgreSQL database, which holds the
# learningspecies table. The password is read from the PGPASSWORD environment variable so the
# real credential never has to be written into the notebook; 'password' is only the placeholder
# that was substituted before posting the code on GitHub.
db_password = os.environ.get("PGPASSWORD", "password")

# quote() percent-encodes special characters (e.g. '@') so they cannot be mistaken for URL delimiters.
engine = create_engine('postgresql+psycopg2://postgres:%s@localhost:5432/National_Parks' % quote(db_password))

In [3]:
# Load the learningspecies table from PostgreSQL into a DataFrame.
# The context manager closes the connection even if the query raises.
with engine.connect() as conn:
    park_df = pd.read_sql('SELECT * FROM learningspecies;', conn)

park_df.head()

Unnamed: 0,genus,category,park_name,conservation_status,nativeness
0,Alces,Mammal,Acadia,Least Concern,Native
1,Odocoileus,Mammal,Acadia,Least Concern,Native
2,Canis,Mammal,Acadia,Species of Concern,Not Native
3,Canis,Mammal,Acadia,Endangered,Native
4,Lynx,Mammal,Acadia,Least Concern,Native


In [4]:
# Collect the names of all columns stored with the "object" dtype (our categorical variables)
park_cat = [column for column, dtype in park_df.dtypes.items() if dtype == "object"]

# Count the distinct values held by each categorical column
park_df[park_cat].nunique()

genus                  6184
category                 14
park_name                56
conservation_status       5
nativeness                2
dtype: int64

In [None]:
# Look at how many rows fall into each nativeness class to decide whether binning is needed
park_df["nativeness"].value_counts()

## Feature Engineering

In [None]:
# In order to build a model that predicts the nativeness of species, we use one-hot encoding to prepare
# our data for the train and test split. We will use a Random Forest model, based on the pros and cons in our ReadMe.md

In [7]:
# First, we create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword was later removed),
# so try the current keyword first and fall back for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# We then fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(park_df[park_cat])

# Look up the encoded variable names; `get_feature_names` was replaced by
# `get_feature_names_out` in newer scikit-learn releases.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(park_cat)
else:
    encoded_names = enc.get_feature_names(park_cat)

# Reuse park_df's index so that a later index-based merge lines the rows up correctly
encode_df = pd.DataFrame(encoded_values, columns=encoded_names, index=park_df.index)
encode_df.head()

Unnamed: 0,genus_Abaeis,genus_Abelia,genus_Abgrallaspis,genus_Abies,genus_Abietinella,genus_Abildgaardia,genus_Ablabesmyia,genus_Ablennes,genus_Abronia,genus_Abrus,...,park_name_Yellowstone,park_name_Yosemite,park_name_Zion,conservation_status_Endangered,conservation_status_In Recovery,conservation_status_Least Concern,conservation_status_Species of Concern,conservation_status_Threatened,nativeness_Native,nativeness_Not Native
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [8]:
# Merge one-hot encoded features (row-aligned on the index) and drop the originals
park_df = park_df.merge(encode_df, left_index=True, right_index=True)

# Use the `columns=` keyword: passing the axis positionally (`drop(park_cat, 1)`) was
# deprecated in pandas 1.3 and raises a TypeError in pandas 2.x.
park_df = park_df.drop(columns=park_cat)
park_df.head()

Unnamed: 0,genus_Abaeis,genus_Abelia,genus_Abgrallaspis,genus_Abies,genus_Abietinella,genus_Abildgaardia,genus_Ablabesmyia,genus_Ablennes,genus_Abronia,genus_Abrus,...,park_name_Yellowstone,park_name_Yosemite,park_name_Zion,conservation_status_Endangered,conservation_status_In Recovery,conservation_status_Least Concern,conservation_status_Species of Concern,conservation_status_Threatened,nativeness_Native,nativeness_Not Native
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Training and Testing datasets

In [None]:
# National Parks contain both native and non-native biodiversity. We want to predict the 'nativeness' of these species. As such,
# our target and features are first established and then split into X_train, X_test, y_train, y_test, where X holds the encoded
# features and y is the nativeness label (1 = Native, 0 = Not Native). Our model will first be trained on the training data, and
# then its accuracy in predicting the nativeness of species will be measured on the test data.

In [14]:
# Separate the target from the features: y is the "nativeness_Native" indicator column,
# X is every remaining column once both nativeness indicators are removed
target_columns = ["nativeness_Native", "nativeness_Not Native"]
y = park_df["nativeness_Native"]
X = park_df.drop(columns=target_columns)

# Hold out a test set, stratified on y so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Learn the scaling statistics from the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Random Forest Model

In [None]:
# We will run our Random Forest model and measure how accurately it predicts the nativeness of species in our dataset.

In [15]:
# Build a 128-tree random forest with a fixed seed and train it on the scaled training data
# (fit() returns the fitted estimator, so construction and fitting can be chained)
rf_model = RandomForestClassifier(random_state=78, n_estimators=128).fit(X_train_scaled, y_train)

# Score the model on the held-out test set
y_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f" Random forest predictive accuracy: {rf_accuracy:.3f}")

 Random forest predictive accuracy: 0.916


In [None]:
# Additionally, we will build a Deep Neural Network to further substantiate our learned 'evidence'

In [16]:
# Fix TensorFlow's global seed so that weight initialisation and batch shuffling are
# repeatable between runs; without it the reported accuracy changes on every execution.
tf.random.set_seed(42)

# Define the model - deep neural net
# One input per scaled feature column
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 4

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
653/653 - 1s - loss: 0.2821 - accuracy: 0.9125 - 1s/epoch - 2ms/step
Loss: 0.2821284234523773, Accuracy: 0.912458062171936


In [None]:
# As we can see here, the accuracy of our Deep Neural Network is not much different from that of our Random Forest model.
# We conclude that both models predict the test data with an accuracy in the low 90s (percent). The network did not appear to be
# overfit, as we tested with various numbers of hidden nodes and came to similar conclusions.