In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func
from config import db_password
import psycopg2
from sklearn.preprocessing import StandardScaler


In [3]:
Base = automap_base()
engine = create_engine(f"postgres://caznoe:{db_password}@aves.cos0wnwxlodh.us-east-2.rds.amazonaws.com:5432/aves")

Base.prepare(engine, reflect=True)
# print(engine.table_names())

In [4]:
bird_df = pd.read_sql("SELECT * FROM bird_a_w_p", engine)

bird_df.head()


Unnamed: 0,county_name,species,date,aqi,category,defining_parameter,heavy_rain,high_wind,year,population
0,Kern,Calypte anna,2018-04-16,46,Good,Ozone,,1,2018,905801
1,Kern,Calypte anna,2018-04-16,46,Good,Ozone,,1,2018,905801
2,Kern,Calypte anna,2018-04-16,46,Good,Ozone,,1,2018,905801
3,Kern,Calypte anna,2018-04-16,46,Good,Ozone,,1,2018,905801
4,Kern,Calypte anna,2018-04-16,46,Good,Ozone,,1,2018,905801


## Cleaning

In [None]:
bird_df.heavy_rain.unique()

In [None]:
bird_df.high_wind.unique()

In [None]:
bird_df.category.unique()

In [5]:
bird_df.replace(to_replace=[None], value="0", inplace=True)


In [None]:
bird_df.head()

In [None]:
# One way to encode hummingbird species
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df2 = bird_df.copy()
# df2['species'] = le.fit_transform(df2['species'])
# df2.head()
# df2.species.unique()

In [6]:
species_num = {
   "Calypte anna": 1,
   "Selasphorus sasin": 2,
   "Calypte costae": 3,
   "Selasphorus rufus": 4,
   "Archilochus alexandri": 5,
   "Selasphorus calliope": 6,
   "Cynanthus latirostris": 7,
   "Selasphorus platycercus": 8,
   "Archilochus colubris": 9,
   "Amazilia violiceps": 10
}

In [7]:
bird_df["species_num"] = bird_df["species"].apply(lambda x: species_num[x])


In [8]:
bird_df = bird_df.drop(["species"], axis=1)
# bird_df.head()

In [None]:
bird_df.species.unique()

In [None]:
bird_df.dtypes

In [None]:
bird_df['population'] = pd.to_numeric(bird_df['population'])


In [9]:
# Preparing for modeling
bird_binary_encoded = pd.get_dummies(bird_df, columns=["defining_parameter"])
# bird_binary_encoded.head()

In [10]:
df = bird_binary_encoded.drop(["category", "county_name","date"], axis=1)
df.head()

Unnamed: 0,aqi,heavy_rain,high_wind,year,population,species_num,defining_parameter_CO,defining_parameter_NO2,defining_parameter_Ozone,defining_parameter_PM10,defining_parameter_PM2.5,defining_parameter_SO2
0,46,0,1,2018,905801,1,0,0,1,0,0,0
1,46,0,1,2018,905801,1,0,0,1,0,0,0
2,46,0,1,2018,905801,1,0,0,1,0,0,0
3,46,0,1,2018,905801,1,0,0,1,0,0,0
4,46,0,1,2018,905801,1,0,0,1,0,0,0


In [11]:
df_np=df.drop(["population"], axis=1)

In [None]:
df.isnull().values.any()

In [None]:
df.sum()

In [None]:
df.isnull().sum()

In [None]:
# Scale 
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [None]:
df_scaled = data_scaler.fit_transform(df)


In [None]:
df_scaled[:5]


# Stats Summary

In [None]:
df.describe()

In [None]:
df_scaled=np.array(df_scaled)

In [12]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Create training and test sets

In [13]:
# Remove bird observation outcome target from features data
y = df_np.species_num
X = df_np.drop(columns="species_num")

# Split training/test datasets
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Apply scaling

In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build Models 

In [16]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

print('Accuracy of logistic regression on training', logreg.score(X_train_scaled, y_train))
print('Accuracy of logistic regression on testing', logreg.score(X_test_scaled, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy of logistic regression on training 0.7179102387840333
Accuracy of logistic regression on testing 0.717754705163088


In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_scaled, y_train)

print('Accuracy of Decision tree on training', dt.score(X_train_scaled, y_train))
print('Accuracy of Decision tree on testing', dt.score(X_test_scaled, y_test))

In [None]:
# Setting max decision tree depth to help avoid overfitting
dt2 = DecisionTreeClassifier(max_depth=3)
dt2.fit(X_train_scaled, y_train)
print('Accuracy of Decision tree on training', dt2.score(X_train_scaled, y_train))
print('Accuracy of Decision tree on testing', dt2.score(X_test_scaled, y_test))

In [None]:
# K-nearest neighbor 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
print('Accuracy of Knn on training', knn.score(X_train_scaled, y_train))
print('Accuracy of Knn on testing', knn.score(X_test_scaled, y_test))

In [None]:
# Linear Discriminant Analysis 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled,y_train)
print('Accuracy of Knn on training', lda.score(X_train_scaled, y_train))
print('Accuracy of Knn on testing', lda.score(X_test_scaled, y_test))

In [None]:
# Gaussian Naive Bayes 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)
print('Accuracy of GNB on training', gnb.score(X_train_scaled, y_train))
print('Accuracy of GNB on testing', gnb.score(X_test_scaled, y_test))

In [None]:
# Support Vector Machine
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train_scaled, y_train)
print('Accuracy of SVM on training', svm.score(X_train_scaled, y_train))
print('Accuracy of SVM on testing', svm.score(X_test_scaled, y_test))

In [None]:
# save model
    nn.save(file_name)
    # import model back in
    nn_imported = tensorflow.keras.models.load_model(file_name)