# Applying ML methods to Records objects

In [25]:
# conda install folium

In [3]:
import records
import toyplot
import folium
import pandas as pd
import numpy as np


### Load data and filter it
The `Epochs` call is commented out below; instead we use the `load_epochs_from_csv()` function, which is faster when the data has already been downloaded. Loading raises a warning (which is not an error) that recommends a change to our code in Records that would avoid it: when reading a dataframe this big, the reader wants you to tell it what the datatype of each column will be so it can load faster. We'll skip that for now.

In [6]:
# Load the already-downloaded Bombus records from CSV rather than querying.
# The original query is kept here for reference:
#   ep = records.Epochs("Bombus", 1900, 2010, 10)
# NOTE(review): this is an absolute, machine-specific path -- adjust as needed.
ep = records.load_epochs_from_csv("/home/deren/PDSB/records/data/Bombus-1900-2015-df.csv")

  if self.run_code(code, result):


### The data set 
Select just the columns we plan to analyze. 

In [7]:
# Columns to analyze, mapped from their names in the records table to the
# shorter names used in the rest of the notebook.
renames = {
    "specificEpithet": "species",
    "decimalLatitude": "lat",
    "decimalLongitude": "long",
    "stateProvince": "state",
    "year": "year",
    "elevation": "elevation",
}
cols = list(renames)

# subset to those columns and apply the short names
data = ep.df[cols].rename(columns=renames)
data.head(10)

Unnamed: 0,species,lat,long,state,year,elevation
0,fervidus,38.7775,-95.1875,Kansas,1900,
1,fervidus,38.7775,-95.1875,Kansas,1900,
2,vosnesenskii,37.4274,-122.1698,California,1900,
3,affinis,43.39812,-76.47744,New York,1900,
4,fervidus,38.7775,-95.1875,Kansas,1900,
5,pensylvanicus,41.0465,-100.747,Nebraska,1900,
6,fervidus,38.7775,-95.1875,Kansas,1900,
7,variabilis,39.27972,-89.88167,Illinois,1900,
8,fervidus,38.7775,-95.1875,Kansas,1900,
9,sylvicola,64.50111,-165.40639,Alaska,1900,


### Filter for NaN and remove taxa that occurred rarely

In [8]:
# drop every row that has a missing value in any column
df = data.dropna()

# keep only species that still have more than 500 records
spc = df.species.value_counts()
df = df[df.species.map(spc) > 500]

# shuffle row order; the fixed seed keeps the shuffle (and therefore the
# seeded train/test split further down) identical across reruns
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [9]:
# number of distinct species left after filtering (20 in the run shown)
df["species"].unique().shape

(20,)

In [10]:
# preview the first five rows of the filtered, shuffled table
df.head(n=5)

Unnamed: 0,species,lat,long,state,year,elevation
0,bifarius,64.8585,-147.8177,Alaska,2010,153.0
1,huntii,37.5913,-112.2575,Utah,2008,2393.0
2,flavifrons,37.6423,-105.222,Colorado,1976,2743.0
3,bifarius,46.20903,-117.77352,Washington,1955,1524.0
4,balteatus,39.010269,-107.042545,Colorado,1970,3200.0


### Split data into features and labels

In [11]:
# features: position and elevation; labels: species name
X = df.loc[:, ["lat", "long", "elevation"]]
y = df.species

X.shape

(41459, 3)

### Fit a GaussianNB model

In [12]:
# hold out part of the records so the model can be scored on data it was
# not fitted to; random_state pins the split
from sklearn.model_selection import train_test_split
(Xtrain, Xtest,
 ytrain, ytest) = train_test_split(X, y, random_state=1)

In [13]:
from sklearn.naive_bayes import GaussianNB

# fit on the training split, then predict species for the held-out rows
model = GaussianNB().fit(Xtrain, ytrain)
y_model = model.predict(Xtest)

In [14]:
from sklearn.metrics import accuracy_score

# fraction of held-out records whose predicted species matches the true one
accuracy_score(y_true=ytest, y_pred=y_model)

0.22643511818620357

In [15]:
# gather predictions, true labels and every test-set feature column
# into a single table
dat = dict(inferred=y_model, actual=ytest)
dat.update({name: Xtest[name] for name in Xtest.keys()})
pdf = pd.DataFrame(dat)

In [28]:
# compare inferred and actual species for the first ten test records
pdf.head(n=10)

Unnamed: 0,actual,elevation,inferred,lat,long
36397,jonellus,701.04,jonellus,64.883333,-148.05
24449,vosnesenskii,991.0,vosnesenskii,40.87766,-121.56048
4528,bifarius,393.0,jonellus,63.9331,-145.3575
40069,centralis,2822.0,bifarius,35.33116,-111.71185
11725,bifarius,1377.0,vosnesenskii,45.32514,-121.63603
8924,frigidus,136.0,jonellus,64.84903,-147.82957
40892,sylvicola,193.0,jonellus,65.3463,-148.2874
37675,flavifrons,153.0,jonellus,64.8585,-147.8177
33824,occidentalis,1768.0,huntii,40.094,-105.347
10008,rufocinctus,2079.0,bifarius,45.0618,-111.2555


### Examine confusion matrix
This tells us which taxa were most often mixed up in predictions. 

In [18]:
from sklearn.metrics import confusion_matrix

# cross-tabulate true labels (first argument) against predicted labels
# (second argument); both axes follow the order of ytest.unique(), and the
# counts are cast to float
mat = confusion_matrix(ytest, y_model, labels=ytest.unique()).astype(float)

In [19]:
# divide every row by its own total so each row sums to 1: each entry is
# the share of that row's records falling in the given column
props = mat / mat.sum(axis=1, keepdims=True)

In [30]:
# put one species name beside each row of the matrix (left-hand labels)
llocator = toyplot.locator.Explicit(range(len(props)), ytest.unique())
toyplot.matrix(props, llocator=llocator, tlabel="Confusion matrix");

### Examine distribution plots of species

In [22]:
def _add_markers(fmap, rows, popup, maxrecords, icon_color=None):
    """Add a marker to `fmap` for each of the first `maxrecords` rows of `rows`."""
    # clamp at zero so a negative limit draws nothing, as range() would
    limit = max(0, min(rows.shape[0], maxrecords))
    subset = rows.iloc[:limit]
    # walk the two coordinate columns together instead of rebuilding a row
    # Series with .iloc[i] for every marker
    for lat, lon in zip(subset["lat"], subset["long"]):
        kwargs = {"popup": popup, "location": (lat, lon)}
        if icon_color is not None:
            # every marker gets its own Icon instance
            kwargs["icon"] = folium.Icon(color=icon_color)
        folium.Marker(**kwargs).add_to(fmap)


def species_map(name, df, maxrecords=100):
    """
    Map where a species was inferred versus where it was actually recorded.

    Parameters
    ----------
    name : str
        Species name to match against the `inferred` and `actual` columns.
    df : pandas.DataFrame
        Table with `inferred`, `actual`, `lat` and `long` columns.
    maxrecords : int
        Upper limit on the number of markers drawn for each of the two groups.

    Returns
    -------
    folium.Map
        Map carrying default-style markers for rows inferred to be `name`
        and red markers for rows whose actual species is `name`.
    """
    m = folium.Map(
        location=[50, -110],
        zoom_start=2)

    # rows the model assigned to this species
    _add_markers(
        m, df[df.inferred == name], "inferred to be {}".format(name), maxrecords)

    # rows whose recorded species is this one, drawn in red
    _add_markers(
        m, df[df.actual == name], "true {}".format(name), maxrecords,
        icon_color='red')
    return m

In [27]:
# map up to 100 inferred and 100 true records of vosnesenskii
species_map("vosnesenskii", df=pdf, maxrecords=100)