# Basic modelling module

In [1]:
# Render matplotlib figures inline, as part of this notebook's cell output
%matplotlib inline

# Importing modules

In [2]:
import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML
from bin.model import *
from sklearn.pipeline import Pipeline
from bin.conf import PREDICTOR_LOADERS
from bin.loader import get_predictor_data
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Constants definitions

In [3]:
# --- Source data location and format ---------------------------------------
SOURCE_DATA_PATH = './data'  # relative (or absolute) path to the data directory
CSV_SEPARATOR = r';'  # separator used in csv data files
DATA_FILE_NAMES = ['broad_leaf_GBIF.csv',  # all data files should be in the same format
                   'широколиственные.csv',
                   # 'broad_leaved_herbarium_and_observation.csv',
                   'GBIF_coordinate.csv'
                   ]

# Only these columns will be retained for computations
ALLOWED_COLUMNS = ['species', 'latitude', 'longitude']
# Column dtypes handed to pandas.read_csv; must have the same length (and order)
# as ALLOWED_COLUMNS. The builtin `str` is used instead of `np.str`: the latter
# was only an alias of `str`, was deprecated in NumPy 1.20 and removed in 1.24.
COLUMNS_DTYPES = [str, np.float64, np.float64]

# Species to build models for; all species should be given in lowercase format,
# because the loaded species names are lowercased before matching.
MODEL_SPECIES = ['quercus',
                 'quercus mongolica',
                 'quercus crispula',
                 'kalopanax septemlobus',
                 'fraxinus mandshurica']

# Initial set of predictor variables (see conf.py: PREDICTOR_LOADERS parameter for details).
# Built as one tuple: numbered families are expanded from their prefix and index range,
# the fixed-name predictors are listed literally. Order matters: it defines the column
# order of the feature matrix and is used to map selection masks back to names.
VARIABLE_SET = (
    tuple('BIO%d' % idx for idx in range(1, 20))             # BIO1 .. BIO19
    + ('PWKI0', 'PCKI0', 'IT', 'IC', 'TMINM', 'TMAXM')
    + tuple('PREC%d' % month for month in range(1, 13))      # PREC1 .. PREC12
    + tuple('TAVG%d' % month for month in range(1, 13))      # TAVG1 .. TAVG12
    + tuple('WKI%d' % idx for idx in range(2, 7))            # WKI2 .. WKI6
    + tuple('CKI%d' % idx for idx in range(2, 7))            # CKI2 .. CKI6
)
# Candidate classifiers as (report label, estimator) pairs. The main loop wraps each
# estimator in a scaling pipeline, runs recursive feature elimination on it and
# reports the best-scoring label per species.
CLASSIFIERS = [ ('tree', DecisionTreeClassifier(random_state=10)),
                ('MaxEnt', LogisticRegression()),
                ('SVM', SVC(kernel='linear'))
                #('LDA', LinearDiscriminantAnalysis())
              ]
# Number of stratified cross-validation folds used during feature elimination
KFOLDS_NUMBER = 20
# Passed as `density` to FillPseudoAbsenceData; its exact meaning is defined in
# bin.model (not visible from this notebook) -- TODO confirm units
PSEUDO_ABSENCE_DENSITY = 1

# Source data loading and preprocessing

In [4]:
# Load every occurrence file, keep only ALLOWED_COLUMNS and stack the results.
# Frames are collected in a list and concatenated once, instead of growing an
# accumulator frame inside the loop.
column_dtypes = dict(zip(ALLOWED_COLUMNS, COLUMNS_DTYPES))
loaded_frames = []
for filename in DATA_FILE_NAMES:
    try:
        # data loading procedure
        data = pd.read_csv(os.path.join(SOURCE_DATA_PATH, filename),
                           sep=CSV_SEPARATOR, dtype=column_dtypes)
    except IOError:
        print("Couldn't read the file %s." % filename)
        # Skip this file: without the `continue`, `data` would be undefined on
        # the first iteration (NameError) or still hold the previous file, which
        # would then be appended a second time.
        continue
    print('The file %s succesfully loaded.' % filename)
    print('File overview:')
    data.info()
    print('='*50)
    loaded_frames.append(data[ALLOWED_COLUMNS])

if loaded_frames:
    presence_raw = pd.concat(loaded_frames, ignore_index=True)
else:
    # Nothing could be read: fall back to an empty frame with the expected columns
    presence_raw = pd.DataFrame(columns=ALLOWED_COLUMNS)

# Make species names lowercased and stripped. The vectorised `.str` accessor
# leaves missing names as NaN (they are dropped below), whereas
# `.apply(str.lower)` raises TypeError on a NaN value.
presence_raw = presence_raw.assign(
    species=presence_raw['species'].str.lower().str.strip()
)

display(HTML('<h3>Original size: %s</h3>'%presence_raw['species'].size))

# Remove rows with missing values, then rows duplicated over ALLOWED_COLUMNS
original_presence_data = (
    presence_raw
    .dropna()
    .drop_duplicates(ALLOWED_COLUMNS)
    .reset_index(drop=True)
)
display(HTML('<h3>The size after duplications removal: %s</h3>'%original_presence_data['species'].size))


The file broad_leaf_GBIF.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036 entries, 0 to 3035
Data columns (total 4 columns):
species        3036 non-null object
countrycode    3036 non-null object
latitude       3034 non-null float64
longitude      3034 non-null float64
dtypes: float64(2), object(2)
memory usage: 95.0+ KB
The file широколиственные.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 3 columns):
species      605 non-null object
latitude     604 non-null float64
longitude    604 non-null float64
dtypes: float64(2), object(1)
memory usage: 14.3+ KB
The file GBIF_coordinate.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2443 entries, 0 to 2442
Data columns (total 3 columns):
species      2443 non-null object
latitude     2437 non-null float64
longitude    2437 non-null float64
dtypes: float64(2), object(1)
memo

## Initial dataset overview

In [5]:
# Structural summary of the cleaned presence data (row count, columns, dtypes)
display(HTML('<h3>General info:</h3>'))
original_presence_data.info()
# Number of records per species; as the last expression of the cell, the
# resulting Series is rendered as the cell output
display(HTML('<h3>Species occurences overview:</h3>'))
original_presence_data['species'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 3 columns):
latitude     2410 non-null float64
longitude    2410 non-null float64
species      2410 non-null object
dtypes: float64(2), object(1)
memory usage: 56.6+ KB


quercus mongolica         329
carpinus cordata          307
kalopanax septemlobus     303
juglans mandshurica       299
fraxinus lanuginosa       274
quercus crispula          163
phellodendron amurense    152
ulmus davidiana           135
ulmus laciniata           104
acer mono                  78
acer pictum                75
pinus koraiensis           51
fraxinus mandshurica       40
fraxinus rhynchophylla     34
juglans ailanthifolia      27
tilia amurensis            23
abies holophylla           16
Name: species, dtype: int64

# MAIN LOOP OVER ALL SPECIES

In [6]:

# Seed for the shuffled cross-validation folds, so that repeated runs score the
# classifiers on the same splits (same value as the decision tree's random_state).
# NOTE(review): the pseudo-absence generation step may still be random -- confirm
# in bin.model if fully reproducible results are required.
CV_RANDOM_STATE = 10

# Predictor names as an array, so boolean selection masks can index it directly
variable_names = np.array(VARIABLE_SET)


def _select_features(estimator, X, y, scoring):
    """Run cross-validated recursive feature elimination for one scoring metric.

    Parameters
    ----------
    estimator : the estimator handed to RFECV (here: scaler + classifier pipeline).
    X : 2-D array of predictor values, columns ordered as VARIABLE_SET.
    y : sequence of 0/1 class labels.
    scoring : scikit-learn scoring name, e.g. 'accuracy' or 'roc_auc'.

    Returns
    -------
    (support_mask, best_score) : the boolean mask of the selected predictors and
    the best mean cross-validation score over all tested feature-subset sizes.
    """
    selector = RFECV(estimator=estimator, step=1,
                     cv=StratifiedKFold(KFOLDS_NUMBER, shuffle=True,
                                        random_state=CV_RANDOM_STATE),
                     scoring=scoring)
    selector.fit(X, y)
    if hasattr(selector, 'cv_results_'):
        # scikit-learn >= 1.0: `grid_scores_` is deprecated (and 2-D) there and
        # removed in 1.2; the per-subset mean scores live in `cv_results_`.
        best_score = np.max(selector.cv_results_['mean_test_score'])
    else:
        # Older scikit-learn: `grid_scores_` holds one mean score per subset size
        best_score = np.max(selector.grid_scores_)
    return selector.support_, best_score


for species in MODEL_SPECIES:
    display(HTML('<h5>=============== %s ======================</h5>' % species))
    # Per-classifier results as (label, best score, estimator, selection mask)
    classifier_stats_acc, classifier_stats_auc = [], []

    # Dataset construction: keep the species' records, prune suspicious
    # coordinates, add pseudo-absence points and attach the predictor values
    model = Pipeline([('select_species', SelectSpecies(species)),
                      ('prune_suspicious', PruneSuspiciousCoords()),
                      ('ps_absence', FillPseudoAbsenceData(density=PSEUDO_ABSENCE_DENSITY)),
                      ('fill_env', FillEnvironmentalData(VARIABLE_SET))]
                     )
    print("Constructing the dataset...")
    aux_result = model.fit_transform(original_presence_data)
    X = aux_result[list(VARIABLE_SET)].values
    # Class label: 1 for presence records, 0 for pseudo-absence records
    y = list(map(int, ~aux_result.absence))
    print("Dataset is formed.")
    aux_result.info()

    for name, clf in CLASSIFIERS:
        std_clf = TweakedPipeline([('scaler', StandardScaler()),
                                   ('classificator', clf)])
        # TODO (carried over from the original author): not yet completed!
        print("Preforming recursive feature ellimination for the <%s> classifier..." % name)
        acc_mask, acc_score = _select_features(std_clf, X, y, 'accuracy')
        print("=" * 20, "accuracy case", "=" * 20)
        print("Optimal composition of variables: ", variable_names[acc_mask])
        print("Classification score (acc): ", acc_score)
        print("=" * 60)
        print("=" * 20, "roc/auc case", "=" * 20)
        auc_mask, auc_score = _select_features(std_clf, X, y, 'roc_auc')
        print("Optimal composition of variables: ", variable_names[auc_mask])
        print("Classification score (auc): ", auc_score)
        print("=" * 60)
        print("The following predictors composition was selected: ",  variable_names[acc_mask])
        classifier_stats_acc.append((name, acc_score, clf, acc_mask))
        classifier_stats_auc.append((name, auc_score, clf, auc_mask))

    # Best classifier per metric; on ties the first one in CLASSIFIERS order wins
    acc_optimal_name, acc_optimal_score, acc_optimal_clf, acc_optimal_mask = max(
        classifier_stats_acc, key=lambda entry: entry[1])
    auc_optimal_name, auc_optimal_score, auc_optimal_clf, auc_optimal_mask = max(
        classifier_stats_auc, key=lambda entry: entry[1])

    display(HTML('<h5> --------------- Summary for %s: --------------- </h5>' % species))
    print("The best classifier is %s. Its accuracy score is %s." % (acc_optimal_name, acc_optimal_score))
    print("Optimal predictor set (acc): ",  variable_names[acc_optimal_mask])
    print("The best classifier is %s. Its roc/auc score is %s." % (auc_optimal_name, auc_optimal_score))
    print("Optimal predictor set (auc): ",  variable_names[auc_optimal_mask])
    display(HTML('<h5> %s </h5>' % ("~" * 90,)))


Constructing the dataset...


  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]
  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  result[vals < t] = result[vals < t] + vals[vals < t] - t


Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4474 entries, 0 to 4473
Data columns (total 63 columns):
absence      4474 non-null bool
latitude     4474 non-null float64
longitude    4474 non-null float64
species      4474 non-null object
BIO1         4474 non-null float64
BIO2         4474 non-null float64
BIO3         4474 non-null float64
BIO4         4474 non-null float64
BIO5         4474 non-null float64
BIO6         4474 non-null float64
BIO7         4474 non-null float64
BIO8         4474 non-null float64
BIO9         4474 non-null float64
BIO10        4474 non-null float64
BIO11        4474 non-null float64
BIO12        4474 non-null float64
BIO13        4474 non-null float64
BIO14        4474 non-null float64
BIO15        4474 non-null float64
BIO16        4474 non-null float64
BIO17        4474 non-null float64
BIO18        4474 non-null float64
BIO19        4474 non-null float64
PWKI0        4474 non-null float64
PCKI0        4474 non-null float64
IT 

The best classifier is MaxEnt. Its accuracy score is 0.9995535714285715.
Optimal predictor set (acc):  ['BIO15' 'BIO18' 'PWKI0' 'PCKI0' 'PREC5' 'PREC7' 'TAVG8']
The best classifier is MaxEnt. Its roc/auc score is 1.0.
Optimal predictor set (auc):  ['BIO3' 'BIO12' 'BIO15' 'BIO16' 'BIO18' 'PWKI0' 'PCKI0' 'PREC4' 'PREC5'
 'PREC6' 'PREC7' 'PREC9' 'TAVG8' 'TAVG9' 'TAVG10']


Constructing the dataset...


  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]
  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  result[vals < t] = result[vals < t] + vals[vals < t] - t


Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4248 entries, 0 to 4247
Data columns (total 63 columns):
absence      4248 non-null bool
latitude     4248 non-null float64
longitude    4248 non-null float64
species      4248 non-null object
BIO1         4248 non-null float64
BIO2         4248 non-null float64
BIO3         4248 non-null float64
BIO4         4248 non-null float64
BIO5         4248 non-null float64
BIO6         4248 non-null float64
BIO7         4248 non-null float64
BIO8         4248 non-null float64
BIO9         4248 non-null float64
BIO10        4248 non-null float64
BIO11        4248 non-null float64
BIO12        4248 non-null float64
BIO13        4248 non-null float64
BIO14        4248 non-null float64
BIO15        4248 non-null float64
BIO16        4248 non-null float64
BIO17        4248 non-null float64
BIO18        4248 non-null float64
BIO19        4248 non-null float64
PWKI0        4248 non-null float64
PCKI0        4248 non-null float64
IT 

The best classifier is SVM. Its accuracy score is 0.9995294091593585.
Optimal predictor set (acc):  ['PCKI0' 'PREC5' 'PREC7' 'PREC12' 'TAVG9']
The best classifier is SVM. Its roc/auc score is 0.999984056122449.
Optimal predictor set (auc):  ['BIO9' 'BIO16' 'PWKI0' 'PCKI0' 'PREC1' 'PREC5' 'PREC6' 'PREC7' 'PREC8'
 'PREC12' 'TAVG1' 'TAVG6' 'TAVG8' 'TAVG9' 'TAVG10']


Constructing the dataset...


  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]
  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  result[vals < t] = result[vals < t] + vals[vals < t] - t


Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4149 entries, 0 to 4148
Data columns (total 63 columns):
absence      4149 non-null bool
latitude     4149 non-null float64
longitude    4149 non-null float64
species      4149 non-null object
BIO1         4149 non-null float64
BIO2         4149 non-null float64
BIO3         4149 non-null float64
BIO4         4149 non-null float64
BIO5         4149 non-null float64
BIO6         4149 non-null float64
BIO7         4149 non-null float64
BIO8         4149 non-null float64
BIO9         4149 non-null float64
BIO10        4149 non-null float64
BIO11        4149 non-null float64
BIO12        4149 non-null float64
BIO13        4149 non-null float64
BIO14        4149 non-null float64
BIO15        4149 non-null float64
BIO16        4149 non-null float64
BIO17        4149 non-null float64
BIO18        4149 non-null float64
BIO19        4149 non-null float64
PWKI0        4149 non-null float64
PCKI0        4149 non-null float64
IT 

The best classifier is SVM. Its accuracy score is 1.0.
Optimal predictor set (acc):  ['BIO1' 'BIO2' 'BIO3' 'BIO4' 'BIO5' 'BIO6' 'BIO7' 'BIO8' 'BIO9' 'BIO10'
 'BIO11' 'BIO12' 'BIO13' 'BIO14' 'BIO15' 'BIO16' 'BIO17' 'BIO18' 'BIO19'
 'PWKI0' 'PCKI0' 'IT' 'IC' 'TMINM' 'TMAXM' 'PREC1' 'PREC2' 'PREC3' 'PREC4'
 'PREC5' 'PREC6' 'PREC7' 'PREC8' 'PREC9' 'PREC10' 'PREC11' 'PREC12'
 'TAVG1' 'TAVG2' 'TAVG3' 'TAVG4' 'TAVG5' 'TAVG6' 'TAVG7' 'TAVG8' 'TAVG9'
 'TAVG10' 'TAVG11' 'TAVG12' 'WKI2' 'WKI3' 'WKI4' 'WKI5' 'WKI6' 'CKI2'
 'CKI3' 'CKI4' 'CKI5' 'CKI6']
The best classifier is MaxEnt. Its roc/auc score is 1.0.
Optimal predictor set (auc):  ['BIO1' 'BIO2' 'BIO3' 'BIO4' 'BIO5' 'BIO6' 'BIO7' 'BIO8' 'BIO9' 'BIO10'
 'BIO11' 'BIO12' 'BIO13' 'BIO14' 'BIO15' 'BIO16' 'BIO17' 'BIO18' 'BIO19'
 'PWKI0' 'PCKI0' 'IT' 'IC' 'TMINM' 'TMAXM' 'PREC1' 'PREC2' 'PREC3' 'PREC4'
 'PREC5' 'PREC6' 'PREC7' 'PREC8' 'PREC9' 'PREC10' 'PREC11' 'PREC12'
 'TAVG1' 'TAVG2' 'TAVG3' 'TAVG4' 'TAVG5' 'TAVG6' 'TAVG7' 'TAVG8' 'TAVG9'
 'TAVG

Constructing the dataset...


  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]
  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  result[vals < t] = result[vals < t] + vals[vals < t] - t


Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4252 entries, 0 to 4251
Data columns (total 63 columns):
absence      4252 non-null bool
latitude     4252 non-null float64
longitude    4252 non-null float64
species      4252 non-null object
BIO1         4252 non-null float64
BIO2         4252 non-null float64
BIO3         4252 non-null float64
BIO4         4252 non-null float64
BIO5         4252 non-null float64
BIO6         4252 non-null float64
BIO7         4252 non-null float64
BIO8         4252 non-null float64
BIO9         4252 non-null float64
BIO10        4252 non-null float64
BIO11        4252 non-null float64
BIO12        4252 non-null float64
BIO13        4252 non-null float64
BIO14        4252 non-null float64
BIO15        4252 non-null float64
BIO16        4252 non-null float64
BIO17        4252 non-null float64
BIO18        4252 non-null float64
BIO19        4252 non-null float64
PWKI0        4252 non-null float64
PCKI0        4252 non-null float64
IT 

The best classifier is MaxEnt. Its accuracy score is 1.0.
Optimal predictor set (acc):  ['BIO1' 'BIO2' 'BIO3' 'BIO4' 'BIO5' 'BIO6' 'BIO7' 'BIO8' 'BIO9' 'BIO10'
 'BIO11' 'BIO12' 'BIO13' 'BIO14' 'BIO15' 'BIO16' 'BIO17' 'BIO18' 'BIO19'
 'PWKI0' 'PCKI0' 'IT' 'IC' 'TMINM' 'TMAXM' 'PREC1' 'PREC2' 'PREC3' 'PREC4'
 'PREC5' 'PREC6' 'PREC7' 'PREC8' 'PREC9' 'PREC10' 'PREC11' 'PREC12'
 'TAVG1' 'TAVG2' 'TAVG3' 'TAVG4' 'TAVG5' 'TAVG6' 'TAVG7' 'TAVG8' 'TAVG9'
 'TAVG10' 'TAVG11' 'TAVG12' 'WKI2' 'WKI3' 'WKI4' 'WKI5' 'WKI6' 'CKI2'
 'CKI3' 'CKI4' 'CKI5' 'CKI6']
The best classifier is MaxEnt. Its roc/auc score is 1.0.
Optimal predictor set (auc):  ['BIO1' 'BIO2' 'BIO3' 'BIO4' 'BIO5' 'BIO6' 'BIO7' 'BIO8' 'BIO9' 'BIO10'
 'BIO11' 'BIO12' 'BIO13' 'BIO14' 'BIO15' 'BIO16' 'BIO17' 'BIO18' 'BIO19'
 'PWKI0' 'PCKI0' 'IT' 'IC' 'TMINM' 'TMAXM' 'PREC1' 'PREC2' 'PREC3' 'PREC4'
 'PREC5' 'PREC6' 'PREC7' 'PREC8' 'PREC9' 'PREC10' 'PREC11' 'PREC12'
 'TAVG1' 'TAVG2' 'TAVG3' 'TAVG4' 'TAVG5' 'TAVG6' 'TAVG7' 'TAVG8' 'TAVG9'
 'T

Constructing the dataset...


  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]
  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  result[vals < t] = result[vals < t] + vals[vals < t] - t


Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4026 entries, 0 to 4025
Data columns (total 63 columns):
absence      4026 non-null bool
latitude     4026 non-null float64
longitude    4026 non-null float64
species      4026 non-null object
BIO1         4026 non-null float64
BIO2         4026 non-null float64
BIO3         4026 non-null float64
BIO4         4026 non-null float64
BIO5         4026 non-null float64
BIO6         4026 non-null float64
BIO7         4026 non-null float64
BIO8         4026 non-null float64
BIO9         4026 non-null float64
BIO10        4026 non-null float64
BIO11        4026 non-null float64
BIO12        4026 non-null float64
BIO13        4026 non-null float64
BIO14        4026 non-null float64
BIO15        4026 non-null float64
BIO16        4026 non-null float64
BIO17        4026 non-null float64
BIO18        4026 non-null float64
BIO19        4026 non-null float64
PWKI0        4026 non-null float64
PCKI0        4026 non-null float64
IT 

The best classifier is SVM. Its accuracy score is 1.0.
Optimal predictor set (acc):  ['BIO1' 'BIO2' 'BIO3' 'BIO4' 'BIO5' 'BIO6' 'BIO7' 'BIO8' 'BIO9' 'BIO10'
 'BIO11' 'BIO12' 'BIO13' 'BIO14' 'BIO15' 'BIO16' 'BIO17' 'BIO18' 'BIO19'
 'PWKI0' 'PCKI0' 'IT' 'IC' 'TMINM' 'TMAXM' 'PREC1' 'PREC2' 'PREC3' 'PREC4'
 'PREC5' 'PREC6' 'PREC7' 'PREC8' 'PREC9' 'PREC10' 'PREC11' 'PREC12'
 'TAVG1' 'TAVG2' 'TAVG3' 'TAVG4' 'TAVG5' 'TAVG6' 'TAVG7' 'TAVG8' 'TAVG9'
 'TAVG10' 'TAVG11' 'TAVG12' 'WKI2' 'WKI3' 'WKI4' 'WKI5' 'WKI6' 'CKI2'
 'CKI3' 'CKI4' 'CKI5' 'CKI6']
The best classifier is MaxEnt. Its roc/auc score is 1.0.
Optimal predictor set (auc):  ['BIO1' 'BIO2' 'BIO3' 'BIO4' 'BIO5' 'BIO6' 'BIO7' 'BIO8' 'BIO9' 'BIO10'
 'BIO11' 'BIO12' 'BIO13' 'BIO14' 'BIO15' 'BIO16' 'BIO17' 'BIO18' 'BIO19'
 'PWKI0' 'PCKI0' 'IT' 'IC' 'TMINM' 'TMAXM' 'PREC1' 'PREC2' 'PREC3' 'PREC4'
 'PREC5' 'PREC6' 'PREC7' 'PREC8' 'PREC9' 'PREC10' 'PREC11' 'PREC12'
 'TAVG1' 'TAVG2' 'TAVG3' 'TAVG4' 'TAVG5' 'TAVG6' 'TAVG7' 'TAVG8' 'TAVG9'
 'TAVG