# Basic modelling module

In [1]:
# make plots be included into this doc
%matplotlib auto

Using matplotlib backend: TkAgg


# Importing modules

In [2]:
import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML
from bin.model import *
from sklearn.pipeline import Pipeline
from bin.conf import PREDICTOR_LOADERS
from bin.loader import get_predictor_data
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Constants definitions

In [3]:
SOURCE_DATA_PATH = './data' # relative (or absolute) path to the data directory
CSV_SEPARATOR = r';' # separator used in csv data files
DATA_FILE_NAMES = ['Quercus.csv',# all data files should be in the same format
                   'All_data.csv',
                   ] 
ALLOWED_COLUMNS = ['species', 'latitude', 'longitude'] # only these columns will be retained for computations
COLUMNS_DTYPES = [np.str, np.float64, np.float64] # Should have the same length as ALLOWED_COLUMNS
MODEL_SPECIES = ['quercus mongolica',
                 'kalopanax septemlobus',
                 'quercus',
                 'quercus crispula',
                 'fraxinus mandshurica'] # all  species should be given in lowercase format

# Initial set of variables (see conf.py: PREDICTOR_LOADERS parameter for details)
VARIABLE_SET = ('WKI5', 'PCKI0','PWKI0', 'CKI5', 'IT', 'IC', 'TMINM', 'TMAXM')
VARIABLE_SET += tuple(['WKI' + str(k) for k in range(2, 7)])
VARIABLE_SET += tuple(['CKI' + str(k) for k in range(2, 7)])
VARIABLE_SET += tuple(['BIO' + str(k) for k in range(1, 20)])
VARIABLE_SET += ('PWKI0', 'PCKI0','IT', 'IC', 'TMINM', 'TMAXM')
VARIABLE_SET += tuple(['PREC' + str(k) for k in range(1, 13)])
VARIABLE_SET += tuple(['TAVG' + str(k) for k in range(1, 13)])
VARIABLE_SET += tuple(['TMIN' + str(k) for k in range(1, 13)])
VARIABLE_SET += tuple(['TMAX' + str(k) for k in range(1, 13)])
VARIABLE_SET = tuple(set(VARIABLE_SET)) # remove duplicate variables if they are exist

CLASSIFIERS = [ ('tree', DecisionTreeClassifier(random_state=10)),
                ('MaxEnt', LogisticRegression()),
                #('SVM', SVC(kernel='linear'))
                #('LDA', LinearDiscriminantAnalysis())
              ]
KFOLDS_NUMBER = 20
PSEUDO_ABSENCE_DENSITY = 0.02

# Source data loading and preprocessing

In [4]:
original_presence_data = pd.DataFrame({col: [] for col in ALLOWED_COLUMNS}) #initialize dataframe-accumulator
for filename in DATA_FILE_NAMES:
    try:
        # data loading procedure
        data = pd.read_csv(os.path.join(SOURCE_DATA_PATH, filename),
                           sep=CSV_SEPARATOR, dtype={a:b for a,b in zip(ALLOWED_COLUMNS, COLUMNS_DTYPES)})
    except IOError:
        print("Couldn't read the file %s." % filename)
    if any(data):
        print('The file %s succesfully loaded.' % filename)
        print('File overview:')
        data.info()
        print('='*50)
    # data concatenation procedure
    original_presence_data = pd.concat([original_presence_data, data[ALLOWED_COLUMNS]], ignore_index=True)

# make species names lowercased and stripped
original_presence_data['species'] = original_presence_data['species'].apply(str.lower).apply(str.strip)

display(HTML('<h3>Original size: %s</h3>'%original_presence_data['species'].size))

# remove duplicate rows and nan values
original_presence_data = original_presence_data.dropna().drop_duplicates(ALLOWED_COLUMNS).reset_index(drop=True)
display(HTML('<h3>The size after duplications removal: %s</h3>'%original_presence_data['species'].size))


The file Quercus.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 3 columns):
species      605 non-null object
latitude     604 non-null float64
longitude    604 non-null float64
dtypes: float64(2), object(1)
memory usage: 14.3+ KB
The file All_data.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
species      500 non-null object
latitude     500 non-null float64
longitude    500 non-null float64
dtypes: float64(2), object(1)
memory usage: 11.8+ KB


## Initial dataset overview

In [5]:
display(HTML('<h3>General info:</h3>'))
original_presence_data.info()
display(HTML('<h3>Species occurences overview:</h3>'))
original_presence_data['species'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 3 columns):
latitude     853 non-null float64
longitude    853 non-null float64
species      853 non-null object
dtypes: float64(2), object(1)
memory usage: 20.1+ KB


quercus crispula          266
quercus mongolica          93
phellodendron amurense     77
fraxinus mandshurica       63
ulmus japonica             62
acer mono                  52
kalopanax septemlobus      51
ulmus laciniata            44
tilia amurensis            41
acer mayrii                28
carpinus cordata           24
pinus koraiensis           23
juglans mandshurica        17
abies holophylla            9
fraxinus rhynchophylla      3
Name: species, dtype: int64

# MAIN LOOP OVER ALL SPECIES

In [None]:
for species in MODEL_SPECIES:
    display(HTML('<h5>=============== %s ======================</h5>' % species))
    classifier_stats_acc, classifier_stats_auc = [], []
    model = Pipeline([('select_species', SelectSpecies(species)), 
                      ('prune_suspicious', PruneSuspiciousCoords()),
                      ('ps_absence', FillPseudoAbsenceData(density=0.8)),
                      ('fill_env', FillEnvironmentalData(VARIABLE_SET)),
                  #    ('fill_by_cond', FillPseudoAbsenceByConditions(species=species,
                  #                                                   similarity=0.2,
                  #                                                   density=0.1,
                  #                                                   area=[(22,100),(65,169)])),
                      ('exclude_by_corr', CorrelationPruner(threshold=0.95, variables=VARIABLE_SET))
                     ]
                     )
    print("Constructing the dataset...")
    aux_result = model.fit_transform(original_presence_data)
    aux_result.info()
    current_variable_set = set(VARIABLE_SET).intersection(set(aux_result.columns.values))
    print("Removed correlated features: ", set(VARIABLE_SET) - current_variable_set)
    print("Leaved features: ", current_variable_set)
    current_variable_set = list(current_variable_set)
    X, y = aux_result[current_variable_set].values, list(map(int, ~aux_result.absence))
    print("Dataset is formed.")
    aux_result.info()
    for name, clf in CLASSIFIERS:
        std_clf = TweakedPipeline([('scaler', StandardScaler()),
                         ('classificator', clf)])
        print("Preforming recursive feature ellimination for the <%s> classifier..." % name)
        rfecv_acc = RFECV(estimator=std_clf, step=1, cv=StratifiedKFold(KFOLDS_NUMBER, shuffle=True),
                      scoring='accuracy')
        rfecv_acc.fit(X, y)
        acc_score = np.array(rfecv_acc.grid_scores_)[np.argmax(rfecv_acc.grid_scores_)]
        rfecv_auc = RFECV(estimator=std_clf, step=1, cv=StratifiedKFold(KFOLDS_NUMBER, shuffle=True),
                      scoring='roc_auc')
        rfecv_auc.fit(X, y)
        auc_score = np.array(rfecv_auc.grid_scores_)[np.argmax(rfecv_auc.grid_scores_)]
        classifier_stats_acc.append((name, acc_score, std_clf, rfecv_acc.support_))
        classifier_stats_auc.append((name, auc_score, std_clf, rfecv_auc.support_))
    acc_optimal_name, acc_optimal_score, acc_optimal_clf, acc_optimal_mask = tuple(classifier_stats_acc[np.argmax(list(map(lambda x: x[1], classifier_stats_acc)))])
    auc_optimal_name, auc_optimal_score, auc_optimal_clf, auc_optimal_mask = tuple(classifier_stats_auc[np.argmax(list(map(lambda x: x[1], classifier_stats_auc)))])
    display(HTML('<h5> --------------- Summary for %s: --------------- </h5>' % species))
    print("The best classifier is %s. Its accuracy score is %s." % (acc_optimal_name, acc_optimal_score))
    print("Optimal predictor set (acc): ",  np.array(current_variable_set)[acc_optimal_mask])
    print("The best classifier is %s. Its roc/auc score is %s." % (auc_optimal_name, auc_optimal_score))
    print("Optimal predictor set (auc): ",  np.array(current_variable_set)[auc_optimal_mask])
    print("Statistic over all classifiers: ")
    print("AUC/ROC - Case:")
    df = pd.DataFrame({n[0]: [n[1], np.array(current_variable_set)[n[-1]][:5]] for n in classifier_stats_auc})
    display(df)
    print("Precision - Case:")
    df = pd.DataFrame({n[0]: [n[1], np.array(current_variable_set)[n[-1]][:5]] for n in classifier_stats_acc})
    display(df)
    display(HTML('<h5> %s </h5>' % ("~" * 90,)))
    
    # ---- 
    #optimal_vars = list(np.array(current_variable_set)[auc_optimal_mask])
    optimal_vars = current_variable_set
    X, y = aux_result[optimal_vars].values, list(map(int, ~aux_result.absence))
    auc_optimal_clf.fit(X, y)

    
    fig, ax, XMAP = plot_map([22, 65], [100, 169], 1000, auc_optimal_clf,
                            optimal_vars, train_df=aux_result,
                            name='clf_'+auc_optimal_name+'_'+species, postfix='')
    
    #plt.show()
    fig.set_size_inches(18.5, 10.5)
    fig.savefig('%s'%species+'_'+auc_optimal_name+ '.png', dpi=600)
    plt.close(fig)
    

#     predictions[~nan_mask, :] = auc_optimal_clf.predict_proba(XMAP[~nan_mask,:])
#     presence_proba_current = predictions[:, 1]
#     plt.figure()
#     plt.contourf(LONS_GRID, LATS_GRID, presence_proba_current.reshape(1000,1000))
#     plt.title('Present')
#     plt.show()
    
#     fill_env = FillEnvironmentalData(optimal_vars, postfix='_50cc26')
#     filled_df_f = fill_env.transform_nans(map_df)
#     XMAP_f = filled_df_f.loc[:, optimal_vars].values
#     nan_mask_future = np.any(np.isnan(XMAP_f), axis=1)
#     predictions_future = np.zeros((len(nan_mask_future), 2)) * np.nan
    
#     predictions_future[~nan_mask_future, :] = auc_optimal_clf.predict_proba(XMAP_f[~nan_mask_future,:])
#     presence_proba_future = predictions_future[:, 1]
#     plt.figure()
#     plt.contourf(LONS_GRID, LATS_GRID, presence_proba_future.reshape(1000,1000))
#     plt.title('Future: 50cc26')
#     plt.show()
      

Constructing the dataset...


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  inds = _ < vals_avg
  result[vals < t] = result[vals < t] + vals[vals < t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4799 entries, 0 to 4798
Data columns (total 28 columns):
absence      4799 non-null bool
latitude     4799 non-null float64
longitude    4799 non-null float64
species      4799 non-null object
TMAX12       4799 non-null float64
BIO5         4799 non-null float64
TMIN10       4799 non-null float64
BIO7         4799 non-null float64
PCKI0        4799 non-null float64
BIO12        4799 non-null float64
PREC5        4799 non-null float64
TAVG4        4799 non-null float64
PREC2        4799 non-null float64
PREC6        4799 non-null float64
PREC8        4799 non-null float64
TMAX9        4799 non-null float64
PREC9        4799 non-null float64
PREC4        4799 non-null float64
PWKI0        4799 non-null float64
TMIN8        4799 non-null float64
BIO2         4799 non-null float64
BIO8         4799 non-null float64
PREC7        4799 non-null float64
PREC10       4799 non-null float64
PREC11       4799 non-null float64
BIO3         4799 non-

The best classifier is MaxEnt. Its accuracy score is 0.9989539748953975.
Optimal predictor set (acc):  ['TMAX12' 'BIO5' 'TMIN10' 'BIO12' 'BIO7' 'PCKI0' 'PREC5' 'TAVG4' 'PREC2'
 'TMAX9' 'PREC6' 'PREC9' 'PREC4' 'PWKI0' 'TMIN8' 'BIO2' 'BIO8' 'PREC7'
 'PREC10' 'PREC11' 'BIO3' 'BIO9' 'BIO15' 'PREC8']
The best classifier is MaxEnt. Its roc/auc score is 0.9999152542372881.
Optimal predictor set (auc):  ['BIO5' 'TMIN10' 'BIO7' 'PCKI0' 'PREC5' 'TAVG4' 'PREC2' 'TMAX9' 'PWKI0'
 'TMIN8' 'PREC10' 'PREC11' 'BIO3']
Statistic over all classifiers: 
AUC/ROC - Case:


Unnamed: 0,MaxEnt,tree
0,0.999915,0.978538
1,"[BIO5, TMIN10, BIO7, PCKI0, PREC5]","[BIO5, TMIN10, BIO7, PCKI0, PREC5]"


Precision - Case:


Unnamed: 0,MaxEnt,tree
0,0.998954,0.99875
1,"[TMAX12, BIO5, TMIN10, BIO12, BIO7]","[BIO5, TMIN10, BIO12, BIO7, PCKI0]"


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


Constructing the dataset...


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  inds = _ < vals_avg
  result[vals < t] = result[vals < t] + vals[vals < t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4771 entries, 0 to 4770
Data columns (total 26 columns):
absence      4771 non-null bool
latitude     4771 non-null float64
longitude    4771 non-null float64
species      4771 non-null object
TMAX12       4771 non-null float64
BIO5         4771 non-null float64
TMIN10       4771 non-null float64
BIO7         4771 non-null float64
PCKI0        4771 non-null float64
BIO12        4771 non-null float64
PREC5        4771 non-null float64
TAVG4        4771 non-null float64
PREC2        4771 non-null float64
PREC6        4771 non-null float64
PREC8        4771 non-null float64
PREC9        4771 non-null float64
PREC4        4771 non-null float64
PWKI0        4771 non-null float64
TMIN8        4771 non-null float64
BIO2         4771 non-null float64
BIO8         4771 non-null float64
PREC7        4771 non-null float64
PREC10       4771 non-null float64
BIO3         4771 non-null float64
BIO9         4771 non-null float64
BIO15        4771 non-

The best classifier is MaxEnt. Its accuracy score is 1.0.
Optimal predictor set (acc):  ['TMAX12' 'BIO5' 'TMIN10' 'BIO12' 'BIO7' 'PCKI0' 'PREC5' 'TAVG4' 'PREC2'
 'PREC6' 'PREC9' 'PREC4' 'PWKI0' 'TMIN8' 'BIO2' 'BIO8' 'PREC7' 'PREC10'
 'BIO3' 'BIO9' 'BIO15' 'PREC8']
The best classifier is MaxEnt. Its roc/auc score is 1.0.
Optimal predictor set (auc):  ['TMAX12' 'BIO5' 'TMIN10' 'BIO12' 'BIO7' 'PCKI0' 'PREC5' 'TAVG4' 'PREC2'
 'PREC6' 'PREC9' 'PREC4' 'PWKI0' 'TMIN8' 'BIO2' 'BIO8' 'PREC7' 'PREC10'
 'BIO3' 'BIO9' 'BIO15' 'PREC8']
Statistic over all classifiers: 
AUC/ROC - Case:


Unnamed: 0,MaxEnt,tree
0,1,0.987288
1,"[TMAX12, BIO5, TMIN10, BIO12, BIO7]","[TMIN10, BIO12, BIO7, PCKI0, PREC5]"


Precision - Case:


Unnamed: 0,MaxEnt,tree
0,1,0.999371
1,"[TMAX12, BIO5, TMIN10, BIO12, BIO7]","[TMIN10, BIO12, BIO7, PCKI0, PREC5]"


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


Constructing the dataset...


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  inds = _ < vals_avg
  result[vals < t] = result[vals < t] + vals[vals < t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5072 entries, 0 to 5071
Data columns (total 22 columns):
absence      5072 non-null bool
latitude     5072 non-null float64
longitude    5072 non-null float64
species      5072 non-null object
TMAX12       5072 non-null float64
BIO5         5072 non-null float64
TMIN10       5072 non-null float64
BIO7         5072 non-null float64
PCKI0        5072 non-null float64
BIO12        5072 non-null float64
TAVG4        5072 non-null float64
PREC2        5072 non-null float64
PREC8        5072 non-null float64
TMIN8        5072 non-null float64
BIO2         5072 non-null float64
BIO8         5072 non-null float64
PREC10       5072 non-null float64
PREC11       5072 non-null float64
PREC12       5072 non-null float64
BIO3         5072 non-null float64
BIO9         5072 non-null float64
BIO15        5072 non-null float64
dtypes: bool(1), float64(20), object(1)
memory usage: 837.2+ KB
Removed correlated features:  {'TAVG12', 'TMAX11', 'TAVG7', 'TM

The best classifier is MaxEnt. Its accuracy score is 0.9982244561327066.
Optimal predictor set (acc):  ['TMAX12' 'BIO5' 'BIO2' 'TMIN10' 'BIO12' 'BIO7' 'PCKI0' 'TAVG4' 'BIO8'
 'PREC10' 'PREC11' 'BIO3' 'PREC2' 'TMIN8' 'PREC12' 'BIO9' 'BIO15' 'PREC8']
The best classifier is MaxEnt. Its roc/auc score is 0.9997978287360141.
Optimal predictor set (auc):  ['BIO12' 'PCKI0' 'PREC2' 'TMIN8' 'PREC8']
Statistic over all classifiers: 
AUC/ROC - Case:


Unnamed: 0,MaxEnt,tree
0,0.999798,0.987902
1,"[BIO12, PCKI0, PREC2, TMIN8, PREC8]","[TMIN10, TMIN8, PREC8]"


Precision - Case:


Unnamed: 0,MaxEnt,tree
0,0.998224,0.997435
1,"[TMAX12, BIO5, BIO2, TMIN10, BIO12]","[TMIN10, BIO12, TMIN8, PREC8]"


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]


Constructing the dataset...


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  inds = _ < vals_avg
  result[vals < t] = result[vals < t] + vals[vals < t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 22 columns):
absence      5043 non-null bool
latitude     5043 non-null float64
longitude    5043 non-null float64
species      5043 non-null object
TMAX12       5043 non-null float64
BIO5         5043 non-null float64
TMIN10       5043 non-null float64
BIO7         5043 non-null float64
PCKI0        5043 non-null float64
BIO12        5043 non-null float64
TAVG4        5043 non-null float64
PREC2        5043 non-null float64
PREC8        5043 non-null float64
TMIN8        5043 non-null float64
BIO2         5043 non-null float64
BIO8         5043 non-null float64
PREC10       5043 non-null float64
PREC11       5043 non-null float64
PREC12       5043 non-null float64
BIO3         5043 non-null float64
BIO9         5043 non-null float64
BIO15        5043 non-null float64
dtypes: bool(1), float64(20), object(1)
memory usage: 832.4+ KB
Removed correlated features:  {'TAVG12', 'TMAX11', 'TAVG7', 'TM

The best classifier is MaxEnt. Its accuracy score is 0.9998015873015873.
Optimal predictor set (acc):  ['TMAX12' 'BIO5' 'BIO2' 'TMIN10' 'BIO12' 'BIO7' 'PCKI0' 'TAVG4' 'BIO8'
 'PREC10' 'PREC11' 'BIO3' 'PREC2' 'TMIN8' 'PREC12' 'BIO9' 'BIO15' 'PREC8']
The best classifier is MaxEnt. Its roc/auc score is 1.0.
Optimal predictor set (auc):  ['BIO12' 'PCKI0' 'BIO3' 'TMIN8' 'PREC12' 'BIO15' 'PREC8']
Statistic over all classifiers: 
AUC/ROC - Case:


Unnamed: 0,MaxEnt,tree
0,1,0.997972
1,"[BIO12, PCKI0, BIO3, TMIN8, PREC12]","[TMIN10, BIO12]"


Precision - Case:


Unnamed: 0,MaxEnt,tree
0,0.999802,0.999206
1,"[TMAX12, BIO5, BIO2, TMIN10, BIO12]","[TMIN10, BIO12]"


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]


Constructing the dataset...


  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  inds = _ < vals_avg
  result[vals < t] = result[vals < t] + vals[vals < t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4841 entries, 0 to 4840
Data columns (total 28 columns):
absence      4841 non-null bool
latitude     4841 non-null float64
longitude    4841 non-null float64
species      4841 non-null object
TMAX12       4841 non-null float64
BIO5         4841 non-null float64
TMIN10       4841 non-null float64
BIO7         4841 non-null float64
PCKI0        4841 non-null float64
BIO12        4841 non-null float64
PREC5        4841 non-null float64
TAVG4        4841 non-null float64
PREC2        4841 non-null float64
PREC6        4841 non-null float64
PREC8        4841 non-null float64
TMAX9        4841 non-null float64
PREC9        4841 non-null float64
PREC4        4841 non-null float64
PWKI0        4841 non-null float64
TMIN8        4841 non-null float64
BIO2         4841 non-null float64
BIO8         4841 non-null float64
PREC7        4841 non-null float64
PREC10       4841 non-null float64
PREC11       4841 non-null float64
BIO3         4841 non-