# Basic modelling module

In [1]:
# make plots be included into this doc
%matplotlib auto

Using matplotlib backend: TkAgg


# Importing modules

In [2]:
import numpy as np
import pandas as pd
import gc
import os
from IPython.core.display import display, HTML
from bin.model import *
from sklearn.pipeline import Pipeline
from bin.conf import PREDICTOR_LOADERS
from bin.loader import get_predictor_data
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Constants definitions

In [3]:
SOURCE_DATA_PATH = './data' # relative (or absolute) path to the data directory
CSV_SEPARATOR = r';' # separator used in csv data files
DATA_FILE_NAMES = ['all_species_final.csv',# all data files should be in the same format
                   ] 
ALLOWED_COLUMNS = ['species', 'latitude', 'longitude'] # only these columns will be retained for computations
COLUMNS_DTYPES = [np.str, np.float64, np.float64] # Should have the same length as ALLOWED_COLUMNS
CLIMATIC_MODELS = ['50cc26','50cc85','50cc45', '70cc26', '70cc85','70cc45']
CLIMATIC_MODELS = CLIMATIC_MODELS + list(map(lambda x: x.replace('cc', 'mc'), CLIMATIC_MODELS))
CLIMATIC_MODELS = list(map(lambda x: '_' + x, CLIMATIC_MODELS))
CLIMATIC_MODELS = ['_cclgm', '_ccmid']
MODEL_SPECIES = [
                 'quercus mongolica',
                 'kalopanax septemlobus',
                 'quercus',
                 'quercus crispula',
                 'fraxinus mandshurica'
                ] # all  species should be given in lowercase format

# Initial set of variables (see conf.py: PREDICTOR_LOADERS parameter for details)
VARIABLE_SET = ('WKI5', 'PCKI0','PWKI0', 'CKI5', 'IT', 'IC')
#VARIABLE_SET += tuple(['WIND' + str(k) for k in range(1, 13)])#
#VARIABLE_SET = ('BIO1',)
#VARIABLE_SET += tuple(['WKI' + str(k) for k in range(2, 7)])
#VARIABLE_SET += tuple(['CKI' + str(k) for k in range(2, 7)])
#VARIABLE_SET += tuple(['BIO' + str(k) for k in range(1, 4)])
#VARIABLE_SET += ('PWKI0', 'PCKI0','IT', 'IC', 'TMINM', 'TMAXM')
#VARIABLE_SET += tuple(['PREC' + str(k) for k in range(1, 13)])
#VARIABLE_SET += tuple(['TAVG' + str(k) for k in range(1, 13)])
#VARIABLE_SET += tuple(['TMIN' + str(k) for k in range(1, 13)])
#VARIABLE_SET += tuple(['TMAX' + str(k) for k in range(1, 13)])
VARIABLE_SET = tuple(set(VARIABLE_SET)) # remove duplicate variables if they are exist

CLASSIFIERS = [# ('tree', DecisionTreeClassifier(random_state=10)),
                ('MaxEnt', LogisticRegression()),
                ('RandForest', RandomForestClassifier(n_estimators=100, random_state=10))
                #('SVM', SVC(kernel='linear'))
                #('LDA', LinearDiscriminantAnalysis())
              ]
KFOLDS_NUMBER = 20
PSEUDO_ABSENCE_DENSITY = 0.02

# Source data loading and preprocessing

In [4]:
original_presence_data = pd.DataFrame({col: [] for col in ALLOWED_COLUMNS}) #initialize dataframe-accumulator
for filename in DATA_FILE_NAMES:
    try:
        # data loading procedure
        data = pd.read_csv(os.path.join(SOURCE_DATA_PATH, filename),
                           sep=CSV_SEPARATOR, dtype={a:b for a,b in zip(ALLOWED_COLUMNS, COLUMNS_DTYPES)})
    except IOError:
        print("Couldn't read the file %s." % filename)
    if any(data):
        print('The file %s succesfully loaded.' % filename)
        print('File overview:')
        data.info()
        print('='*50)
    # data concatenation procedure
    original_presence_data = pd.concat([original_presence_data, data[ALLOWED_COLUMNS]], ignore_index=True)

# make species names lowercased and stripped
original_presence_data['species'] = original_presence_data['species'].apply(str.lower).apply(str.strip)

display(HTML('<h3>Original size: %s</h3>'%original_presence_data['species'].size))

# remove duplicate rows and nan values
original_presence_data = original_presence_data.dropna().drop_duplicates(ALLOWED_COLUMNS).reset_index(drop=True)
display(HTML('<h3>The size after duplications removal: %s</h3>'%original_presence_data['species'].size))


The file all_species_final.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4106 entries, 0 to 4105
Data columns (total 3 columns):
species      4106 non-null object
latitude     4105 non-null float64
longitude    4105 non-null float64
dtypes: float64(2), object(1)
memory usage: 96.3+ KB


## Initial dataset overview

In [5]:
display(HTML('<h3>General info:</h3>'))
original_presence_data.info()
display(HTML('<h3>Species occurences overview:</h3>'))
original_presence_data['species'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2801 entries, 0 to 2800
Data columns (total 3 columns):
latitude     2801 non-null float64
longitude    2801 non-null float64
species      2801 non-null object
dtypes: float64(2), object(1)
memory usage: 65.7+ KB


kalopanax septemlobus     372
carpinus cordata          331
juglans mandshurica       316
quercus crispula          264
phellodendron amurense    235
ulmus davidiana           223
quercus mongolica         192
acer mono                 187
ulmus laciniata           165
pinus koraiensis          157
tilia amurensis           152
fraxinus mandshurica      107
fraxinus rhynchophylla     37
abies holophylla           36
juglans ailanthifolia      27
Name: species, dtype: int64

In [6]:
#tocsv = original_presence_data[original_presence_data.species == 'kalopanax septemlobus']
#tocsv.to_csv('kal.csv', index=False)

# MAIN LOOP OVER ALL SPECIES

In [7]:
for species in MODEL_SPECIES:
    display(HTML('<h5>=============== %s ======================</h5>' % species))
    classifier_stats_acc, classifier_stats_auc = [], []
    model = Pipeline([('select_species', SelectSpecies(species)), 
                      ('prune_suspicious', PruneSuspiciousCoords()),
                      ('ps_absence', FillPseudoAbsenceData(density=2)),
                      ('fill_env', FillEnvironmentalData(VARIABLE_SET)),
                      ('fill_by_cond', FillPseudoAbsenceByConditions(species=species,
                                                                     similarity=0.1,
                                                                     density=0.1,
                                                                     area=[(22,100),(65,169)])),
                      ('exclude_by_corr', CorrelationPruner(threshold=0.95, variables=VARIABLE_SET))
                     ]
                     )
    print("Constructing the dataset...")
    aux_result = model.fit_transform(original_presence_data)
    aux_result.info()
    current_variable_set = set(VARIABLE_SET).intersection(set(aux_result.columns.values))
    print("Removed correlated features: ", set(VARIABLE_SET) - current_variable_set)
    print("Leaved features: ", current_variable_set)
    current_variable_set = list(current_variable_set)
    X, y = aux_result[current_variable_set].values, list(map(int, ~aux_result.absence))
    print("Dataset is formed.")
    aux_result.info()
    for name, clf in CLASSIFIERS:
        std_clf = TweakedPipeline([('scaler', StandardScaler()),
                         ('classificator', clf)])
        print("Preforming recursive feature ellimination for the <%s> classifier..." % name)
        rfecv_acc = RFECV(estimator=std_clf, step=1, cv=StratifiedKFold(KFOLDS_NUMBER, shuffle=True),
                      scoring='accuracy')
        rfecv_acc.fit(X, y)
        acc_score = np.array(rfecv_acc.grid_scores_)[np.argmax(rfecv_acc.grid_scores_)]
        rfecv_auc = RFECV(estimator=std_clf, step=1, cv=StratifiedKFold(KFOLDS_NUMBER, shuffle=True),
                      scoring='roc_auc')
        rfecv_auc.fit(X, y)
        auc_score = np.array(rfecv_auc.grid_scores_)[np.argmax(rfecv_auc.grid_scores_)]
        classifier_stats_acc.append((name, acc_score, std_clf, rfecv_acc.support_))
        classifier_stats_auc.append((name, auc_score, std_clf, rfecv_auc.support_))
    acc_optimal_name, acc_optimal_score, acc_optimal_clf, acc_optimal_mask = tuple(classifier_stats_acc[np.argmax(list(map(lambda x: x[1], classifier_stats_acc)))])
    auc_optimal_name, auc_optimal_score, auc_optimal_clf, auc_optimal_mask = tuple(classifier_stats_auc[np.argmax(list(map(lambda x: x[1], classifier_stats_auc)))])
    display(HTML('<h5> --------------- Summary for %s: --------------- </h5>' % species))
    print("The best classifier is %s. Its accuracy score is %s." % (acc_optimal_name, acc_optimal_score))
    print("Optimal predictor set (acc): ",  np.array(current_variable_set)[acc_optimal_mask])
    print("The best classifier is %s. Its roc/auc score is %s." % (auc_optimal_name, auc_optimal_score))
    print("Optimal predictor set (auc): ",  np.array(current_variable_set)[auc_optimal_mask])
    print("Statistic over all classifiers: ")
    print("AUC/ROC - Case:")
    df = pd.DataFrame({n[0]: [n[1], np.array(current_variable_set)[n[-1]][:5]] for n in classifier_stats_auc})
    display(df)
    print("Precision - Case:")
    df = pd.DataFrame({n[0]: [n[1], np.array(current_variable_set)[n[-1]][:5]] for n in classifier_stats_acc})
    display(df)
    display(HTML('<h5> %s </h5>' % ("~" * 90,)))
    
    # ---- 
    #optimal_vars = list(np.array(current_variable_set)[auc_optimal_mask])
    optimal_vars = current_variable_set
    X, y = aux_result[optimal_vars].values, list(map(int, ~aux_result.absence))
    auc_optimal_clf.fit(X, y)

    
    fig1, ax = plot_map([22, 67], [100, 169], 5000, auc_optimal_clf,
                            optimal_vars, train_df=None,
                            name=species, postfix='')
    
    fig1.set_size_inches(18.5, 10.5)
    fig1.savefig('%s' % species + '_' + auc_optimal_name + '.png', dpi=600)
    plt.close(fig1)
    
    gc.collect()
    for cm in CLIMATIC_MODELS:
        print("CURRENT MODEL:", cm)
        fig2, ax = plot_map([22, 67], [100, 169], 5000, auc_optimal_clf,
                                optimal_vars, train_df=None,
                                name='_'.join([species,cm,auc_optimal_name]), postfix=cm)
        fig2.set_size_inches(18.5, 10.5)
        fig2.savefig(cm+'_'+'%s'%species+'.png', dpi=600)
        plt.close(fig2)
        gc.collect()
    plt.show()
     

Constructing the dataset...


  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  if np.any(vals < t):
  result[vals < t] = result[vals < t] + precs[vals < t]
  result[vals < t] = result[vals < t] + vals[vals < t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]


The number of ps-absence by cond: 1565
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13733 entries, 0 to 13732
Data columns (total 9 columns):
IC           13733 non-null float64
IT           13733 non-null float64
PCKI0        13733 non-null float64
PWKI0        13733 non-null float64
WKI5         13733 non-null float64
absence      13733 non-null bool
latitude     13733 non-null float64
longitude    13733 non-null float64
species      13733 non-null object
dtypes: bool(1), float64(7), object(1)
memory usage: 871.8+ KB
Removed correlated features:  {'CKI5'}
Leaved features:  {'IT', 'WKI5', 'IC', 'PWKI0', 'PCKI0'}
Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13733 entries, 0 to 13732
Data columns (total 9 columns):
IC           13733 non-null float64
IT           13733 non-null float64
PCKI0        13733 non-null float64
PWKI0        13733 non-null float64
WKI5         13733 non-null float64
absence      13733 non-null bool
latitude     13733 non-null float6

The best classifier is RandForest. Its accuracy score is 0.994464986918907.
Optimal predictor set (acc):  ['IT' 'WKI5' 'IC' 'PWKI0' 'PCKI0']
The best classifier is RandForest. Its roc/auc score is 0.98835002519938.
Optimal predictor set (auc):  ['IT' 'WKI5' 'IC' 'PWKI0' 'PCKI0']
Statistic over all classifiers: 
AUC/ROC - Case:


Unnamed: 0,MaxEnt,RandForest
0,0.966586,0.98835
1,"[IT, WKI5, IC, PWKI0, PCKI0]","[IT, WKI5, IC, PWKI0, PCKI0]"


Precision - Case:


Unnamed: 0,MaxEnt,RandForest
0,0.986311,0.994465
1,"[IT, WKI5, IC]","[IT, WKI5, IC, PWKI0, PCKI0]"


Bands completed: 0.0


  inds = _ < vals_avg
  result[vals > t] = result[vals > t] + vals[vals > t] - t
  if np.any(vals > t):
  result[vals > t] = result[vals > t] + precs[vals > t]
  if np.any(vals < t):


Bands completed: 0.04


  result[vals < t] = result[vals < t] + precs[vals < t]


Bands completed: 0.08
Bands completed: 0.12
Bands completed: 0.16
Bands completed: 0.2
Bands completed: 0.24
Bands completed: 0.28


UnboundLocalError: local variable 'result' referenced before assignment