# Basic modelling module

In [1]:
# make plots be included into this doc
%matplotlib inline

# Importing modules

In [2]:
import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML
from bin.model import *
from sklearn.pipeline import Pipeline
from bin.conf import PREDICTOR_LOADERS
from bin.loader import get_predictor_data
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Constants definitions

In [3]:
SOURCE_DATA_PATH = './data' # relative (or absolute) path to the data directory
CSV_SEPARATOR = r'\t' # separator used in csv data files
DATA_FILE_NAMES = ['broad_leaf_GBIF.csv', 'широколиственные.csv'] # all data files should be in the same format
ALLOWED_COLUMNS = ['species', 'latitude', 'longitude'] # only these columns will be retained for computations
COLUMNS_DTYPES = [np.str, np.float64, np.float64] # Should have the same length as ALLOWED_COLUMNS
MODEL_SPECIES = ['quercus mongolica', 'fraxinus mandshurica'] # all  species should be given in lowercase format

# Initial set of variables (see conf.py: PREDICTOR_LOADERS parameter for details)
VARIABLE_SET = tuple(['BIO' + str(k) for k in range(1, 20)])
#VARIABLE_SET += ('WKI5', 'CKI5', 'PWKI0', 'PCKI0','IT', 'IC', 'TMINM', 'TMAXM')
#VARIABLE_SET += tuple(['PREC' + str(k) for k in range(1, 13)])
#VARIABLE_SET += tuple(['TAVG' + str(k) for k in range(1, 13)])
#VARIABLE_SET += tuple(['WKI' + str(k) for k in range(2, 7)])
#VARIABLE_SET += tuple(['CKI' + str(k) for k in range(2, 7)])
CLASSIFIERS = [#('Naive Bayes', GaussianNB()),
               ('MaxEnt', LogisticRegression()),
               ('LDA', LinearDiscriminantAnalysis())
              ]
KFOLDS_NUMBER = 20

# Source data loading and preprocessing

In [4]:
original_presence_data = pd.DataFrame({col: [] for col in ALLOWED_COLUMNS}) #initialize dataframe-accumulator
for filename in DATA_FILE_NAMES:
    try:
        # data loading procedure
        data = pd.read_csv(os.path.join(SOURCE_DATA_PATH, filename),
                           sep=CSV_SEPARATOR, dtype={a:b for a,b in zip(ALLOWED_COLUMNS, COLUMNS_DTYPES)})
    except IOError:
        print("Couldn't read the file %s." % filename)
    if any(data):
        print('The file %s succesfully loaded.' % filename)
        print('File overview:')
        data.info()
        print('='*50)
    # data concatenation procedure
    original_presence_data = pd.concat([original_presence_data, data[ALLOWED_COLUMNS]], ignore_index=True)



# make species names lowercased and stripped
original_presence_data['species'] = original_presence_data['species'].apply(str.lower).apply(str.strip)

display(HTML('<h3>Original size: %s</h3>'%original_presence_data['species'].size))

# remove duplicate rows and nan values
original_presence_data = original_presence_data.dropna().drop_duplicates(ALLOWED_COLUMNS).reset_index(drop=True)
display(HTML('<h3>The size after duplications removal: %s</h3>'%original_presence_data['species'].size))


# remove duplicate values


The file broad_leaf_GBIF.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036 entries, 0 to 3035
Data columns (total 4 columns):
species        3036 non-null object
countrycode    3036 non-null object
latitude       3034 non-null float64
longitude      3034 non-null float64
dtypes: float64(2), object(2)
memory usage: 95.0+ KB
The file широколиственные.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 3 columns):
species      605 non-null object
latitude     604 non-null float64
longitude    604 non-null float64
dtypes: float64(2), object(1)
memory usage: 14.3+ KB


  


## Initial dataset overview

In [5]:
display(HTML('<h3>General info:</h3>'))
original_presence_data.info()
display(HTML('<h3>Species occurences overview:</h3>'))
original_presence_data['species'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 3 columns):
latitude     2042 non-null float64
longitude    2042 non-null float64
species      2042 non-null object
dtypes: float64(2), object(1)
memory usage: 47.9+ KB


quercus mongolica         329
kalopanax septemlobus     292
carpinus cordata          285
fraxinus lanuginosa       274
juglans mandshurica       269
quercus crispula          158
phellodendron amurense    144
ulmus davidiana           130
acer pictum                75
fraxinus mandshurica       40
juglans ailanthifolia      22
tilia amurensis            11
abies holophylla            8
quercus crispula blume      5
Name: species, dtype: int64

# MAIN LOOP OVER ALL SPECIES

In [6]:

for species in MODEL_SPECIES:
    display(HTML('<h5>=============== %s ======================</h5>' % species))
    classifier_stats = []
    model = Pipeline([('select_species', SelectSpecies(species)), 
                      ('ps_absence', FillPseudoAbsenceData()),
                      ('fill_env', FillEnvironmentalData(VARIABLE_SET))]
                     )
    print("Constructing the dataset...")
    aux_result = model.fit_transform(original_presence_data)
    X, y = aux_result[list(VARIABLE_SET)].values, list(map(int, ~aux_result.absence))
    print("Dataset is formed.")
    aux_result.info()
    for name, clf in CLASSIFIERS:
        std_clf = make_pipeline(StandardScaler(), clf)
        std_clf.coef_ = property(lambda x: x.named_steps.coef_)
        std_clf.feature_importances_ = property(lambda x: x.steps[-1][-1].feature_importances_)
        # Not yet completed!
        print("Preforming recursive feature ellimination for the <%s> classifier..." % name)
        rfecv_acc = RFECV(estimator=std_clf, step=1, cv=StratifiedKFold(KFOLDS_NUMBER),
                      scoring='accuracy')
        rfecv_acc.fit(X, y)
        print("=" * 20, "accuracy case", "=" * 20)
        print("Optimal composition of variables: ", np.array(VARIABLE_SET)[rfecv_acc.support_])
        acc_score = np.array(rfecv_acc.grid_scores_)[np.argmax(rfecv_acc.grid_scores_)]
        print("Composition score (acc): ", acc_score)
        print("=" * 60)
        print("=" * 20, "roc/auc case", "=" * 20)
        rfecv_auc = RFECV(estimator=std_clf, step=1, cv=StratifiedKFold(KFOLDS_NUMBER),
                      scoring='roc_auc')
        rfecv_auc.fit(X, y)
        print("Optimal composition of variables: ", np.array(VARIABLE_SET)[rfecv_auc.support_])
        auc_score = np.array(rfecv_auc.grid_scores_)[np.argmax(rfecv_auc.grid_scores_)]
        print("Composition score (auc): ", auc_score)
        print("=" * 60)
        print("The following predictor composition was selected: ",  np.array(VARIABLE_SET)[rfecv_acc.support_])
        classifier_stats.append((name, acc_score, clf))

    optimal_name, optimal_score, optimal_clf = tuple(classifier_stats[np.argmax(map(lambda x: x[1], classifier_stats))])
    print("The best classifier is %s. Its score is %s." % (optimal_name, optimal_score))
    print("~" * 90)
    
    





Constructing the dataset...
Dataset is formed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 721 entries, 0 to 720
Data columns (total 23 columns):
absence      721 non-null bool
latitude     721 non-null float64
longitude    721 non-null float64
species      721 non-null object
BIO1         721 non-null float64
BIO2         721 non-null float64
BIO3         721 non-null float64
BIO4         721 non-null float64
BIO5         721 non-null float64
BIO6         721 non-null float64
BIO7         721 non-null float64
BIO8         721 non-null float64
BIO9         721 non-null float64
BIO10        721 non-null float64
BIO11        721 non-null float64
BIO12        721 non-null float64
BIO13        721 non-null float64
BIO14        721 non-null float64
BIO15        721 non-null float64
BIO16        721 non-null float64
BIO17        721 non-null float64
BIO18        721 non-null float64
BIO19        721 non-null float64
dtypes: bool(1), float64(21), object(1)
memory usage: 124.7+ KB
Prefor

RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes