# Basic modelling module

In [57]:
# make plots be included into this doc
%matplotlib inline

# Importing modules

In [58]:
import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML

# Constants definitions

In [59]:
SOURCE_DATA_PATH = './data' # relative (or absolute) path to the data directory
CSV_SEPARATOR = r'\t' # separator used in csv data files
DATA_FILE_NAMES = ['broad_leaf_GBIF.csv', 'широколиственные.csv'] # all data files should be in the same format
ALLOWED_COLUMNS = ['species', 'latitude', 'longitude'] # only these columns will be retained for computations
COLUMNS_DTYPES = [np.object, np.float64, np.float64] # Should have the same length as ALLOWED_COLUMNS
MODEL_SPECIES = ['quercus mongolica', 'fraxinus mandshurica'] # all  species should be given in lowercase format


# Source data loading and preprocessing

In [62]:
original_presence_data = pd.DataFrame({col: [] for col in ALLOWED_COLUMNS}) #initialize dataframe-accumulator
for filename in DATA_FILE_NAMES:
    try:
        # data loading procedure
        data = pd.read_csv(os.path.join(SOURCE_DATA_PATH, filename),
                           sep=CSV_SEPARATOR, dtype={a:b for a,b in zip(ALLOWED_COLUMNS, COLUMNS_DTYPES)})
    except IOError:
        print("Couldn't read the file %s." % filename)
    if any(data):
        print('The file %s succesfully loaded.' % filename)
        print('File overview:')
        data.info()
        print('='*50)
    # data concatenation procedure
    original_presence_data = pd.concat([original_presence_data, data[ALLOWED_COLUMNS]], ignore_index=True)

# make species names lowercased and stripped
original_presence_data['species'] = original_presence_data['species'].apply(str.lower).apply(str.strip)

display(HTML('<h3>Original size: %s</h3>'%original_presence_data['species'].size))

# remove duplicate rows and nan values
original_presence_data = original_presence_data.dropna().drop_duplicates().reset_index()
display(HTML('<h3>The size after duplications removal: %s</h3>'%original_presence_data['species'].size))


# remove duplicate values


The file broad_leaf_GBIF.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036 entries, 0 to 3035
Data columns (total 4 columns):
species        3036 non-null object
countrycode    3036 non-null object
latitude       3034 non-null float64
longitude      3034 non-null float64
dtypes: float64(2), object(2)
memory usage: 95.0+ KB
The file широколиственные.csv succesfully loaded.
File overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 605 entries, 0 to 604
Data columns (total 3 columns):
species      605 non-null object
latitude     604 non-null float64
longitude    604 non-null float64
dtypes: float64(2), object(1)
memory usage: 14.3+ KB


  


## Initial dataset overview

In [64]:
display(HTML('<h3>General info:</h3>'))
original_presence_data.info()
display(HTML('<h3>Species occurences overview:</h3>'))
original_presence_data['species'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 4 columns):
index        2042 non-null int64
latitude     2042 non-null float64
longitude    2042 non-null float64
species      2042 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 63.9+ KB


quercus mongolica         329
kalopanax septemlobus     292
carpinus cordata          285
fraxinus lanuginosa       274
juglans mandshurica       269
quercus crispula          158
phellodendron amurense    144
ulmus davidiana           130
acer pictum                75
fraxinus mandshurica       40
juglans ailanthifolia      22
tilia amurensis            11
abies holophylla            8
quercus crispula blume      5
Name: species, dtype: int64