In [2]:
import sys
import os
sys.path.insert(0, os.path.abspath('/home/maldo/projects/virtualenvs/datathon/lib/python3.7/site-packages'))
sys.path.insert(0, os.path.abspath('/home/maldo/projects/datathon'))
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.combine import *
from src.models.model import SoilClassifier
from src.constants import *
from sklearn.base import clone
from src.features.features import ImbalanceTransformer
import pandas as pd
import seaborn as sns
import math

In [25]:
# Processed train / test splits, read from a relative path.
df = pd.read_csv(os.path.join('../data/processed', 'train_data.csv'))
df_test = pd.read_csv(os.path.join('../data/processed', 'test_data.csv'))

# Sample-count bounds handed to SoilClassifier and ImbalanceTransformer below.
min_samples = 500
max_samples = 9000

# Feature-group names passed to SoilClassifier as `feature_names`.
features = [
    'not_correlated',
    'cadastral_ordinal_encoder_onehot',
    'log_area',
    'log_antiquity',
    'squared_geoms',
    'pssr',
    'savi',
]

# Classifier identifier passed to SoilClassifier as `classifier`.
classifier = 'gradient_boosting'

In [6]:
# Build the project classifier from the configuration values defined above.
model = SoilClassifier(
    min_samples=min_samples,
    max_samples=max_samples,
    feature_names=features,
    classifier=classifier,
)

In [7]:
# Fit on the full training frame, with the target column passed separately as y.
# NOTE(review): df still contains TARGET_FEATURE; presumably the pipeline only
# selects the configured feature groups — confirm against SoilClassifier.
model.fit(df, df[TARGET_FEATURE])

In [8]:
# Clone the model's pipeline (sklearn's clone yields a new, unfitted estimator).
pipe = clone(model.pipeline)
# Pop the last two steps so `pipe` keeps only the steps that precede them.
# Going by the variable names these are presumably the classifier and the
# imbalance-resampling step — confirm against SoilClassifier.pipeline.
clsf = pipe.steps.pop()
imb = pipe.steps.pop()

In [9]:
# Fit the truncated pipeline on the training frame and keep its transformed output.
transformed_data = pipe.fit_transform(df)

In [10]:
# Wrap the transformed output in a DataFrame labelled with the names reported
# by the pipeline's 'features' step.
transformed_data = pd.DataFrame(transformed_data, columns=pipe.named_steps['features'].get_feature_names())

In [11]:
import numpy as np
def correlation_features(data, features_names):
    """Rank every pair of features by the strength of their Pearson correlation.

    :param data: the observations, one column per feature; anything accepted
        by ``pd.DataFrame(data, columns=features_names)``
    :type data: array-like or pandas.DataFrame
    :param features_names: the column names to use for ``data``
    :type features_names: list
    :return: one ``(feature_a, feature_b, correlation)`` tuple per unordered
        pair of features, sorted by absolute correlation, strongest first;
        pairs whose correlation is undefined (NaN) are omitted
    :rtype: list
    """
    df = pd.DataFrame(data, columns=features_names)
    # Compute the correlation matrix.
    corr_matrix = df.corr(method='pearson')
    # Strict upper triangle: each pair appears once and the diagonal is excluded.
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed in
    # NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # Flatten to a Series indexed by (row feature, column feature). The masked
    # cells are NaN; drop them explicitly rather than relying on stack() to do
    # so, since that implicit behaviour differs between pandas versions.
    stack_non_null = upper.stack().dropna()
    list_corr_feats = [
        (idx0, idx1, v)
        for (idx0, idx1), v
        in zip(stack_non_null.index, stack_non_null.values)
    ]
    # Sort by the magnitude of the correlation, strongest first.
    list_corr_feats = sorted(list_corr_feats,
                             key=lambda x: abs(x[2]),
                             reverse=True)
    return list_corr_feats

In [12]:
# Rank all feature pairs of the transformed training data by correlation strength.
# transformed_data was already labelled with these same names when it was built.
correlation_features(transformed_data, features_names=pipe.named_steps['features'].get_feature_names())

[('cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_9',
  'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_Na',
  -1.0),
 ('not_correlated__GEOM_R1',
  'squared_geoms__quad_GEOM_R1',
  0.9784632281332182),
 ('not_correlated__Q_G_3_1_0',
  'not_correlated__Q_NIR_8_1_0',
  0.9581120120216013),
 ('pssr__PSSR', 'savi__SAVI', 0.9432135382126091),
 ('not_correlated__Q_R_4_0_5',
  'not_correlated__Q_G_3_0_5',
  0.9383378375516469),
 ('not_correlated__CONTRUCTIONYEAR',
  'log_antiquity__log_CONTRUCTIONYEAR_antiquity',
  -0.9335670143224466),
 ('not_correlated__Q_R_4_1_0',
  'not_correlated__Q_G_3_1_0',
  0.9276453688264251),
 ('not_correlated__Q_R_4_0_5',
  'not_correlated__Q_B_2_0_5',
  0.9245407276648115),
 ('not_correlated__Q_G_3_0_5',
  'not_correlated__Q_B_2_0_5',
  0.9128546345460538),
 ('not_correlated__Q_R_4_0_1',
  'not_correlated__Q_G_3_0_1',
  0.9080798370365291),
 ('not_correlated__Q_R_4_1_0',
  'not_correlated__Q_NIR_8_1_0',
  0.8914209343702856),
 ('not_correlated__Q_NIR

In [13]:
# Absolute correlation between every pair of transformed features.
corr_matrix = transformed_data.corr().abs()

# Select the strict upper triangle of the correlation matrix so each pair is
# considered once (builtin `bool` replaces the `np.bool` alias removed in NumPy 1.24).
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Flag every column whose absolute correlation with an earlier column exceeds 0.9.
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]

In [14]:
# Number of columns flagged as highly correlated.
len(to_drop)

9

In [16]:
# Names of the columns flagged as highly correlated.
to_drop

['not_correlated__Q_G_3_0_1',
 'not_correlated__Q_G_3_0_5',
 'not_correlated__Q_G_3_1_0',
 'not_correlated__Q_B_2_0_5',
 'not_correlated__Q_NIR_8_1_0',
 'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_Na',
 'log_antiquity__log_CONTRUCTIONYEAR_antiquity',
 'squared_geoms__quad_GEOM_R1',
 'savi__SAVI']

In [15]:
# New frame without the highly correlated columns; transformed_data is left as is.
dropped_highly = transformed_data.drop(columns=to_drop)
dropped_highly.columns

Index(['not_correlated__X', 'not_correlated__Y', 'not_correlated__Q_R_4_0_1',
       'not_correlated__Q_R_4_0_5', 'not_correlated__Q_R_4_1_0',
       'not_correlated__Q_B_2_0_1', 'not_correlated__Q_B_2_1_0',
       'not_correlated__Q_NIR_8_0_1', 'not_correlated__Q_NIR_8_0_5',
       'not_correlated__GEOM_R1', 'not_correlated__GEOM_R2',
       'not_correlated__GEOM_R3', 'not_correlated__GEOM_R4',
       'not_correlated__CONTRUCTIONYEAR',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_9',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_8',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_7',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_6',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_5',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_4',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_3',
       'cadastral_ordinal_encoder_onehot__CADASTRALQUALITYID_2',
       'cadastral_ordinal_encoder_oneho

In [17]:
# (rows, columns) remaining after the correlated columns were removed.
dropped_highly.shape

(82584, 31)

In [18]:
# Resampler configured with the same min/max sample bounds given to SoilClassifier.
imbt = ImbalanceTransformer(min_samples=min_samples, max_samples=max_samples)

In [19]:
# Resample the reduced feature frame together with its targets; the recorded
# shapes show 82584 rows going in and 19597 rows coming out.
X,y = imbt.fit_resample(dropped_highly, df[TARGET_FEATURE])

In [20]:
# (rows, columns) of the resampled feature matrix.
X.shape

(19597, 31)

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters, fitted on the resampled data.
# NOTE(review): no random_state is passed (the repr below shows random_state=None),
# so repeated runs are not guaranteed to be identical — consider fixing a seed.
gb = GradientBoostingClassifier()
gb.fit(X, y)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [23]:
from sklearn.metrics import *
def calculate_metrics(y_true, y_pred):
    """Score predictions against the true labels with several metrics.

    :param y_true: the ground-truth labels
    :param y_pred: the predicted labels
    :return: metric name mapped to its value: accuracy, balanced accuracy,
        and precision / recall / F1 with both macro and weighted averaging
    :rtype: dict
    """
    def averaged(scorer, average):
        # Apply an averaged scorer to this call's labels and predictions.
        return scorer(y_true, y_pred, average=average)

    scores = {}
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    scores['precision_macro'] = averaged(precision_score, 'macro')
    scores['precision_weighted'] = averaged(precision_score, 'weighted')
    scores['recall_macro'] = averaged(recall_score, 'macro')
    scores['recall_weighted'] = averaged(recall_score, 'weighted')
    scores['f1_macro'] = averaged(f1_score, 'macro')
    scores['f1_weighted'] = averaged(f1_score, 'weighted')
    return scores

In [24]:
# In-sample scores: gb is evaluated on the same resampled X, y it was fitted on.
calculate_metrics(y, gb.predict(X))

{'accuracy': 0.6925549829055467,
 'balanced_accuracy': 0.5594842662071581,
 'precision_macro': 0.695506908938056,
 'precision_weighted': 0.6855671051906608,
 'recall_macro': 0.5594842662071581,
 'recall_weighted': 0.6925549829055467,
 'f1_macro': 0.590695713517712,
 'f1_weighted': 0.6627500799499345}

In [29]:
# Push the test frame through the fitted feature pipeline and label its columns.
t_df_test = pd.DataFrame(
    pipe.transform(df_test),
    columns=pipe.named_steps['features'].get_feature_names(),
)
# Predict on the same reduced column set gb was trained on, then score.
test_predictions = gb.predict(t_df_test.drop(columns=to_drop))
calculate_metrics(df_test[TARGET_FEATURE], test_predictions)

{'accuracy': 0.8579385837450354,
 'balanced_accuracy': 0.4714332731665511,
 'precision_macro': 0.4519462943927353,
 'precision_weighted': 0.8746145673410394,
 'recall_macro': 0.4714332731665511,
 'recall_weighted': 0.8579385837450354,
 'f1_macro': 0.44710731468950404,
 'f1_weighted': 0.8636437006494757}

In [22]:
# Metrics stored on the SoilClassifier; the recorded output has only a 'train' entry here.
model.metrics

{'train': {'accuracy': 0.8645258161387194,
  'custom_accuracy': 0.6843253705115877,
  'balanced_accuracy': 0.5587315929133986,
  'precision_macro': 0.516373940354464,
  'precision_weighted': 0.883704681315883,
  'recall_macro': 0.5587315929133986,
  'recall_weighted': 0.8645258161387194,
  'f1_macro': 0.5174336119273506,
  'f1_weighted': 0.8706946954115227}}

In [30]:
# Evaluate the full SoilClassifier on the test frame; model.metrics shown in the
# next cell contains a 'test' entry after this call.
model.evaluate(df_test, df_test[TARGET_FEATURE])

In [31]:
# Same attribute after model.evaluate: the recorded output now has 'train' and 'test'.
model.metrics

{'train': {'accuracy': 0.8645258161387194,
  'custom_accuracy': 0.6843253705115877,
  'balanced_accuracy': 0.5587315929133986,
  'precision_macro': 0.516373940354464,
  'precision_weighted': 0.883704681315883,
  'recall_macro': 0.5587315929133986,
  'recall_weighted': 0.8645258161387194,
  'f1_macro': 0.5174336119273506,
  'f1_weighted': 0.8706946954115227},
 'test': {'accuracy': 0.8575510994865834,
  'custom_accuracy': 0.6526893452099854,
  'balanced_accuracy': 0.4927809653044917,
  'precision_macro': 0.46682769092666104,
  'precision_weighted': 0.8760241555378591,
  'recall_macro': 0.4927809653044917,
  'recall_weighted': 0.8575510994865834,
  'f1_macro': 0.4637404246862448,
  'f1_weighted': 0.8638738592777445}}

In [33]:
# Feature importances reported by the fitted model, largest first.
model.get_feature_importances().sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
32,log_area__log_AREA,0.247576
11,not_correlated__Q_NIR_8_0_1,0.100917
15,not_correlated__GEOM_R2,0.072901
16,not_correlated__GEOM_R3,0.065027
1,not_correlated__Y,0.057786
0,not_correlated__X,0.056298
10,not_correlated__Q_B_2_1_0,0.046219
9,not_correlated__Q_B_2_0_5,0.037388
12,not_correlated__Q_NIR_8_0_5,0.032232
35,squared_geoms__quad_GEOM_R2,0.031341
