In [1]:
# Environment bootstrap: prepend two directories to sys.path (a virtualenv
# site-packages folder and the project root) before the third-party and
# `src.*` imports below are executed.
# NOTE(review): both paths are absolute and machine-specific; on any other
# machine they must be edited (or replaced by a proper package install).
import sys
import os
sys.path.insert(0, os.path.abspath('/home/maldo/projects/virtualenvs/datathon/lib/python3.7/site-packages'))
sys.path.insert(0, os.path.abspath('/home/maldo/projects/datathon'))
# NOTE(review): the wildcard imports below make it impossible to tell from the
# notebook alone where a given name comes from; prefer explicit imports.
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.combine import *
from src.models.model import SoilClassifier
# TARGET_FEATURE, used by later cells, is presumably provided by this wildcard
# import -- TODO confirm against src/constants.
from src.constants import *
from sklearn.base import clone
from src.features.features import ImbalanceTransformer
import pandas as pd
import seaborn as sns
import math


In [2]:
# Experiment configuration consumed by the SoilClassifier constructor call in
# the following cell.
classifier = 'gradient_boosting'
features = [
    'not_transformed',
    'squared_coordinates',
    'cadastral_ordinal_encoder_onehot',
    'log_area',
    'savi',
    'pssr',
    'evi',
    'evi2',
]
min_samples = 500
max_samples = 9000

# Training data; the path is relative to the notebook's working directory.
df = pd.read_csv(os.path.join('../data/processed', 'train_data.csv'))

In [3]:
# Instantiate the project classifier from the configuration values set above.
model = SoilClassifier(
    feature_names=features,
    classifier=classifier,
    min_samples=min_samples,
    max_samples=max_samples,
)

# The whole frame is passed as X and its TARGET_FEATURE column as y.
# NOTE(review): X therefore still contains the target column; presumably the
# pipeline only selects its configured feature keys -- TODO confirm.
model.fit(df, df[TARGET_FEATURE])


In [4]:
# Display the metrics dictionary stored on the fitted model (the captured
# output below contains a single 'train' entry).
model.metrics

{'train': {'accuracy': 0.8670202460525042,
  'custom_accuracy': 0.689416796525314,
  'balanced_accuracy': 0.5651608049968951,
  'precision_macro': 0.5308227110255096,
  'precision_weighted': 0.8859004547859511,
  'recall_macro': 0.5651608049968951,
  'recall_weighted': 0.8670202460525042,
  'f1_macro': 0.5287893110814682,
  'f1_weighted': 0.8730152025769694}}

In [5]:
# Feature importances sorted from highest to lowest, truncated to 20 rows.
(
    model.get_feature_importances()
    .sort_values('importance', ascending=False)
    .head(20)
)

Unnamed: 0,feature,importance
66,log_area__log_AREA,0.206293
33,not_transformed__Q_B_2_0_9,0.13676
47,not_transformed__GEOM_R2,0.113321
48,not_transformed__GEOM_R3,0.077326
46,not_transformed__GEOM_R1,0.043942
51,squared_coordinates__quad_X,0.04247
1,not_transformed__Y,0.035588
0,not_transformed__X,0.033727
50,not_transformed__CONTRUCTIONYEAR,0.027423
36,not_transformed__Q_NIR_8_0_1,0.024273


In [6]:
# Hand the fitted model to its own dump method with the target file name.
# Presumably this serialises the model to 'aux_model.pkl' in the current
# working directory -- see SoilClassifier.dump to confirm.
model.dump('aux_model.pkl')

In [10]:
# Rebind `model` to a fresh SoilClassifier built with default arguments; the
# instance fitted earlier is no longer reachable through this name.
model = SoilClassifier()

In [11]:
# Load from the file written by the earlier dump cell.
# NOTE(review): presumably this restores state in place on `model` (the return
# value is not assigned) -- confirm against SoilClassifier.load.
model.load('aux_model.pkl')

In [12]:
# Work on a clone so that the steps removed here are not removed from
# model.pipeline itself.
pipe = clone(model.pipeline)

# Pop the last two steps off the clone: the final step first (kept as `clsf`),
# then the one before it (kept as `imbt`). Each is a (name, step) tuple.
# Presumably these are the classifier and the imbalance sampler -- TODO confirm.
clsf, imbt = pipe.steps.pop(), pipe.steps.pop()

In [13]:
# Inspect the steps that remain on the truncated pipeline (the captured output
# below lists 'scaler' and 'features').
pipe.steps

[('scaler',
  ScalerTransformer(keys=['X', 'Y', 'Q_R_4_0_0', 'Q_R_4_0_1', 'Q_R_4_0_2',
                          'Q_R_4_0_3', 'Q_R_4_0_4', 'Q_R_4_0_5', 'Q_R_4_0_6',
                          'Q_R_4_0_7', 'Q_R_4_0_8', 'Q_R_4_0_9', 'Q_R_4_1_0',
                          'Q_G_3_0_0', 'Q_G_3_0_1', 'Q_G_3_0_2', 'Q_G_3_0_3',
                          'Q_G_3_0_4', 'Q_G_3_0_5', 'Q_G_3_0_6', 'Q_G_3_0_7',
                          'Q_G_3_0_8', 'Q_G_3_0_9', 'Q_G_3_1_0', 'Q_B_2_0_0',
                          'Q_B_2_0_1', 'Q_B_2_0_2', 'Q_B_2_0_3', 'Q_B_2_0_4',
                          'Q_B_2_0_5', ...])),
 ('features', FeatureUnion(n_jobs=None,
               transformer_list=[('not_transformed',
                                  DataFrameIndexSelector(keys=['X', 'Y',
                                                               'Q_R_4_0_0',
                                                               'Q_R_4_0_1',
                                                               'Q_R_4_0_2',
    

In [14]:
# Fit only the remaining preprocessing steps on the training frame, so that
# pipe.transform can be called in the next cell.
pipe.fit(df, df[TARGET_FEATURE])

Pipeline(memory=None,
         steps=[('scaler',
                 ScalerTransformer(keys=['X', 'Y', 'Q_R_4_0_0', 'Q_R_4_0_1',
                                         'Q_R_4_0_2', 'Q_R_4_0_3', 'Q_R_4_0_4',
                                         'Q_R_4_0_5', 'Q_R_4_0_6', 'Q_R_4_0_7',
                                         'Q_R_4_0_8', 'Q_R_4_0_9', 'Q_R_4_1_0',
                                         'Q_G_3_0_0', 'Q_G_3_0_1', 'Q_G_3_0_2',
                                         'Q_G_3_0_3', 'Q_G_3_0_4', 'Q_G_3_0_5',
                                         'Q_G_3_0_6', 'Q_G_3_0_7', 'Q_G_3_0_8',
                                         'Q_G_3_0_9', 'Q_G_3_1_0', 'Q_B_2_0_0',
                                         'Q_B...
                                                 OneHotOrdinalEncoder(category_orders=[['9',
                                                                                        '8',
                                                                               

In [19]:
# Run the fitted preprocessing steps and label the resulting columns with the
# names reported by the 'features' step.
preprocessed_df = pd.DataFrame(
    pipe.transform(df),
    columns=pipe.named_steps['features'].get_feature_names(),
)

# `imbt` is the (name, step) tuple popped from the pipeline earlier; index 1 is
# the step object itself, which is used to resample features and target.
X, y = imbt[1].fit_resample(preprocessed_df, df[TARGET_FEATURE])

In [24]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import GradientBoostingClassifier

# Clone of the model pipeline with its first two steps removed.
# NOTE(review): `estimator` is not passed to RFECV below, which wraps a fresh
# default GradientBoostingClassifier instead; kept in case later cells use it.
estimator = clone(model.pipeline)
estimator.steps.pop(0)
estimator.steps.pop(0)

# Recursive feature elimination with cross-validation, scored by balanced
# accuracy. Fix: the original was missing the comma after `step = 5`, which
# made this cell a SyntaxError.
# NOTE(review): the stored output of the fit cell below shows the feature count
# dropping by 3 per iteration and cv=None, so it was produced with different
# settings than the ones written here.
rfcev = RFECV(
    estimator=GradientBoostingClassifier(),
    scoring='balanced_accuracy',
    verbose=5,
    step=5,
    cv=3,
)

In [30]:
# Run the recursive feature elimination on the resampled data. With verbose
# set on the selector this prints one "Fitting estimator with N features."
# line per elimination step, as seen in the captured output below.
rfcev.fit(X,y)

Fitting estimator with 71 features.
Fitting estimator with 68 features.
Fitting estimator with 65 features.
Fitting estimator with 62 features.
Fitting estimator with 59 features.
Fitting estimator with 56 features.
Fitting estimator with 53 features.
Fitting estimator with 50 features.
Fitting estimator with 47 features.
Fitting estimator with 44 features.
Fitting estimator with 41 features.
Fitting estimator with 38 features.
Fitting estimator with 35 features.
Fitting estimator with 32 features.
Fitting estimator with 29 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Fitting estimator with 20 features.
Fitting estimator with 17 features.
Fitting estimator with 14 features.
Fitting estimator with 11 features.
Fitting estimator with 8 features.
Fitting estimator with 5 features.
Fitting estimator with 2 features.
Fitting estimator with 71 features.
Fitting estimator with 68 features.
Fitting estimator with 65 features.
Fitting estimator with 62 featu

RFECV(cv=None,
      estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_rate=0.1, loss='deviance',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=100,
                                           n_iter_no_change=None,
                                           presort='deprecated',
                                           random_state=None, subsample=1.0,
                      

In [48]:
# Collect the fitted selector's results together with the column names of the
# preprocessed frame in a single dictionary.
# NOTE(review): despite the name, the values are numpy arrays / a pandas Index,
# so this dictionary is not directly JSON-serialisable.
rfcev_json = dict(
    n_features=rfcev.n_features_,
    support=rfcev.support_,
    grid_scores=rfcev.grid_scores_,
    ranking=rfcev.ranking_,
    feature_names=preprocessed_df.columns,
)

In [49]:
# Display the collected RFECV results.
rfcev_json

{'n_features': '62',
 'support': array([ True,  True,  True,  True,  True, False,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True, False,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True, False, False,  True,
         True, False,  True,  True,  True,  True,  True,  True]),
 'grid_scores': array([0.23426916, 0.39998062, 0.4462992 , 0.4700991 , 0.48636802,
        0.49564523, 0.49563122, 0.5010122 , 0.50232711, 0.50080898,
        0.50272978, 0.50320109, 0.50390016, 0.502751  , 0.50430906,
        0.5042379 , 0.50558094, 0.50468823, 0.50627672, 0.50610708,
        0.50703877, 0.50705004, 0.50682003, 0.504485  , 0.5059598 ]),
 'ranking': array([1, 1, 1, 1, 1, 2, 1, 3, 1, 1,

In [52]:
import pickle

# Persist the RFECV results. The context manager guarantees the file handle is
# flushed and closed; the original passed a bare open(...) to pickle.dump and
# never closed it.
# NOTE(review): the file has a .json extension but contains a binary pickle;
# it must be read back with pickle.load, not a JSON parser. Path left unchanged
# so existing readers keep working.
with open('../src/experiments/rfecv.json', 'wb') as results_file:
    pickle.dump(rfcev_json, results_file)

In [55]:
# Pair every preprocessed column name with the selector's keep/drop flag.
support_df = pd.DataFrame(
    dict(features=preprocessed_df.columns, support=rfcev.support_)
)

# Show only the features that RFECV did not select (support is False).
support_df.loc[~support_df['support']]

Unnamed: 0,features,support
5,not_transformed__Q_R_4_0_3,False
7,not_transformed__Q_R_4_0_5,False
19,not_transformed__Q_G_3_0_6,False
20,not_transformed__Q_G_3_0_7,False
41,not_transformed__Q_NIR_8_0_6,False
53,cadastral_ordinal_encoder_onehot__CADASTRALQUA...,False
60,cadastral_ordinal_encoder_onehot__CADASTRALQUA...,False
61,cadastral_ordinal_encoder_onehot__CADASTRALQUA...,False
64,cadastral_ordinal_encoder_onehot__CADASTRALQUA...,False


In [33]:
# Boolean mask over the input features: True where RFECV kept the feature.
rfcev.support_

array([ True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True, False,  True,  True,  True,  True,  True,  True])

In [34]:
# Per-feature elimination ranking: selected features have rank 1, larger
# values mark features that were eliminated.
rfcev.ranking_

array([1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 4, 1,
       1, 1, 1, 1, 1])