Copyright 2019 Simon Grest

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [155]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import lightgbm as lgbm
import xgboost as xgb
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from category_encoders import TargetEncoder

from pathlib import Path

from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta

from time import time

from pprint import pprint

from sklearn.metrics import average_precision_score
from sklearn.metrics import make_scorer

from imblearn.over_sampling import SMOTE, SMOTENC

In [156]:
data_path = Path('../data')

In [157]:
features_df = pd.read_csv(data_path/'features.csv')

In [158]:
categorical_cols = ['Subregion','tile','side_of_river','Crop_Id_Ne']

In [159]:
dtypes = {c:'category' for c in categorical_cols}



In [160]:
features_df = pd.read_csv(data_path/'features.csv', dtype=dtypes)

In [161]:
cols_to_drop = ['Crop_Id_Ne',
                'Field_Id',
                'train_test',
                'geometry']

### Test train split

In [162]:
features_dummies_df = pd.get_dummies(features_df.drop(cols_to_drop,axis=1))

X = features_dummies_df[features_df.train_test == 'train']#.drop(cols_to_drop, axis=1).values

y = features_df[features_df.train_test == 'train'].Crop_Id_Ne.values

X_test = features_dummies_df[features_df.train_test == 'test']#.drop(cols_to_drop, axis=1).values

### Synthetic Minority Oversampling

In [165]:
categorical_idx = list(range(373,386))

In [166]:
sm = SMOTENC(random_state=42, categorical_features=categorical_idx)
X_res, y_res = sm.fit_resample(X, y)

In [167]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
skf.get_n_splits(X,y)

5

In [168]:
lgbs=[]

i=0
for train_index, valid_index in skf.split(X,y):
    i=i+1
    print(f'Fold: {i}')
    X_train, X_valid = X_res[train_index], X_res[valid_index]
    y_train, y_valid = y_res[train_index], y_res[valid_index]
    lgb = lgbm.LGBMClassifier(n_estimators=100000, 
                              categorical_feature=categorical_idx,
                              class_weight='balanced',
                              max_bin=5,
                              num_leaves=15,
                              min_data_in_leaf=140,
                              min_sum_hessian_in_leaf=2,
                              bagging_fraction=0.7,
                              bagging_freq=1,
                              feature_fraction=0.4,
                              lambda_l1=0.6,
                              lambda_l2=0,
                              min_gain_to_split=0.01,
                              max_depth=15
                             )
    lgb.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],early_stopping_rounds=100,verbose=100)
    
    lgbs.append(lgb)

Fold: 1
Training until validation scores don't improve for 100 rounds.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


[100]	training's multi_logloss: 0.349067	valid_1's multi_logloss: 0.749376
[200]	training's multi_logloss: 0.141859	valid_1's multi_logloss: 0.65784
[300]	training's multi_logloss: 0.0780418	valid_1's multi_logloss: 0.64473
Early stopping, best iteration is:
[285]	training's multi_logloss: 0.083862	valid_1's multi_logloss: 0.642812
Fold: 2
Training until validation scores don't improve for 100 rounds.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


[100]	training's multi_logloss: 0.355661	valid_1's multi_logloss: 0.706957
[200]	training's multi_logloss: 0.1449	valid_1's multi_logloss: 0.600255
[300]	training's multi_logloss: 0.0789128	valid_1's multi_logloss: 0.569041
[400]	training's multi_logloss: 0.0547281	valid_1's multi_logloss: 0.566257
Early stopping, best iteration is:
[350]	training's multi_logloss: 0.0642853	valid_1's multi_logloss: 0.563661
Fold: 3
Training until validation scores don't improve for 100 rounds.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


[100]	training's multi_logloss: 0.353867	valid_1's multi_logloss: 0.746341
[200]	training's multi_logloss: 0.144302	valid_1's multi_logloss: 0.643825
[300]	training's multi_logloss: 0.0789984	valid_1's multi_logloss: 0.62168
[400]	training's multi_logloss: 0.0547769	valid_1's multi_logloss: 0.625405
Early stopping, best iteration is:
[330]	training's multi_logloss: 0.069104	valid_1's multi_logloss: 0.620572
Fold: 4
Training until validation scores don't improve for 100 rounds.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


[100]	training's multi_logloss: 0.342076	valid_1's multi_logloss: 0.785676
[200]	training's multi_logloss: 0.138338	valid_1's multi_logloss: 0.6728
[300]	training's multi_logloss: 0.0753395	valid_1's multi_logloss: 0.652876
Early stopping, best iteration is:
[291]	training's multi_logloss: 0.0787718	valid_1's multi_logloss: 0.651867
Fold: 5
Training until validation scores don't improve for 100 rounds.


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


[100]	training's multi_logloss: 0.346704	valid_1's multi_logloss: 0.810489
[200]	training's multi_logloss: 0.13937	valid_1's multi_logloss: 0.697951
[300]	training's multi_logloss: 0.0766587	valid_1's multi_logloss: 0.673818
[400]	training's multi_logloss: 0.0537375	valid_1's multi_logloss: 0.676622
Early stopping, best iteration is:
[308]	training's multi_logloss: 0.0740368	valid_1's multi_logloss: 0.671885


In [169]:

xgbs=[]

i=0
for train_index, valid_index in skf.split(X,y):
    i=i+1
    print(f'Fold: {i}')
    X_train, X_valid = X_res[train_index], X_res[valid_index]
    y_train, y_valid = y_res[train_index], y_res[valid_index]
    xg = xgb.XGBClassifier(n_estimators=10000)#,colsample_bytree=0.3)
    xg.fit(X_train, y_train, eval_set=[(X_train,y_train), (X_valid,y_valid)],
           eval_metric='mlogloss', early_stopping_rounds=30,verbose=10)
    
    xg.fit(X_train, y_train)
    xgbs.append(xg)

Fold: 1
[0]	validation_0-mlogloss:1.95111	validation_1-mlogloss:1.97171
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 30 rounds.
[10]	validation_0-mlogloss:0.996578	validation_1-mlogloss:1.11326
[20]	validation_0-mlogloss:0.662931	validation_1-mlogloss:0.841566
[30]	validation_0-mlogloss:0.490023	validation_1-mlogloss:0.727141
[40]	validation_0-mlogloss:0.383869	validation_1-mlogloss:0.66741
[50]	validation_0-mlogloss:0.308606	validation_1-mlogloss:0.632837
[60]	validation_0-mlogloss:0.25364	validation_1-mlogloss:0.614419
[70]	validation_0-mlogloss:0.211095	validation_1-mlogloss:0.600399
[80]	validation_0-mlogloss:0.177072	validation_1-mlogloss:0.591592
[90]	validation_0-mlogloss:0.148631	validation_1-mlogloss:0.586631
[100]	validation_0-mlogloss:0.126018	validation_1-mlogloss:0.582826
[110]	validation_0-mlogloss:0.107709	validation_1-mlogloss:0.579148
[120]	validation_0-mloglo

KeyboardInterrupt: 

### Feature importance

In [171]:
importances = []
for k, l in enumerate(lgbs):
    importances.append(pd.DataFrame(
        {'model': f'lgbm fold {k}',
         'feature': features_dummies_df.columns, 
         'importance': l.feature_importances_}))
    
for k, x in enumerate(xgbs):
    importances.append(pd.DataFrame(
        {'model': f'xgboost fold {k}',
         'feature': features_dummies_df.columns, 
         'importance': x.feature_importances_}))


In [172]:
importance_df = pd.concat(importances, axis=0).groupby('feature').importance.sum().reset_index()
importance_df['importance %'] = importance_df.importance/importance_df.importance.sum()

In [173]:
importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance,importance %
1,Field_Id,903.170457,1.268569e-02
10,argmax_b4_mode_0,611.038167,8.582479e-03
213,composite_min_ndvi_lbp_std_4,535.006920,7.514564e-03
244,ndvi_t_med_median_3,503.038382,7.065543e-03
15,argmin_ndvi_slope_mode_0,480.004848,6.742020e-03
20,blue_t_med_median_4,474.022525,6.657994e-03
233,corner_count,454.004911,6.376832e-03
267,nir_t_med_median_3,434.026863,6.096225e-03
253,nearby_crop_4_prop,398.014376,5.590404e-03
272,norm_t_med_median_3,386.094903,5.422986e-03


### Inference

In [188]:
all_preds = []

for clf in [*lgbs]:
    all_preds.append(clf.predict_proba(X_test.values))
    
for clf in [*xgbs]:
    all_preds.append(clf.predict_proba(X_test.values))

In [189]:
preds=np.stack(all_preds).mean(axis=0)

In [190]:
pd.read_csv(data_path/'sample_submission_fixed.csv').head(1)

Unnamed: 0,field_id,crop_id_1,crop_id_2,crop_id_3,crop_id_4,crop_id_5,crop_id_6,crop_id_7,crop_id_8,crop_id_9
0,5,0.00321,0.4321,0.677,0.1323,0.97,0.432,0.432,0.123,0.432


In [191]:
field_ids = features_df[features_df.train_test=='test'].Field_Id.values

In [192]:
preds_df = pd.DataFrame(preds)
preds_df.columns = [f'crop_id_{c+1}' for c in preds_df.columns]
preds_df['field_id'] = field_ids

In [193]:
predictions = preds_df[['field_id',
          'crop_id_1',
          'crop_id_2',
          'crop_id_3',
          'crop_id_4',
          'crop_id_5',
          'crop_id_6',
          'crop_id_7',
          'crop_id_8',
          'crop_id_9']]

In [194]:
predictions.to_csv(data_path/'submission.csv',index=False)