In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import lightgbm as lgbm
import xgboost as xgb
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from category_encoders import TargetEncoder

from pathlib import Path

### include gps coords
#include perimeter
#one hot encode the tile
#one hot encode the region

#tabular model
#represent each answer at each time and use recurrent model

#make categorical properly categorical
#mean target encoding of nearest farm ids and subregion and tile (category_encoders.target_encoder.TargetEncoder)
### imblearn.over_sampling.SMOTE
### https://github.com/h2oai/pystacknet
### https://github.com/sentinel-hub/sentinelhub-py

In [2]:
data_path = Path('data')

In [12]:
categorical_cols = ['Crop_Id_Ne_nf_1',
                     'Crop_Id_Ne_nf_2',
                     'Crop_Id_Ne_nf_3',
                     'Crop_Id_Ne_nf_4',
                     'Crop_Id_Ne_nf_5',
                     'Crop_Id_Ne_nf_6',
                     'Crop_Id_Ne_nf_7',
                     'Crop_Id_Ne_nf_8',
                     'Crop_Id_Ne_nf_9',
                     'Crop_Id_Ne_nf_10',
                     'Subregion',
                     'tile',
                     'Crop_Id_Ne']

In [13]:
dtypes = {c:'category' for c in categorical_cols}



In [14]:
features_df = pd.read_csv(data_path/'features.csv', dtype=dtypes)

In [6]:
cols_to_drop = ['Crop_Id_Ne',
                'Field_Id',
                'Field_Id_1',
                'Field_Id_2',
                'Field_Id_3',
                'Field_Id_4',
                'Field_Id_5',
                'Field_Id_6',
                'Field_Id_7',
                'Field_Id_8',
                'Field_Id_9',
                'Field_Id_10',
                'train_test']

bands_to_drop = ['B11',
                'B01',
                'B12',
                'B09',
                'B07',
                'B8A',
                'B10']

### Create ratios month on month

In [8]:
dates    = sorted(list({c.split('_')[-2] for c in features_df.columns if '2017' in c}))
features = sorted(list({c.split('_')[0] for c in features_df.columns if '2017' in c}))
bands    = sorted(list({c.split('_')[-1] for c in features_df.columns if '2017' in c}))

In [9]:
for b in bands:
    for f in features:
        for d1, d2 in zip(dates[1:11], dates[0:10]):
            r_name = f'ratio_{f}_{d1}_{b}'
            d1_name = f'{f}_{d1}_{b}'
            d2_name = f'{f}_{d2}_{b}'
            if d1_name in features_df.columns:
                features_df[r_name] = features_df[d1_name]/ features_df[d2_name]

In [47]:
X = features_df[features_df.train_test == 'train'].drop(cols_to_drop, axis=1)

y = features_df[features_df.train_test == 'train'].Crop_Id_Ne

X_test = features_df[features_df.train_test == 'test'].drop(cols_to_drop, axis=1)

In [23]:
lgbm.LGBMClassifier()

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

- Use small max_bin
- Use small num_leaves
- Use min_data_in_leaf and min_sum_hessian_in_leaf
- Use bagging by set bagging_fraction and bagging_freq
- Use feature sub-sampling by set feature_fraction
- Use bigger training data
- Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
- Try max_depth to avoid growing deep tree


In [63]:
kf = KFold(n_splits=6,shuffle=True,random_state=42)
kf.get_n_splits(X)

lgbs=[]

i=0
for train_index, valid_index in kf.split(X):
    i=i+1
    print(f'Fold: {i}')
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    lgb = lgbm.LGBMClassifier(n_estimators=10000, categorical_feature=categorical_cols)
    lgb.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],early_stopping_rounds=30,verbose=10)
    
    lgb.fit(X_train, y_train)
    lgbs.append(lgb)

Fold: 1
Training until validation scores don't improve for 30 rounds.
[10]	training's multi_logloss: 0.829558	valid_1's multi_logloss: 1.10791
[20]	training's multi_logloss: 0.398269	valid_1's multi_logloss: 0.818524
[30]	training's multi_logloss: 0.197084	valid_1's multi_logloss: 0.708878
[40]	training's multi_logloss: 0.0980097	valid_1's multi_logloss: 0.669264
[50]	training's multi_logloss: 0.0492204	valid_1's multi_logloss: 0.6628
[60]	training's multi_logloss: 0.0248418	valid_1's multi_logloss: 0.67319
[70]	training's multi_logloss: 0.0127043	valid_1's multi_logloss: 0.697363
[80]	training's multi_logloss: 0.00653555	valid_1's multi_logloss: 0.724165
Early stopping, best iteration is:
[50]	training's multi_logloss: 0.0492204	valid_1's multi_logloss: 0.6628
Fold: 2
Training until validation scores don't improve for 30 rounds.
[10]	training's multi_logloss: 0.827234	valid_1's multi_logloss: 1.15124
[20]	training's multi_logloss: 0.396375	valid_1's multi_logloss: 0.869673
[30]	traini

KeyboardInterrupt: 

In [20]:
kf = KFold(n_splits=6,shuffle=True,random_state=42)
kf.get_n_splits(X)

xgbs=[]

i=0
for train_index, valid_index in kf.split(X):
    i=i+1
    print(f'Fold: {i}')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    xg = xgb.XGBClassifier(n_estimators=10000,colsample_bytree=0.3)
    xg.fit(X_train, y_train, eval_set=[(X_train,y_train), (X_valid,y_valid)],
           eval_metric='mlogloss', early_stopping_rounds=30,verbose=10)
    
    xg.fit(X_train, y_train)
    xgbs.append(xg)

Fold: 1


KeyError: "None of [Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,\n            ...\n            2479, 2482, 2484, 2485, 2486, 2487, 2488, 2490, 2491, 2493],\n           dtype='int64', length=2078)] are in the [columns]"

In [None]:
kf = KFold(n_splits=6,shuffle=True,random_state=42)
kf.get_n_splits(X)

cbs=[]

i=0
for train_index, valid_index in kf.split(X):
    i=i+1
    print(f'Fold: {i}')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    cb = CatBoostClassifier(n_estimators=10000)
    cb.fit(X_train, y_train, eval_set=[(X_valid,y_valid)],
           early_stopping_rounds=30,verbose=10)
    
    cb.fit(X_train, y_train)
    cbs.append(cb)

### Feature importance

In [None]:
l = lgbs[0]

In [None]:
importance_df = pd.DataFrame(
    {'feature': features_df.drop(cols_to_drop, axis=1).columns, 
     'importance': l.feature_importances_})

In [None]:
importance_df['feature_type'] = importance_df.feature.str.split('_').apply(lambda x: x[0]+"_"+x[2] if len(x) > 2 else x[0])

In [None]:
importance_feat_type_df = importance_df.groupby(
    'feature_type').importance.sum().sort_values(ascending=False).reset_index()

In [None]:
importance_df.to_csv('feature_importances.csv')

### Inference

In [None]:
all_preds = []

for clf in [*lgbs]:
    all_preds.append(clf.predict_proba(X_test))

In [None]:
preds=np.stack(all_preds).mean(axis=0)

In [None]:
pd.read_csv(data_path/'sample_submission_fixed.csv').head(1)

In [None]:
field_ids = features_df[features_df.train_test=='test'].Field_Id.values

In [None]:
preds_df = pd.DataFrame(preds)
preds_df.columns = [f'crop_id_{c+1}' for c in preds_df.columns]
preds_df['field_id'] = field_ids

In [None]:
predictions = preds_df[['field_id',
          'crop_id_1',
          'crop_id_2',
          'crop_id_3',
          'crop_id_4',
          'crop_id_5',
          'crop_id_6',
          'crop_id_7',
          'crop_id_8',
          'crop_id_9']]

In [None]:
predictions.to_csv('statistics_20190901_2131.csv',index=False)

In [None]:
predictions.head()