In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy import stats, interpolate
from shapely.geometry import shape
from functools import reduce
import json

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import recall_score, precision_score
from sklearn.utils import shuffle
from sklearn.multiclass import OutputCodeClassifier

import xgboost as xgb
import lightgbm as lgbm

In [8]:
def preprocess(data,
               train=True,
               scaling=False,
               random_state=None):
    """Turn the raw field table into a model-ready feature frame (and target).

    The returned feature columns are, in order: every time-series column
    (all columns of ``data`` other than ``'area'``, ``'.geo'`` and ``'crop'``,
    sorted by name), then ``'x'`` and ``'y'`` (centre of the bounding box of
    the ``'.geo'`` GeoJSON geometry), then ``'area'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'area'`` and ``'.geo'`` (a GeoJSON geometry string);
        must also contain ``'crop'`` when ``train`` is true.
    train : bool, default True
        If true, ``'crop'`` is split off as the target and the rows are
        shuffled. If false, no target column is required: rows keep their
        input order and the returned target is ``None``.
    scaling : bool, default False
        If true, min-max scale every time-series column independently.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``; pass an int to make the
        shuffled row order reproducible. Unused when ``train`` is false.

    Returns
    -------
    features : pandas.DataFrame
        Indexed like ``data`` (row order shuffled in training mode).
    target : pandas.Series or None
        The ``'crop'`` column aligned with ``features``, or ``None`` when
        ``train`` is false.
    """
    target_name = 'crop'
    geo_name = '.geo'
    area_name = 'area'

    def bbox_center(geojson_text):
        # shapely's ``bounds`` is (minx, miny, maxx, maxy).
        min_x, min_y, max_x, max_y = shape(json.loads(geojson_text)).bounds
        return ((min_x + max_x) / 2, (min_y + max_y) / 2)

    # Index.difference also sorts the remaining column names, which fixes the
    # order of the time-series features regardless of the input column order.
    ts_cols = data.columns.difference([area_name, geo_name, target_name]).to_list()

    features_ts = data[ts_cols].copy()
    if scaling:
        # min()/max() reduce per column, so each series is scaled on its own.
        # A constant column yields 0/0 and therefore NaN.
        col_min = features_ts.min()
        col_max = features_ts.max()
        features_ts = (features_ts - col_min) / (col_max - col_min)

    centers = data[geo_name].map(bbox_center)
    features_geo = pd.DataFrame(centers.to_list(), columns=['x', 'y'], index=data.index)

    features = features_ts.join(features_geo).join(data[[area_name]])

    if not train:
        # Inference mode: there is no label to attach, and keeping the input
        # order lets predictions be matched back to the rows they came from.
        return features, None

    # Shuffle features and label together so they stay aligned row by row.
    labelled = shuffle(features.join(data[[target_name]]), random_state=random_state)
    return labelled[features.columns], labelled[target_name]

In [9]:
# Paths to the competition CSVs; test_file is only declared here and is not
# read in this part of the notebook.
train_file = 'data/train_dataset_train.csv'
test_file = 'data/test_dataset_test.csv'

# Read the training table, order its columns by name and use 'id' as row index.
data = (
    pd.read_csv(train_file)
    .sort_index(axis=1)
    .set_index('id')
)
features, target = preprocess(data)

In [15]:
def recall(pred_y, target_y):
    """Print the macro-averaged recall of ``pred_y`` against ``target_y``.

    Argument order is predictions first, ground truth second (the reverse of
    ``recall_score``). ``zero_division=0`` is passed through to
    ``recall_score``. Nothing is returned; the score is only printed.
    """
    score = recall_score(target_y, pred_y, average="macro", zero_division=0)
    print(f'Recall: {score}')

In [11]:
# Stratified 80/20 hold-out split. random_state pins the partition chosen by
# train_test_split, so validation scores can be compared between runs (the
# split is only identical when `features` arrives with the same row order).
train_X, valid_X, train_Y, valid_Y = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=42,
)

In [12]:
# Keyword arguments unpacked into the XGBoost classifier.
xgb_params = dict(
    colsample_bytree=0.5,
    eta=0.05,
    gamma=0.3,
    max_depth=5,
    min_child_weight=3,
)

# Keyword arguments unpacked into the first LightGBM classifier.
lgbm_params = dict(
    boosting='goss',
    feature_fraction=0.9,
    learning_rate=0.01,
    max_depth=10,
    min_data_in_leaf=15,
    num_leaves=31,
    reg_alpha=0,
    reg_lambda=0,
)

In [13]:
# Three boosted-tree classifiers with 500 estimators each.

# XGBoost configured from xgb_params.
clf_1 = xgb.XGBClassifier(
    n_estimators=500,
    **xgb_params,
)

# LightGBM configured from lgbm_params.
clf_2 = lgbm.LGBMClassifier(
    n_estimators=500,
    **lgbm_params,
)

# LightGBM with only the learning rate and estimator count set explicitly.
clf_3 = lgbm.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
)

In [14]:
# Fit all three classifiers on the same training split.
clf_1.fit(train_X, train_Y)
clf_2.fit(train_X, train_Y)
# Last expression of the cell: its return value is what the notebook displays.
clf_3.fit(train_X, train_Y)



In [16]:
def _predict_and_report(tag, model):
    """Predict on the validation split, print ``tag`` and the recall, return the predictions."""
    predicted = model.predict(valid_X)
    print(tag)
    recall(predicted, valid_Y)
    return predicted


# One report per model, in the same order as the classifiers were built.
pred_y_1 = _predict_and_report('clf_1', clf_1)

pred_y_2 = _predict_and_report('clf_2', clf_2)

pred_y_3 = _predict_and_report('clf_3', clf_3)

clf_1
Recall: 0.9727160254832542
clf_2
Recall: 0.9747358382045058
clf_3
Recall: 0.9696119192649141


In [19]:
# Number of validation rows.
n_sam = valid_Y.shape[0]

# One column of predicted labels per classifier, stored as floats
# (shape: n_sam x 3), ready for a row-wise majority vote.
arr = np.column_stack(
    [model.predict(valid_X) for model in (clf_1, clf_2, clf_3)]
).astype(float)

In [25]:
# Row-wise majority vote over the three classifiers' predictions.
# Depending on the SciPy version, stats.mode returns the modal values with
# shape (n, 1) or (n,) (the keepdims default changed), so flatten the result
# to always hand 1-D labels to the scoring code. When all three models
# disagree, SciPy still returns a single value for that row.
mode = np.ravel(stats.mode(arr, axis=1)[0])

In [26]:
# Report the recall of the majority-vote ensemble on the validation split.
# NOTE(review): the label 'mde' looks like a typo for 'mode'; it is left as is
# because it is runtime output.
print('mde')
recall(mode, valid_Y)

mde
Recall: 0.9747358382045058
