In [1]:
import numpy as np
import tensorflow as tf

In [75]:
model_name = 'field_size'

### Load the data

In [20]:
NP_DATA_PATH = '/mnt/azrael/spatialdata/projects/eo_data/iclr-2020-challange/mlhub/prepared_data/features_with_field_size.npz'

In [27]:
data = np.load(NP_DATA_PATH)
features = data['features']
field_id = data['crop_id']
crop_id = data['field_id']
field_size = data['field_size']

features.shape, field_id.shape, crop_id.shape, field_size.shape

((67557, 13, 14), (67557,), (67557,), (67557,))

### Normalizing the data

In [28]:
mean_f = features.mean(axis=(0,1))
std_f = features.std(axis=(0,1))

mean_f, std_f

(array([0.04495004, 0.05522608, 0.08520251, 0.08335783, 0.13549377,
        0.25449966, 0.29252816, 0.29083016, 0.31650684, 0.31871104,
        0.26029542, 0.17459822, 0.5274778 , 3.82845028]),
 array([ 0.02151336,  0.023685  ,  0.02388389,  0.03328747,  0.03058358,
         0.0464853 ,  0.05491788,  0.05862865,  0.05730477,  0.05223269,
         0.05640826,  0.05882478,  0.18024394, 17.91834208]))

In [29]:
features = (features - mean_f)/std_f

In [30]:
field_size = (field_size - field_size.mean()) / field_size.std()

In [31]:
np.unique(field_id, return_counts=True)

(array([   1,    2,    3, ..., 4795, 4796, 4797], dtype=int32),
 array([ 1, 11,  9, ...,  8, 50, 11]))

## Spatial sampling

In [32]:
agg_methods = {
    'min': np.mean,
    'max': np.max,
    'mean': np.mean,
    'std': np.std
}

def sample_field(features, N=10, n_samples=1, methods=['mean']):
    agg_features_stack = []
    indices = np.arange(len(features))
    for _ in range(n_samples):
        sampled_i = np.random.choice(indices, size=N, replace=True)

        sampled_features = features[sampled_i]
        
        aggreagations = [agg_methods[method](sampled_features, axis=0) for method in methods]
        agg_features = np.concatenate(aggreagations, axis=-1)
        
        agg_features_stack.append(agg_features)
    
    return np.stack(agg_features_stack)

In [33]:
from tqdm.auto import tqdm

def sample_dataset(features, labels, field_ids, field_sizes, n=10, methods=['mean']):
    fields, counts = np.unique(field_ids, return_counts=True)
    
    features_sampled = []
    labels_sampled = []
    field_id_sampled = []
    weights_sampled = []
    fs_sampled = []
    for fid, c in zip(tqdm(fields), counts):
        n_samples = max(c//n, 3)

        field_mask = field_ids == fid
        field_features = features[field_mask]
        field_label = labels[field_mask][0]
        field_size = field_sizes[field_mask][0]
        
        sampled_features = sample_field(field_features, N=n, n_samples=n_samples, methods=methods)
        sampled_labels = np.full(n_samples, field_label)
        sampled_fid = np.full(n_samples, fid)
        sampled_weights = np.full(n_samples, 1.0/n_samples)
        sampled_fs = np.full(n_samples, field_size)
        
        features_sampled.append(sampled_features)
        labels_sampled.append(sampled_labels)
        field_id_sampled.append(sampled_fid)
        weights_sampled.append(sampled_weights)
        fs_sampled.append(sampled_fs)
    
    features = np.concatenate(features_sampled, axis=0)
    labels = np.concatenate(labels_sampled, axis=0)
    field_ids = np.concatenate(field_id_sampled, axis=0)
    weights = np.concatenate(weights_sampled, axis=0)
    field_sizes = np.concatenate(fs_sampled, axis=0)
    
    return features, labels, field_ids, weights, field_sizes

In [34]:
features_s, crop_id_s, field_id_s, weights_s, field_size_s = sample_dataset(features, crop_id, field_id, field_size, methods=['mean', 'min', 'max'])

HBox(children=(IntProgress(value=0, max=4688), HTML(value='')))




### Concatenate field_size features

In [39]:
field_size_ext = np.stack([field_size_s for i in range(13)], axis=-1)[..., np.newaxis]
field_size_ext.shape

(14755, 13, 1)

In [40]:
features_s = np.concatenate([features_s, field_size_ext], axis=-1)
features_s.shape

(14755, 13, 43)

### Extract training data and split train, val

In [41]:
mask = crop_id_s != 0
features_valid = features_s[mask]
labels_valid = crop_id_s[mask] - 1
field_id_valid = field_id_s[mask]
weights_valid = weights_s[mask]

features_test = features_s[~mask]
field_id_test = field_id_s[~mask]

features_valid.shape, features_test.shape

((10363, 13, 43), (4392, 13, 43))

In [42]:
# # Compute field split and save
# val_ratio = 0.1
# random_state = np.random.RandomState(seed=42)

# fields = np.unique(field_id_valid)
# random_state.shuffle(fields)

# val_i = int(val_ratio * len(fields))
# fields_train = fields[val_i:]
# fields_val = fields[:val_i]
# fields_test = np.unique(field_id_test)

# # Save split
# np.save('/mnt/azrael/spatialdata/projects/eo_data/iclr-2020-challange/mlhub/split/train.npy', fields_train)
# np.save('/mnt/azrael/spatialdata/projects/eo_data/iclr-2020-challange/mlhub/split/val.npy', fields_val)
# np.save('/mnt/azrael/spatialdata/projects/eo_data/iclr-2020-challange/mlhub/split/test.npy', fields_test)
# fields_train.shape, fields_val.shape, fields_test.shape

In [43]:
# Load training field IDs (split)
fields_train = np.load('/mnt/azrael/spatialdata/projects/eo_data/iclr-2020-challange/mlhub/split/train.npy')

In [44]:
train_mask = np.isin(field_id_valid, fields_train)
print(f'Ratio of pixels in train: {np.mean(train_mask)}')

features_train = features_valid[train_mask]
labels_train = labels_valid[train_mask]
field_id_train = field_id_valid[train_mask]
weights_train = weights_valid[train_mask]

features_val = features_valid[~train_mask]
labels_val = labels_valid[~train_mask]
field_id_val = field_id_valid[~train_mask]
weights_val = weights_valid[~train_mask]

features_train.shape, features_val.shape

Ratio of pixels in train: 0.8999324519926662


((9326, 13, 43), (1037, 13, 43))

In [45]:
# Flattened features
features_train_flat = features_train.reshape((features_train.shape[0], -1))
features_val_flat = features_val.reshape((features_val.shape[0], -1))

### Train model

In [46]:
from eoflow.models import TempCNNModel, BiRNN, TransformerEncoder

In [47]:
# TempCNN model

model_config = {
    'learning_rate': 0.1,
    'n_classes': 7,
    'keep_prob': 0.5,
    'nb_conv_stacks': 2
}
model = TempCNNModel(model_config)

In [48]:
# BiRNN model

# model_config = {
#     'learning_rate': 0.1,
#     'n_classes': 7,
#     'rnn_layer': 'lstm',
#     'keep_prob': 0.5
# }
# model = BiRNN(model_config)

In [49]:
# Dense Model

# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(256, activation='relu'),
#     tf.keras.layers.Dropout(rate=0.5),
#     tf.keras.layers.Dense(7, activation='softmax')
# ])

In [50]:
# Class weights
# classes = np.unique(labels_valid, return_counts=True)
# class_probs = classes[1]/np.sum(classes[1])
# class_inv_probs = 1.0 / class_probs
# weights = {i:inv_prob for i, inv_prob in enumerate(class_inv_probs)}
# weights

In [51]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [63]:
# Temporal
model.fit(
    x=features_train, 
    y=labels_train, 
    validation_data=(features_val, labels_val),
    batch_size=256,
    epochs=50,
    sample_weight=weights_train
)

# Non-temporal
# model.fit(
#     x=features_train_flat, 
#     y=labels_train, 
#     validation_data=(features_val_flat, labels_val),
#     batch_size=1024,
#     epochs=10)

  ...
    to  
  ['...']


  ...
    to  
  ['...']


Train on 9326 samples, validate on 1037 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7ff9d009c668>

### Random forest

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

random_state = 7
rf = RandomForestClassifier(n_estimators=500, random_state=random_state, n_jobs=-1)

rf.fit(features_train_flat, labels_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=7, verbose=0,
                       warm_start=False)

### Predictions

In [64]:
preds_rf = rf.predict_proba(features_val_flat)
preds_dnn = model.predict(features_val)

In [65]:
preds_rf.shape, preds_dnn.shape

((1037, 7), (1037, 7))

In [66]:
def one_hot(labels, num_classes):
    """ One-hot encodes integer labels for a given number of classes. """
    
    one_hot_labels = np.zeros((labels.size, num_classes))
    one_hot_labels[np.arange(labels.size),labels] = 1
    
    return one_hot_labels

In [67]:
import pandas as pd
def field_predictions(predictions, labels, field_ids):
    """ Groups pixel-wise predictions into field predictions by averaging. """
    
    df = pd.DataFrame(predictions)
    df['CROP_ID'] = labels
    df['FIELD_ID'] = field_ids
    
    df_grouped = df.groupby('FIELD_ID').mean()
    field_preds = df_grouped.drop('CROP_ID', axis=1).values
    field_labels = df_grouped['CROP_ID'].values
    
    return field_preds, field_labels, df_grouped.index

In [68]:
# Create onehot labels for val
one_hot_labels_val = one_hot(labels_val, 7)

### Pixel log loss

In [69]:
rf_loss = log_loss(one_hot_labels_val, preds_rf)
dnn_loss = log_loss(one_hot_labels_val, preds_dnn)

print(f'Random forest: {rf_loss}')
print(f'DNN: {dnn_loss}')

Random forest: 1.2201519914092738
DNN: 1.1075955360473144


### Field log loss

In [81]:
preds_rf_field, lbl_rf_field, field_id_rf = field_predictions(preds_rf, labels_val, field_id_val)
preds_dnn_field, lbl_dnn_field, field_id_dnn = field_predictions(preds_dnn, labels_val, field_id_val)

In [82]:
lbl_rf_field_oh = one_hot(lbl_rf_field, 7)
lbl_dnn_field_oh = one_hot(lbl_dnn_field, 7)

In [83]:
rf_loss_field = log_loss(lbl_rf_field_oh, preds_rf_field)
dnn_loss_field = log_loss(lbl_dnn_field_oh, preds_dnn_field)

print(f'Random forest: {rf_loss_field}')
print(f'DNN: {dnn_loss_field}')

Random forest: 1.239690186361492
DNN: 1.127511332874192


## Prepare predictions for analysis

In [84]:
import pandas as pd

In [85]:
def prepare_predictions(preds, labels, field_ids):
    preds_l = np.argmax(preds, axis=1)

    df = pd.DataFrame(preds, columns=[f'Crop_ID_{i+1}' for i in range(7)])
    df['Field_ID'] = field_ids
    df['prediction'] = preds_l+1
    df['label'] = labels+1
    
    df = df.set_index('Field_ID')
    
    return df

In [86]:
df_dnn = prepare_predictions(preds_dnn_field, lbl_dnn_field, field_id_dnn)

Unnamed: 0_level_0,Crop_ID_1,Crop_ID_2,Crop_ID_3,Crop_ID_4,Crop_ID_5,Crop_ID_6,Crop_ID_7,prediction,label
Field_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.232000,0.530000,0.018000,0.086000,0.082000,0.036000,0.016000,2,1
27,0.633333,0.216000,0.026667,0.043333,0.016667,0.056667,0.007333,1,2
33,0.298000,0.301333,0.123333,0.141333,0.049333,0.035333,0.051333,2,2
44,0.331333,0.484667,0.020667,0.060667,0.030000,0.046667,0.026000,2,1
50,0.213333,0.491333,0.018000,0.129333,0.088667,0.040000,0.019333,2,5
...,...,...,...,...,...,...,...,...,...
4717,0.580000,0.249333,0.007333,0.052667,0.062000,0.036000,0.012667,1,1
4733,0.526600,0.044800,0.032200,0.249000,0.049800,0.084000,0.013600,1,1
4735,0.296000,0.184667,0.035333,0.184667,0.057333,0.238000,0.004000,1,4
4775,0.453333,0.273333,0.012000,0.120667,0.056667,0.069333,0.014667,1,1


In [87]:
np.mean(df_dnn['prediction'] == df_dnn['label'])

0.5945121951219512

In [88]:
df_dnn.to_csv(f'/mnt/azrael/spatialdata/projects/eo_data/iclr-2020-challange/mlhub/val_predictions/{model_name}.csv')

## Prepare results for submission

In [53]:
# Train best model on whole training dataset (without val)
# TempCNN model

model_config = {
    'learning_rate': 0.1,
    'n_classes': 7,
    'keep_prob': 0.5,
    'nb_conv_stacks': 2
}
model_final = TempCNNModel(model_config)
model_final.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [54]:
model_final.fit(
    x=features_valid, 
    y=labels_valid,
    batch_size=256,
    epochs=250,
    sample_weight=weights_valid,
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 13, 42)]          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 13, 16)            3376      
_________________________________________________________________
activation_3 (Activation)    (None, 13, 16)            0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 13, 16)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 13, 16)            1296      
_________________________________________________________________
activation_4 (Activation)    (None, 13, 16)            0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 13, 16)            0   

  ...
    to  
  ['...']


Train on 10363 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/25

Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7ff81407f6d8>

Test on val

In [55]:
preds_val = model_final.predict(features_val)

In [56]:
one_hot_labels_val = one_hot(labels_val, 7)
log_loss(one_hot_labels_val, preds_val)

0.8696593886087294

Predict on test data

In [60]:
preds_final = model_final.predict(features_test)

In [61]:
# Groups pixel-wise predictions into field predictions by averaging.
    
df_pred = pd.DataFrame(preds_final, columns=[f'Crop_ID_{i+1}' for i in range(7)])
df_pred['Field_ID'] = field_id_test
    
df_grouped = df_pred.groupby('Field_ID').mean()
df_grouped

Unnamed: 0_level_0,Crop_ID_1,Crop_ID_2,Crop_ID_3,Crop_ID_4,Crop_ID_5,Crop_ID_6,Crop_ID_7
Field_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,0.093878,0.741820,0.018600,0.058965,0.010796,0.007072,0.068869
6,0.092141,0.853336,0.000131,0.013566,0.013991,0.025687,0.001148
11,0.087642,0.674603,0.010626,0.132801,0.056422,0.003938,0.033968
13,0.243931,0.547923,0.016267,0.029965,0.114173,0.034280,0.013460
14,0.048479,0.865151,0.042108,0.007048,0.008549,0.000287,0.028379
...,...,...,...,...,...,...,...
4785,0.909692,0.070041,0.006883,0.007005,0.002077,0.002031,0.002272
4788,0.922520,0.023131,0.007554,0.028200,0.004992,0.013580,0.000022
4790,0.790623,0.102938,0.008065,0.026936,0.042148,0.028407,0.000883
4793,0.537171,0.087778,0.002759,0.163861,0.110695,0.096905,0.000830


In [62]:
df_grouped.to_csv('submissions/temp_cnn_sampling_weighted.csv')

In [63]:
model_final.save_weights('models/temp_cnn_sampling_weighted.hdf5')