# Farmland & Crops Keras + GBM


## Import Package


In [3]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

In [22]:
import lightgbm as lgb

In [15]:
import tensorflow as tf
# Enable memory growth on every GPU TensorFlow can see, before any model is built.
gpus = tf.config.experimental.list_physical_devices("GPU")
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
from utils.metrics import evaluate
from utils.image import load_image_and_resize
from utils.datetime import get_taken_time, get_taken_month, transform_day_of_year

## Const & Inputs

Here are all of the parameters to change for the run.


In [3]:
# Top-level data directory. The directory layout is assumed to follow the
# ImageFolder structure.
path = "."

# Names of the immediate sub-directories of `path`.
# The fallback tuple mirrors os.walk's (dirpath, dirnames, filenames) shape, so a
# missing or unreadable `path` yields an empty list rather than None.
labels = next(os.walk(path), (None, [], []))[1]

# Number of classes in the dataset
num_classes = 14

# Image size for model (images are resized to image_size x image_size)
image_size = 224

## Data Preprocessing

Includes: loading data, encoding categorical features, splitting data, normalizing data, and saving data.


In [9]:
# Label encoder to use; choose from [OneHotEncoder, LabelEncoder].
# The encoding cell compares against these exact strings.
encoder = 'LabelEncoder'

### Load Data

We can choose to process the data from scratch, or read data that has already been processed.


In [10]:
# Read the label table and derive capture-time features from each image path.
subset = pd.read_csv(f'{path}/data/label.csv')

# Capture time of every image, as returned by the project helper.
subset['taken_datetime'] = subset['path'].apply(get_taken_time)

# Ordinal day within the year (1-based), taken from the capture time.
subset['day_of_year'] = subset['taken_datetime'].apply(
    lambda taken: taken.timetuple().tm_yday
)

# Project-specific transform of the day-of-year value.
subset['transform_day_of_year'] = subset['day_of_year'].apply(transform_day_of_year)

# Cache the enriched table so later runs can start from it.
subset.to_csv(f'{path}/data/describe_train_subset.csv', index=False)

In [11]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Start from the cached metadata table written by the previous cell.
simple = pd.read_csv(f'{path}/data/describe_train_subset.csv')


def _load_resized(image_path):
    """Load one image through the project helper, resized to `image_size`."""
    return load_image_and_resize(image_path, new_size=image_size, resize_method="resize")


# One image array per row, with a progress bar.
simple["vector"] = simple['path'].progress_apply(_load_resized)

### Encode categorical features

OneHotEncoder or LabelEncoder.


In [12]:
# Encode the target into a numeric `class` column using the configured encoder.
# NOTE(review): the encoder is fitted on `label` but applied to `true_label`;
# this presumably relies on both columns sharing one vocabulary - confirm.
if encoder == 'OneHotEncoder':
    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(simple['label'].values.reshape(-1, 1))
    one_hot = enc.transform(simple['true_label'].values.reshape(-1, 1)).toarray()
    # A 2-D array cannot be assigned to a single DataFrame column, so store one
    # vector per row; `np.array(column.tolist())` later rebuilds the 2-D matrix.
    simple['class'] = one_hot.tolist()
elif encoder == 'LabelEncoder':
    from sklearn.preprocessing import LabelEncoder
    enc = LabelEncoder()
    enc.fit(simple['label'].values)
    simple['class'] = enc.transform(simple['true_label'].values)
else:
    # Fail here instead of later with a missing `class` column / undefined `enc`.
    raise ValueError(
        f"Unsupported encoder {encoder!r}; expected 'OneHotEncoder' or 'LabelEncoder'"
    )

### Split Data

Split data into train, validation and test sets.


In [13]:
def _select_split(frame, set_name):
    """Return (rows, image array, class array, day-of-year column) for one split."""
    rows = frame[frame['set_name'].eq(set_name)]
    vectors = np.array(rows['vector'].tolist())
    classes = np.array(rows['class'].tolist())
    # Shape the extra feature as a single column so it can be joined to 2-D features.
    extra = np.array(rows['transform_day_of_year'].tolist()).reshape(-1, 1)
    return rows, vectors, classes, extra


train, x_train_vector, y_train, x_train_additional = _select_split(simple, 'train')
valid, x_valid_vector, y_valid, x_valid_additional = _select_split(simple, 'valid')
test, x_test_vector, y_test, x_test_additional = _select_split(simple, 'test')

# Report the shape of each split.
print(f'x_train {x_train_vector.shape}, y_train {y_train.shape}')
print(f'x_valid {x_valid_vector.shape}, y_valid {y_valid.shape}')
print(f'x_test {x_test_vector.shape}, y_test {y_test.shape}')

x_train (14049, 224, 224, 3), y_train (14049,)
x_valid (3003, 224, 224, 3), y_valid (3003,)
x_test (3013, 224, 224, 3), y_test (3013,)


## Modelling and Training

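Use a pretrained EfficientNetB0 as a feature extractor: either its `avg_pool` output (when `include_top` is True) or a custom dense layer added on top of the headless network (when `include_top` is False).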


In [14]:
# Selects which branch of the model-construction cell runs.
# True: load the full EfficientNetB0 and use its 'avg_pool' layer output.
# False: load the network without its top and attach a Flatten + Dense output instead.
include_top = True

# Per-sample input shape, taken from the training images (batch dimension dropped).
input_shape = x_train_vector.shape[1:]
print('input_shape:', input_shape)

input_shape: (224, 224, 3)


### Initialize and Reshape the Networks


In [None]:
# Build the feature extractor on top of an ImageNet-pretrained EfficientNetB0.
base_model = tf.keras.applications.EfficientNetB0(
    weights='imagenet',
    input_shape=input_shape,
    include_top=bool(include_top),
)

if include_top:
    # Full network loaded: take the output of the 'avg_pool' layer as features.
    output = base_model.get_layer('avg_pool').output
else:
    # Headless network: flatten the last layer and project it through a ReLU
    # dense layer with input_shape[0] units.
    # NOTE(review): this 'fc' layer is newly created and never trained in the
    # cells shown - confirm that is intended.
    flattened = tf.keras.layers.Flatten(name='flatten')(base_model.layers[-1].output)
    output = tf.keras.layers.Dense(input_shape[0], activation='relu', name='fc')(flattened)

model = tf.keras.models.Model(inputs=base_model.input, outputs=output)
# model.summary()

In [None]:
# Compile the feature extractor.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
# NOTE(review): in the cells shown the model is only used via `model.predict`,
# so the loss/optimizer chosen here do not influence the extracted features.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

### Predict training data

Use the pretrained EfficientNet model to extract feature vectors for the training, validation, and test images.


In [20]:
# Run each split through the feature extractor, in train / valid / test order.
x_train_vvv, x_valid_vvv, x_test_vvv = (
    model.predict(images)
    for images in (x_train_vector, x_valid_vector, x_test_vector)
)
print(x_train_vvv.shape)

(14049, 1280)


## Classification Prediction

Based on the output of feature extractor.


In [5]:
# Classifier name; used to build the saved-hyperparameter file name and passed to initialize_model.
model_name = 'LightGBM'

### Inputs

Append the additional feature (the transformed day of year) to the output of the feature extractor to create the classifier inputs.


In [24]:
def _append_columns(features, extra):
    """Join the extra column(s) to the right of the extracted feature matrix."""
    return np.concatenate((features, extra), axis=1)


x_train = _append_columns(x_train_vvv, x_train_additional)
x_valid = _append_columns(x_valid_vvv, x_valid_additional)
x_test = _append_columns(x_test_vvv, x_test_additional)
x_train.shape

(14049, 1281)

### Modelling


In [32]:
def initialize_model(model_name, params, num_classes):
    """Build a LightGBM classifier from a dict of keyword arguments.

    Args:
        model_name: Currently unused; a LightGBM classifier is always returned.
        params: Keyword arguments forwarded unchanged to ``lgb.LGBMClassifier``.
        num_classes: Currently unused; the class count is whatever ``params``
            specifies (if anything).

    Returns:
        An unfitted ``lgb.LGBMClassifier``.
    """
    classifier = lgb.LGBMClassifier(**params)
    return classifier

In [6]:
# Load the best hyperparameters from a pickled file under {path}/models.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
# Alternative without a saved search result:
# best_params = {'objective':'multiclass', 'num_class':14, 'device':'gpu'}
best_params = pd.read_pickle(f'{path}/models/{model_name}-hyperopt180.p')
# Bare expression: displays the loaded parameters as the cell output.
best_params

{'class_weight': None,
 'colsample_bytree': 0.5,
 'device': 'gpu',
 'learning_rate': 0.026622694688566075,
 'max_depth': 7,
 'min_child_samples': 30,
 'min_child_weight': 3.0,
 'min_split_gain': 8.221232425878033e-05,
 'n_estimators': 4900,
 'num_class': 14,
 'num_leaves': 29,
 'objective': 'multiclass',
 'reg_alpha': 0.00013231176673023093,
 'reg_lambda': 0.04977589545584271,
 'subsample': 0.6000000000000001}

In [33]:
# Create the classifier from the loaded hyperparameters.
# NOTE: initialize_model always builds a LightGBM classifier, whatever model_name says.
clf = initialize_model(model_name, best_params, num_classes)

# Train the model on the training set only (the validation split is not passed to fit)
clf.fit(x_train, y_train)

LGBMClassifier(device='gpu', num_class=14, objective='multiclass')

In [35]:
# Predict the encoded class for each row of the test set
y_pred = clf.predict(x_test)

# Map the encoded predictions and ground truth back to their original label values
y_pred_inverse = enc.inverse_transform(y_pred)
y_test_inverse = enc.inverse_transform(y_test)

# Evaluate the model with the project helper (ground truth first, predictions second)
evaluation = evaluate(y_test_inverse.tolist(), y_pred_inverse.tolist())
# Bare expression: displays the transposed evaluation table as the cell output.
pd.DataFrame(evaluation).T

Unnamed: 0,precision,recall,f1-score,support
banana,0.979021,0.921053,0.949153,152.0
bareland,0.890173,0.968553,0.927711,477.0
carrot,0.859649,0.662162,0.748092,74.0
corn,0.832447,0.862259,0.847091,363.0
dragonfruit,0.753623,0.722222,0.737589,72.0
garlic,0.81672,0.881944,0.84808,288.0
guava,0.829016,0.888889,0.857909,180.0
peanut,0.82679,0.910941,0.866828,393.0
pineapple,0.919732,0.919732,0.919732,299.0
pumpkin,0.707317,0.467742,0.563107,62.0


### Save Results


In [36]:
# Local import: `datetime` is not imported by any other cell shown, so the
# `datetime.now()` call below would otherwise raise NameError.
from datetime import datetime

# Timestamp taken when this cell runs; used to make the results file name unique
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Build the dataframe of results from the test split (index reset so the
# prediction list below lines up positionally)
result = test[['path', 'true_label', 'taken_datetime']].reset_index(drop=True)

# Only keep filename from path
# result['path'] = result['path'].apply(lambda x: '/'.join(x.split('\\')[5:]))

# Add the predicted labels to dataframe
result['pred_label'] = y_pred_inverse.tolist()

# Save the results to a csv file
# result.to_csv(f'./results/efficientnetb0-{model_name}-{x_train.shape[1]}-hyperopt180-from-vgg19-{timestamp}.csv', index=None)