In [101]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tqdm import tqdm
import os, sys
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
%matplotlib inline

# Upload training data file

In [102]:
df = pd.read_csv("traininglabels.csv")

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15244 entries, 0 to 15243
Data columns (total 3 columns):
image_id       15244 non-null object
has_oilpalm    15244 non-null int64
score          15244 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 357.4+ KB


In [104]:
df[df.has_oilpalm == 1].count() # only 6% of the images have palm oil plantations

image_id       942
has_oilpalm    942
score          942
dtype: int64

In [105]:
df.score.sort_values() 
# some images have very small scores. Perhaps use a threshold like 0.5 and give 0.5 or less, label 0

1062     0.3887
72       0.3944
353      0.3954
1404     0.3963
270      0.3965
4163     0.3970
3032     0.3973
5330     0.3993
8508     0.3994
1942     0.3994
9956     0.3995
10327    0.3996
1969     0.4003
14359    0.4006
1532     0.4006
7699     0.4010
8853     0.4012
715      0.4018
13936    0.4020
9844     0.4026
14229    0.4032
5541     0.4035
14625    0.4037
5937     0.4042
13531    0.4042
3234     0.4043
14813    0.4045
5242     0.4045
9099     0.4045
4756     0.4047
          ...  
5496     1.0000
5495     1.0000
5494     1.0000
5493     1.0000
5466     1.0000
5467     1.0000
5468     1.0000
5469     1.0000
5470     1.0000
5471     1.0000
5472     1.0000
5473     1.0000
5474     1.0000
5477     1.0000
5478     1.0000
5464     1.0000
5479     1.0000
5481     1.0000
5482     1.0000
5483     1.0000
5484     1.0000
5485     1.0000
5486     1.0000
5487     1.0000
5488     1.0000
5489     1.0000
5490     1.0000
5491     1.0000
5480     1.0000
15243    1.0000
Name: score, Length: 152

In [106]:
df = df.sort_values(by='image_id').reset_index(drop=True)

In [107]:
df_hasPalm = df[df.has_oilpalm == 1]
df_noPalm = df[(df.has_oilpalm == 0) & (df.score == 1)]

In [108]:
df_noPalm_samp = df_noPalm.sample(1000)
df_noPalm_samp.shape

(1000, 3)

In [109]:
df_final = df_hasPalm.append(df_noPalm_samp).sort_values(by='image_id').reset_index(drop=True)

In [110]:
df_final

Unnamed: 0,image_id,has_oilpalm,score
0,img_000122018.jpg,0,1.0000
1,img_000192018.jpg,0,1.0000
2,img_000282017.jpg,0,1.0000
3,img_000282018.jpg,0,1.0000
4,img_000372018.jpg,1,1.0000
5,img_000582017.jpg,0,1.0000
6,img_000592018.jpg,0,1.0000
7,img_000732017.jpg,0,1.0000
8,img_000822017.jpg,0,1.0000
9,img_000862017.jpg,0,1.0000


In [111]:
# Reading images from a folder and converting to numpy array
train_image = []
for i in tqdm(range(len(df_final))):
    img = image.load_img('/Users/HS/Downloads/widsdatathon2019/train_images/'+df_final.image_id[i], target_size=(150,150,1))
    img = image.img_to_array(img)
    img = img/255
    train_image.append(img)
X = np.array(train_image)
X

100%|██████████| 1942/1942 [00:05<00:00, 378.36it/s]


array([[[[ 0.09019608,  0.15686275,  0.08235294],
         [ 0.06666667,  0.13725491,  0.05098039],
         [ 0.10588235,  0.17647059,  0.09019608],
         ..., 
         [ 0.10588235,  0.18039216,  0.07058824],
         [ 0.09019608,  0.14901961,  0.05882353],
         [ 0.10588235,  0.15294118,  0.07450981]],

        [[ 0.09019608,  0.16470589,  0.07843138],
         [ 0.05882353,  0.13333334,  0.04705882],
         [ 0.09803922,  0.17647059,  0.07843138],
         ..., 
         [ 0.12156863,  0.19607843,  0.08627451],
         [ 0.15294118,  0.20784314,  0.10980392],
         [ 0.1254902 ,  0.17254902,  0.08627451]],

        [[ 0.1254902 ,  0.20392157,  0.10588235],
         [ 0.10196079,  0.18039216,  0.08235294],
         [ 0.10196079,  0.18039216,  0.07450981],
         ..., 
         [ 0.11764706,  0.1882353 ,  0.08627451],
         [ 0.17254902,  0.22745098,  0.12941177],
         [ 0.14117648,  0.1882353 ,  0.09411765]],

        ..., 
        [[ 0.07450981,  0.15686275,

In [112]:
y = df_final['has_oilpalm'].values
y = to_categorical(y)
y

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.]], dtype=float32)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [115]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),activation='relu',input_shape=(150,150,3)))
model.add(Conv2D(32, kernel_size=(3, 3),activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='sigmoid'))

In [116]:
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [117]:
# Build model with training data and get loss and accuracy scores
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Train on 1553 samples, validate on 389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a49c44be0>

# Leaderboard Test Data Prediction

In [118]:
testpath = "/Users/HS/Downloads/widsdatathon2019/leaderboard_test_data"
testimagesList = sorted(os.listdir(testpath))
testimagesList

['img_000022018.jpg',
 'img_000042017.jpg',
 'img_000052017.jpg',
 'img_000062017.jpg',
 'img_000062018.jpg',
 'img_000122017.jpg',
 'img_000142018.jpg',
 'img_000162018.jpg',
 'img_000182017.jpg',
 'img_000262017.jpg',
 'img_000292017.jpg',
 'img_000302018.jpg',
 'img_000342018.jpg',
 'img_000382017.jpg',
 'img_000382018.jpg',
 'img_000422017.jpg',
 'img_000422018.jpg',
 'img_000432017.jpg',
 'img_000472018.jpg',
 'img_000532017.jpg',
 'img_000542018.jpg',
 'img_000552017.jpg',
 'img_000592017.jpg',
 'img_000602018.jpg',
 'img_000622017.jpg',
 'img_000622018.jpg',
 'img_000632018.jpg',
 'img_000672018.jpg',
 'img_000682018.jpg',
 'img_000692017.jpg',
 'img_000702017.jpg',
 'img_000752017.jpg',
 'img_000762017.jpg',
 'img_000812018.jpg',
 'img_000822018.jpg',
 'img_000832018.jpg',
 'img_000842018.jpg',
 'img_000852018.jpg',
 'img_000872018.jpg',
 'img_000882017.jpg',
 'img_000882018.jpg',
 'img_000902018.jpg',
 'img_000912017.jpg',
 'img_000912018.jpg',
 'img_000922018.jpg',
 'img_0009

In [119]:
leaderboard_test_pred = pd.DataFrame(testimagesList , columns=['image_id'])
leaderboard_test_pred

Unnamed: 0,image_id
0,img_000022018.jpg
1,img_000042017.jpg
2,img_000052017.jpg
3,img_000062017.jpg
4,img_000062018.jpg
5,img_000122017.jpg
6,img_000142018.jpg
7,img_000162018.jpg
8,img_000182017.jpg
9,img_000262017.jpg


In [120]:
test_image = []
for i in tqdm(range(len(testimagesList))):
    img = image.load_img('/Users/HS/Downloads/widsdatathon2019/leaderboard_test_data/'+testimagesList[i], target_size=(150,150,1))
    img = image.img_to_array(img)
    img = img/255
    test_image.append(img)
test = np.array(test_image)
test

100%|██████████| 4356/4356 [00:11<00:00, 392.11it/s]


array([[[[ 0.43529412,  0.35294119,  0.23137255],
         [ 0.52941179,  0.44705883,  0.33333334],
         [ 0.54901963,  0.46666667,  0.36078432],
         ..., 
         [ 0.18431373,  0.1882353 ,  0.13333334],
         [ 0.18039216,  0.18431373,  0.12941177],
         [ 0.30980393,  0.3137255 ,  0.25882354]],

        [[ 0.61960787,  0.53725493,  0.42352942],
         [ 0.58823532,  0.50588238,  0.39215687],
         [ 0.56078434,  0.47843137,  0.37254903],
         ..., 
         [ 0.17647059,  0.18431373,  0.12941177],
         [ 0.18431373,  0.1882353 ,  0.13333334],
         [ 0.21960784,  0.22352941,  0.16862746]],

        [[ 0.77254903,  0.67843139,  0.56862748],
         [ 0.68235296,  0.60000002,  0.48627451],
         [ 0.56470591,  0.48235294,  0.3764706 ],
         ..., 
         [ 0.16470589,  0.17254902,  0.12156863],
         [ 0.18431373,  0.19215687,  0.13725491],
         [ 0.16862746,  0.17647059,  0.12156863]],

        ..., 
        [[ 0.66274512,  0.55686277,

In [121]:
# making predictions
prediction = model.predict_classes(test)

In [122]:
leaderboard_test_pred['has_oilpalm'] = prediction

In [123]:
leaderboard_test_pred.has_oilpalm.unique()

array([1, 0])

In [124]:
leaderboard_test_pred[leaderboard_test_pred.has_oilpalm == 1]

Unnamed: 0,image_id,has_oilpalm
0,img_000022018.jpg,1
5,img_000122017.jpg,1
9,img_000262017.jpg,1
11,img_000302018.jpg,1
15,img_000422017.jpg,1
16,img_000422018.jpg,1
23,img_000602018.jpg,1
24,img_000622017.jpg,1
25,img_000622018.jpg,1
27,img_000672018.jpg,1


# Leaderboard Holdback Data Prediction

In [125]:
holdpath = "/Users/HS/Downloads/widsdatathon2019/leaderboard_holdout_data"
holdimagesList = sorted(os.listdir(holdpath))
holdimagesList

['img_000012018.jpg',
 'img_000032017.jpg',
 'img_000132018.jpg',
 'img_000162017.jpg',
 'img_000222017.jpg',
 'img_000222018.jpg',
 'img_000252018.jpg',
 'img_000262018.jpg',
 'img_000432018.jpg',
 'img_000482017.jpg',
 'img_000482018.jpg',
 'img_000502017.jpg',
 'img_000572017.jpg',
 'img_000612018.jpg',
 'img_000642017.jpg',
 'img_000642018.jpg',
 'img_000652018.jpg',
 'img_000682017.jpg',
 'img_000722017.jpg',
 'img_000742018.jpg',
 'img_000772018.jpg',
 'img_000842017.jpg',
 'img_000922017.jpg',
 'img_001022017.jpg',
 'img_001092018.jpg',
 'img_001102018.jpg',
 'img_001162017.jpg',
 'img_001172017.jpg',
 'img_001242018.jpg',
 'img_001282017.jpg',
 'img_001282018.jpg',
 'img_001352017.jpg',
 'img_001502017.jpg',
 'img_001612017.jpg',
 'img_001622018.jpg',
 'img_001672018.jpg',
 'img_001822018.jpg',
 'img_001832017.jpg',
 'img_001882017.jpg',
 'img_001962018.jpg',
 'img_002062017.jpg',
 'img_002112018.jpg',
 'img_002182017.jpg',
 'img_002202018.jpg',
 'img_002242017.jpg',
 'img_0023

In [126]:
leaderboard_hold_pred = pd.DataFrame(holdimagesList , columns=['image_id'])
leaderboard_hold_pred

Unnamed: 0,image_id
0,img_000012018.jpg
1,img_000032017.jpg
2,img_000132018.jpg
3,img_000162017.jpg
4,img_000222017.jpg
5,img_000222018.jpg
6,img_000252018.jpg
7,img_000262018.jpg
8,img_000432018.jpg
9,img_000482017.jpg


In [127]:
hold_image = []
for i in tqdm(range(len(holdimagesList))):
    img = image.load_img('/Users/HS/Downloads/widsdatathon2019/leaderboard_holdout_data/'+holdimagesList[i], target_size=(150,150,1))
    img = image.img_to_array(img)
    img = img/255
    hold_image.append(img)
hold = np.array(hold_image)
hold

100%|██████████| 2178/2178 [00:05<00:00, 401.97it/s]


array([[[[ 0.14901961,  0.21176471,  0.09803922],
         [ 0.15294118,  0.21568628,  0.10196079],
         [ 0.11372549,  0.17647059,  0.0627451 ],
         ..., 
         [ 0.07058824,  0.14509805,  0.02745098],
         [ 0.03921569,  0.11764706,  0.00784314],
         [ 0.11764706,  0.19607843,  0.08627451]],

        [[ 0.12941177,  0.19215687,  0.07843138],
         [ 0.14117648,  0.20392157,  0.09019608],
         [ 0.12941177,  0.19607843,  0.07058824],
         ..., 
         [ 0.10588235,  0.18039216,  0.0627451 ],
         [ 0.07058824,  0.14509805,  0.03529412],
         [ 0.0627451 ,  0.14117648,  0.03137255]],

        [[ 0.13333334,  0.19215687,  0.07058824],
         [ 0.14509805,  0.20392157,  0.08235294],
         [ 0.14117648,  0.20784314,  0.08235294],
         ..., 
         [ 0.12156863,  0.19607843,  0.07843138],
         [ 0.14117648,  0.21568628,  0.09803922],
         [ 0.11764706,  0.19215687,  0.08235294]],

        ..., 
        [[ 0.14509805,  0.20392157,

In [128]:
# making predictions
prediction = model.predict_classes(hold)

In [129]:
leaderboard_hold_pred['has_oilpalm'] = prediction

In [130]:
leaderboard_hold_pred.has_oilpalm.unique()

array([1, 0])

In [131]:
leaderboard_test_pred[leaderboard_test_pred.has_oilpalm == 1]

Unnamed: 0,image_id,has_oilpalm
0,img_000022018.jpg,1
5,img_000122017.jpg,1
9,img_000262017.jpg,1
11,img_000302018.jpg,1
15,img_000422017.jpg,1
16,img_000422018.jpg,1
23,img_000602018.jpg,1
24,img_000622017.jpg,1
25,img_000622018.jpg,1
27,img_000672018.jpg,1


# Building submission file

In [132]:
leaderboard_test_pred.shape

(4356, 2)

In [133]:
leaderboard_hold_pred.shape

(2178, 2)

In [134]:
finalSubmission = leaderboard_test_pred.append(leaderboard_hold_pred)
finalSubmission.shape

(6534, 2)

In [135]:
finalSubmission.head()

Unnamed: 0,image_id,has_oilpalm
0,img_000022018.jpg,1
1,img_000042017.jpg,0
2,img_000052017.jpg,0
3,img_000062017.jpg,0
4,img_000062018.jpg,0


In [136]:
finalSubmission.to_csv("finalSubmission_underSamp.csv",index=False)