In [65]:
# PACKAGE
import os, sys

# DATA
# Path prefix of the input table, without extension: '.csv' is appended when the
# file is read, and the output file names are built from this same prefix.
datafile='./data/NL_Veluwe_2017_S1_bsc' # Sentinel-1 backscattering
# Alternative input: uncomment to run the notebook on the coherence table instead.
#datafile='./data/NL_Veluwe_2017_S1_coh' # Sentinel-1 coherence


In [66]:
# input parameters
# common prefix of the train / test / prediction / class files used below
rootname = datafile+'_cropselect'
# number of label values (0..4) produced by the crop aggregation step;
# also the width of the network's softmax output layer
nclass = 5
# number of training epochs passed to model.fit
nepoch = 80
# number of sample / train / predict repetitions.
# NOTE: the final combination step reads the class files of runs 0 and 1 by
# name, so it expects nrun == 2.
nrun = 2

# 1 / Aggregate the crops by category
Remove unused columns, aggregate the classes, set a numerical label, and save the result to a file ending in '_cropselect.csv'.


In [67]:
import sys
import pandas as pd 

# load the data
df = pd.read_csv(datafile + '.csv')

# remove small parcels (area of 1000 or less) and empty lines (rows whose
# fourth column is not strictly positive).
# .copy() makes the filtered result an independent frame, so the column
# assignments below never act on a slice of the loaded table
# (avoids SettingWithCopyWarning).
df = df[df.area > 1000]
df = df[df.iloc[:, 3] > 0].copy()

# remove unused columns
df = df.drop(columns=['.geo', 'area', 'gws_gewas', 'bufferedarea', 'id', 'perimeter'])

# crop codes as integers so they can be matched against the code lists below
df['gws_gewasc'] = df['gws_gewasc'].astype(int)

# aggregate the crops by classes: one boolean mask per class.
# The masks are computed before the column is overwritten with labels.
gra = df['gws_gewasc'].isin([265, 266, 331, 336, 383, 332])
mai = df['gws_gewasc'].isin([259, 316, 317])
cer = df['gws_gewasc'].isin([234, 236, 235, 237])
pot = df['gws_gewasc'].isin([2016, 2015, 2017, 2014])

# convert the class to an integer label starting from 0;
# every crop code not listed above ends up with label 4
df['gws_gewasc'] = 4
df.loc[gra, 'gws_gewasc'] = 0
df.loc[mai, 'gws_gewasc'] = 1
df.loc[cer, 'gws_gewasc'] = 2
df.loc[pot, 'gws_gewasc'] = 3

# keep the labelled parcels (labels 0..4) as an independent frame so that
# insert/drop below operate on real data rather than on a slice of df
subset = df.loc[df['gws_gewasc'].isin([0, 1, 2, 3, 4])].copy()

# expose the label as column 'label' at position 1 and remove 'gws_gewasc'
subset.insert(1, 'label', subset['gws_gewasc'])
subset = subset.drop(columns=['gws_gewasc'])

# save the output as a .csv file. The index is written as the first CSV
# column; later cells address columns by position, so keep this layout.
subset.to_csv(datafile + '_cropselect.csv')

# 2 / Select the training and test data
Select the training data by sampling from the different classes, and save one file for training and one file for testing as outputs.

In [68]:
import sys
import pandas as pd
import numpy as np 

# Upper bound on the number of parcels drawn per class for each training set
# (in the sampling loop GRA is label 0, MAI is label 1, CER is label 2).
# Labels 3 and 4 have no sample size here and are not sampled for training.
samplesizeGRA=300
samplesizeMAI=300
samplesizeCER=150

# Reload the labelled table written by the aggregation step. It is read without
# index_col, so the index column saved with the file comes back as an ordinary
# data column.
df = pd.read_csv(datafile + '_cropselect.csv', low_memory=False)

In [69]:
def _draw_training_rows(label, samplesize):
    """Return up to `samplesize` randomly chosen rows of `df` carrying `label`.

    The rows of the class are shuffled with np.random.permutation and the
    first `samplesize` are kept; a class with fewer rows is returned whole.
    """
    members = df.loc[df['label'] == label]
    return members.take(np.random.permutation(len(members))[:samplesize])


# One training/test split per run. No random seed is set in this cell, so
# every execution draws different samples.
for i_nrun in range(nrun):
    trainingGRA = _draw_training_rows(0, samplesizeGRA)
    trainingMAI = _draw_training_rows(1, samplesizeMAI)
    trainingCER = _draw_training_rows(2, samplesizeCER)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    training = pd.concat([trainingGRA, trainingMAI, trainingCER])
    # everything that was not drawn for training is used for testing
    testing = df.drop(training.index)
    # save outputs. The row count is part of the file name, and the training
    # cell accepts exactly one train and one test file per run: remove stale
    # files from earlier executions if the counts change.
    training.to_csv('{}_cropselect_train_{}_nrun{}.csv'.format(datafile, len(training), i_nrun))
    testing.to_csv('{}_cropselect_test_{}_nrun{}.csv'.format(datafile, len(testing), i_nrun))

# 3 / Learning: building the neural network model, training the model, and applying it to the parcels


Loading the training data

In [70]:
import numpy as np
import tensorflow as tf
import tflearn
import sys
import glob

# Load CSV file, indicate that the first column represents labels
from tflearn.data_utils import load_csv

# tflearn.init_graph(gpu_memory_fraction=0.0)

In [78]:
def preprocess(profiles, columns_to_delete):
    """Remove the given column positions from every row and return float32 data.

    profiles          -- list of rows (each a list); modified in place.
    columns_to_delete -- iterable of column positions to remove from each row.

    Returns a numpy float32 array built from the remaining values.
    """
    # delete from the highest position down so the remaining positions stay valid
    for column_to_delete in sorted(columns_to_delete, reverse=True):
        for profile in profiles:
            profile.pop(column_to_delete)
    return np.array(profiles, dtype=np.float32)


def _single_csv(pattern, kind):
    """Return the one file matching `pattern`.

    Prints a FATAL message and exits with status 1 when no file or more than
    one file matches. `kind` ('training' or 'test') only names the set in the
    message.
    """
    matches = glob.glob(pattern)
    if len(matches) > 1:
        print("FATAL: Only single {} set allowed for {}, found {}"
              .format(kind, rootname, len(matches)))
        sys.exit(1)
    elif len(matches) == 0:
        print("FATAL: No {} set found for {}".format(kind, rootname))
        sys.exit(1)
    return matches[0]


for i_nrun in range(nrun):
    tf.reset_default_graph()  # reset before starting

    # locate the training and the test file of this run
    fname = _single_csv(rootname + '_train_*_nrun{}.csv'.format(i_nrun), 'training')
    gname = _single_csv(rootname + '_test_*_nrun{}.csv'.format(i_nrun), 'test')

    # column 3 holds the label; load_csv returns it separately as categorical labels
    data, labels = load_csv(fname, target_column=3,
                            categorical_labels=True, n_classes=nclass)

    test_data, test_labels = load_csv(gname, target_column=3,
                                      categorical_labels=True, n_classes=nclass)

    # The first three remaining columns are not used as features
    # (presumably saved index / identifier columns -- verify against the CSV header)
    to_ignore = [0, 1, 2]

    # Preprocess data
    data = preprocess(data, to_ignore)

    # Build neural network: two fully connected layers of 32 units and a softmax output
    net = tflearn.input_data(shape=[None, len(data[0])])
    net = tflearn.fully_connected(net, 32)
    net = tflearn.fully_connected(net, 32)
    net = tflearn.fully_connected(net, nclass, activation='softmax')
    net = tflearn.regression(net)

    # Define model
    model = tflearn.DNN(net)
    # Start training (apply gradient descent algorithm)
    model.fit(data, labels, n_epoch=nepoch, batch_size=32, show_metric=True)

    # Write one row per test sample: id, reference class, probability (in %) per class.
    # The with-block guarantees the file is flushed and closed before later cells read it.
    with open('{}_{}_predictions.csv'.format(rootname, i_nrun), 'w') as fw:
        header = "id,klass" + "".join(",prob{}".format(k) for k in range(nclass))
        fw.write(header + '\n')

        # Check predictions for the samples not used in training
        for i in range(len(test_data)):
            # same layout as the training rows: skip the three leading columns
            sample = test_data[i][3:]
            # position of the 1 in the one-hot label row = reference class
            slabel = test_labels[i].tolist().index(1)
            pred = model.predict([sample])
            row = "{},{}".format(test_data[i][2], str(slabel))
            row += "".join(",{:6.2f}".format(100*pred[0][k]) for k in range(nclass))
            fw.write(row + '\n')

    tf.reset_default_graph()


Training Step: 1519  | total loss: [1m[32m0.37942[0m[0m | time: 0.043s
| Adam | epoch: 080 | loss: 0.37942 - acc: 0.8680 -- iter: 576/600
Training Step: 1520  | total loss: [1m[32m0.37938[0m[0m | time: 0.045s
| Adam | epoch: 080 | loss: 0.37938 - acc: 0.8729 -- iter: 600/600
--


# 4 / Select the class with the highest probability for each parcel and each run

In [83]:
for i_nrun in range(nrun):
    # load the prediction probabilities of this run; the first CSV column ('id')
    # becomes the index, leaving 'klass' followed by one probability column per class
    df = pd.read_csv(rootname + '_{}_predictions.csv'.format(i_nrun),
                     index_col=0, low_memory=False)
    # probability columns: everything after 'klass'
    r_index = df.columns[1:]
    probabilities = df[r_index].to_numpy()
    # Select the class with the maximum probability as the prediction
    df['pred'] = probabilities.argmax(axis=1)
    # Get the maximum probability (values are percentages)
    df['pred_max'] = probabilities.max(axis=1)

    # When the maximum probability is 70 % or less, the prediction is not
    # trusted and the reference label 'klass' is kept instead.
    df_ok = df[df['pred_max'] > 70]
    # .copy() so the assignment below writes to an independent frame rather
    # than to a slice of df (this was the source of SettingWithCopyWarning)
    df_nok = df[df['pred_max'] <= 70].copy()
    df_nok['pred'] = df_nok['klass']
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df = pd.concat([df_ok, df_nok])

    # keep only 'klass' and 'pred'
    df = df.drop(columns=list(r_index) + ['pred_max'])

    # save output
    df.to_csv(rootname + '_{}_class.csv'.format(i_nrun))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# 5 / Combine the different runs and retrieve the majority class

In [73]:
import pandas as pd
import numpy as np
import sys

from collections import Counter

In [74]:
# Per-run class tables for runs 0 and 1, indexed by the first CSV column.
df0, df1 = (
    pd.read_csv('{}_{}_class.csv'.format(rootname, run_id), index_col=0, low_memory=False)
    for run_id in (0, 1)
)

In [75]:
# Create the outer join of both runs; the columns of the second run get the
# suffix '_1', so its reference label arrives as 'klass_1'
df = df0.join(df1, how="outer", rsuffix='_1')
# Records that were not in df0 have the 'klass' label missing (NA), so fill
# those gaps from 'klass_1'. Assigning the whole column back replaces the
# chained `df['klass'].loc[...] = ...`, which is not guaranteed to write
# into df and does nothing under pandas copy-on-write.
df['klass'] = df['klass'].fillna(df['klass_1'])
# and drop the now redundant 'klass_1' label
df = df.drop(columns='klass_1')

In [76]:
# A parcel absent from one run has no prediction for that run after the outer
# join; mark those gaps with -1.
df = df.fillna(-1)
# Per-run prediction columns: everything after the first column ('klass').
r_index = df.columns[1:]

In [77]:
def _top_vote(row):
    """Return (value, count) for the most frequent entry among the r_index columns of `row`.

    The -1 placeholder for a missing prediction is counted like any other value.
    """
    return Counter(row[r_index]).most_common(1)[0]


# majority class per parcel, and how many runs voted for it
df['majclass'] = df.apply(lambda row: _top_vote(row)[0], axis=1)
df['majcount'] = df.apply(lambda row: _top_vote(row)[1], axis=1)

# store everything as integers
df.astype(int).to_csv('{}_classes.csv'.format(rootname))