In [0]:
# Mount Google Drive at /content/drive; every data and helper-module path used
# further down in this notebook lives below that mount point.
from google.colab import drive
drive.mount("/content/drive")

In [0]:
!pip install fiona
!pip install rasterio
!pip install catboost


In [0]:
import sys
sys.path.append('/content/drive/My Drive/Weak learners/programming/Artur')

import fiona
import matplotlib.pyplot as plt
import numpy as np
import rasterio
import scipy
import seaborn as sns
import torch
import torch.nn as nn
import torch.utils.data as torch_data
import xgboost as xgb

from catboost import CatBoostClassifier
from extractor_helper import extractor
from imblearn.over_sampling import RandomOverSampler
from rasterio.mask import mask
from shapely import geometry
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from tqdm import tqdm_notebook

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')


In [0]:
# Root folder of the project on the mounted Google Drive. All input files are
# addressed relative to it, so moving the data only requires changing this line.
PROJECT_DIR = "/content/drive/My Drive/Weak learners/programming/Artur"
UNLABELED_DIR = PROJECT_DIR + "/project"
EVAL_DIR = PROJECT_DIR + "/Eval"

# NOTE: the raster and shapefile handles are intentionally left open; the
# experiment cells below read from them repeatedly.

# Unlabeled data: one satellite raster and two point shapefiles
sat_image = rasterio.open(UNLABELED_DIR + "/1030010056130F00_MS_Pan_modified_KNN.tif")
points1 = fiona.open(UNLABELED_DIR + "/Points_MS_Pan_KNN.shp", "r")
points2 = fiona.open(UNLABELED_DIR + "/AB_points.shp", "r")

# Labeled data: three evaluation sites, each with its own raster and point shapefile
test2_img = rasterio.open(EVAL_DIR + "/test2/pp_2_sat_modified.tif")
test2_points = fiona.open(EVAL_DIR + "/test2/points_2_modified_Copy.shp", "r")

test3_img = rasterio.open(EVAL_DIR + "/test3/pp_3_sat_modified.tif")
test3_points = fiona.open(EVAL_DIR + "/test3/targets_Copy.shp", "r")

test4_img = rasterio.open(EVAL_DIR + "/test4/pp_4_sat_modified_spline.tif")
test4_points = fiona.open(EVAL_DIR + "/test4/modified_points_Copy.shp", "r")


In [0]:
class AutoEncoder(nn.Module):
    """Wrap an encoder module and a decoder module into one network.

    The two sub-modules remain reachable as ``self.encoder`` and
    ``self.decoder`` so that either half can be applied on its own.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, x):
        """Encode ``x`` and immediately decode it again."""
        return self.decoder(self.encoder(x))

def train(epochs, net, criterion, optimizer,
          train_loader, val_loader, scheduler=None,
          verbose=True, save_dir=None):
    """Train a reconstruction network and return its average losses.

    Each batch ``X`` serves as both the input and the target, i.e. the loss is
    ``criterion(net(X), X)``. The network is moved to the module-level
    ``device`` before training starts.

    Args:
        epochs: Number of passes over ``train_loader``; must be >= 1.
        net: The ``nn.Module`` to optimise.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        optimizer: Optimizer already bound to ``net``'s parameters.
        train_loader: Iterable yielding training batches (tensors).
        val_loader: Iterable yielding validation batches (tensors).
        scheduler: Optional learning-rate scheduler, stepped once per epoch.
        verbose: Accepted for interface compatibility; currently unused.
        save_dir: Accepted for interface compatibility; currently unused.

    Returns:
        Tuple ``(train_loss, val_loss)`` of Python floats. Each value is the
        mean over all epochs of the per-epoch mean batch loss.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be a positive integer, got {}".format(epochs))

    net.to(device)
    total_train = 0.0
    total_val = 0.0
    for epoch in range(1, epochs + 1):
        # ---- optimisation pass ----
        net.train()
        epoch_train, n_train = 0.0, 0
        for X in train_loader:
            X = X.to(device)
            loss = criterion(net(X), X)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # .item() keeps a plain float, so no autograd state outlives the batch
            epoch_train += loss.item()
            n_train += 1

        # ---- validation pass (no gradients needed) ----
        net.eval()
        epoch_val, n_val = 0.0, 0
        with torch.no_grad():
            for X in val_loader:
                X = X.to(device)
                epoch_val += criterion(net(X), X).item()
                n_val += 1

        # Average over every batch of the epoch; max(..., 1) guards empty loaders
        total_train += epoch_train / max(n_train, 1)
        total_val += epoch_val / max(n_val, 1)

        if scheduler is not None:
            scheduler.step()

    return total_train / epochs, total_val / epochs

# **Experiment without using coordinates**

In [0]:
# Experiment 1: image patches only (no coordinates).
# For each patch size s and bottleneck width b an autoencoder is trained on the
# unlabeled patches. Its encoder then compresses the labeled patches, and six
# classifiers are tuned with GridSearchCV and scored on the compressed features.

# Full hyper-parameter grids. The result arrays are always sized for the full
# grids, so a given (b, s) pair lands in the same cell even when only part of
# the grid is run below.
bottleneck_grid = range(40,130,20)   # bottleneck widths b of the autoencoder
size_grid = range(5,16,2)            # values of s passed to extractor(size=s)
n_clfs = 6                           # number of classifiers compared below
grid_shape = (len(bottleneck_grid),len(size_grid))

mse_train=np.zeros(grid_shape)
mse_test=np.zeros(grid_shape)

accuracy_train=np.zeros(grid_shape+(n_clfs,))
accuracy_test=np.zeros(grid_shape+(n_clfs,))

f1_train=np.zeros(grid_shape+(n_clfs,))
f1_test=np.zeros(grid_shape+(n_clfs,))

recall_train=np.zeros(grid_shape+(n_clfs,))
recall_test=np.zeros(grid_shape+(n_clfs,))

precision_train=np.zeros(grid_shape+(n_clfs,))
precision_test=np.zeros(grid_shape+(n_clfs,))

# Folder on Google Drive that receives every CSV written by this experiment
out_dir='/content/drive/My Drive/Weak learners/programming/Artur/Wo_coordinates/'

# Metric arrays keyed by the prefix used in the output file names
metric_arrays = {'accuracy':(accuracy_train,accuracy_test),
                 'f1':(f1_train,f1_test),
                 'recall':(recall_train,recall_test),
                 'precision':(precision_train,precision_test)}

# Parameters we optimize our classifiers on (same order as `clfs` below)
parameters = [{"n_estimators":[20,30,50,100,150,200]},
              {"n_estimators":[50,100,150,200,300,400,500], "max_depth":[3,5,6,10]},
              {},
              {"n_estimators":[50,100,150,200,300,400,500], "max_depth":[3,5,6,10,15,20]},
              {'kernel':['rbf','poly','sigmoid'],'gamma':['scale','auto'],'decision_function_shape':['ovo', 'ovr']},
              {}]
clfs_names=['adaboost','xgb','catboost','rfc','svc','lr']

# Reduced run: only s = 5 and s = 7. Iterate over size_grid for the full sweep.
for s in range(5,8,2):
  j=size_grid.index(s)   # column of s in the result arrays
  # Extract images of the needed size: (2*s)x(2*s)
  # unlabeled data used for autoencoder/NN
  sm_images1, points1_new,_ = extractor(sat_image,points1,size=s,normalize=True)
  sm_images2, points2_new,_ = extractor(sat_image,points2,size=s,normalize=True)
  unlabeled_patches = sm_images1+sm_images2
  x=len(unlabeled_patches)
  images_unlabeled = torch.tensor(np.array(unlabeled_patches),dtype=torch.float32).reshape(x,-1).to(device)
  # The autoencoder reconstructs its own input, so inputs and targets are the same tensor
  x_train, x_test, y_train, y_test = train_test_split(images_unlabeled, images_unlabeled,test_size=.20, shuffle=True,random_state=12)

  # labeled data used for classifiers
  patch2,coordinates2,labels2 = extractor(test2_img,test2_points,size=s,normalize=True,labeling=True)
  patch3,coordinates3,labels3 = extractor(test3_img,test3_points,size=s,normalize=True,labeling=True)
  patch4,coordinates4,labels4 = extractor(test4_img,test4_points,size=s,normalize=True,labeling=True)
  labeled_patches = patch2+patch3+patch4
  y=len(labeled_patches)

  images_labeled = np.array(labeled_patches).reshape(y,-1)
  labels = np.array(labels2+labels3+labels4).reshape(y,-1)
  x_label_train, x_label_test, y_label_train, y_label_test = train_test_split(images_labeled, labels,test_size=.20, shuffle=True, random_state=12, stratify=labels)

  # Upsample train data only; the test split keeps its original class balance
  ros = RandomOverSampler(random_state=12)
  x_label_train, y_label_train = ros.fit_resample(x_label_train, y_label_train)

  # Flattened patch length fed to the network.
  # NOTE(review): the factor 8 is presumably the number of raster bands - confirm against extractor.
  input_dim = 8*(2*s)**2

  # Reduced run: only b = 60 and b = 80. Iterate over bottleneck_grid for the full sweep.
  for b in range(60,90,20):
    i=bottleneck_grid.index(b)   # row of b in the result arrays

    # The size of the hidden layer between input and bottleneck is the midpoint between them
    m=int(round((input_dim+b)/2))

    # Create NN
    # input_size -> m -> b -> m -> output_size,
    # where input_size = output_size = 8*(2*s)^2
    encoder = nn.Sequential(

      # Between input and first hidden layer
      nn.Linear(input_dim,m),
      nn.BatchNorm1d(m),
      nn.ReLU(),

      # Between first hidden layer and bottleneck
      nn.Linear(m,b),
      nn.BatchNorm1d(b),
      nn.ReLU()
    )

    decoder = nn.Sequential(

      # Between bottleneck and second hidden layer
      nn.Linear(b,m),
      nn.BatchNorm1d(m),
      nn.ReLU(),

      # Between second hidden layer and output
      nn.Linear(m,input_dim),
      nn.Sigmoid(),
    )

    # Create optimization parameters of NN
    NN_net = AutoEncoder(encoder, decoder)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(NN_net.parameters())
    train_loader = torch_data.DataLoader(x_train,batch_size=300, shuffle=True)
    val_loader = torch_data.DataLoader(x_test,batch_size=300,shuffle=True)
    scheduler=None

    # Use 100 epochs
    lsstr, lssts = train(100, NN_net, criterion, optimizer, train_loader, val_loader, scheduler)

    # Compute the reconstructed pictures of our NN.
    # no_grad: this is pure inference on the whole data set, so building an
    # autograd graph would only waste memory.
    NN_net.eval()
    with torch.no_grad():
      y_train_pred = NN_net(x_train).cpu().numpy()
      y_test_pred = NN_net(x_test).cpu().numpy()

    # Compute the MSE between original images and the reconstructed output of the autoencoder
    mse_train[i,j]=mean_squared_error(y_train.cpu().detach().numpy(),y_train_pred)
    mse_test[i,j]=mean_squared_error(y_test.cpu().detach().numpy(),y_test_pred)

    # Save the MSE values for different bottleneck and image sizes
    # (written after every configuration so partial results survive an interrupted run)
    np.savetxt(out_dir+'mse_train.csv', mse_train, delimiter=',')
    np.savetxt(out_dir+'mse_test.csv', mse_test, delimiter=',')

    # Compress train and test data by our NN
    with torch.no_grad():
      x_label_train_encode = NN_net.encoder(torch.tensor(x_label_train,dtype=torch.float32).to(device)).cpu().numpy()
      x_label_test_encode = NN_net.encoder(torch.tensor(x_label_test,dtype=torch.float32).to(device)).cpu().numpy()

    # Initialize the classifiers we want to test
    clf1 = AdaBoostClassifier(random_state=12)
    clf2 = xgb.XGBClassifier(n_jobs=-1, random_state=12)
    clf3 = CatBoostClassifier(verbose=False,random_state=12)
    clf4 = RandomForestClassifier(n_jobs=-1, random_state=12)
    clf5 = SVC(random_state=12)
    clf6 = LogisticRegression(n_jobs=-1,random_state=12)
    clfs=[clf1,clf2,clf3,clf4,clf5,clf6]

    # Compute different measures on train and test set for every classifier.
    # k (not m) indexes the classifier so the hidden width m is not shadowed.
    for k in range(n_clfs):
      clf_temp = GridSearchCV(clfs[k], param_grid=parameters[k], cv=5, n_jobs=-1, scoring='f1_macro',).fit(x_label_train_encode,y_label_train)
      pred_train = clf_temp.predict(x_label_train_encode)
      pred_test = clf_temp.predict(x_label_test_encode)
      accuracy_train[i,j,k]=accuracy_score(y_label_train,pred_train)
      accuracy_test[i,j,k]=accuracy_score(y_label_test,pred_test)
      f1_train[i,j,k]=f1_score(y_label_train,pred_train,average='macro')
      f1_test[i,j,k]=f1_score(y_label_test,pred_test,average='macro')
      recall_train[i,j,k]=recall_score(y_label_train,pred_train,average='macro')
      recall_test[i,j,k]=recall_score(y_label_test,pred_test,average='macro')
      precision_train[i,j,k]=precision_score(y_label_train,pred_train,average='macro')
      precision_test[i,j,k]=precision_score(y_label_test,pred_test,average='macro')

    # Save all results: one CSV per metric, split and classifier,
    # e.g. accuracy_train_adaboost.csv
    for k in range(n_clfs):
      for metric_name,(train_arr,test_arr) in metric_arrays.items():
        np.savetxt(out_dir+metric_name+'_train_'+clfs_names[k]+'.csv', train_arr[:,:,k], delimiter=',')
        np.savetxt(out_dir+metric_name+'_test_'+clfs_names[k]+'.csv', test_arr[:,:,k], delimiter=',')

# **Experiment using coordinates**

In [0]:
# Experiment 2: image patches plus point coordinates.
# Same protocol as the experiment without coordinates, except that two
# max-normalised coordinate columns are appended to every flattened patch
# before it is fed to the autoencoder and, through the encoder, to the classifiers.

# Full hyper-parameter grids; the result arrays have one cell per (b, s) pair.
bottleneck_grid = range(40,130,20)   # bottleneck widths b of the autoencoder
size_grid = range(5,16,2)            # values of s passed to extractor(size=s)
n_clfs = 6                           # number of classifiers compared below
grid_shape = (len(bottleneck_grid),len(size_grid))

mse_train=np.zeros(grid_shape)
mse_test=np.zeros(grid_shape)

accuracy_train=np.zeros(grid_shape+(n_clfs,))
accuracy_test=np.zeros(grid_shape+(n_clfs,))

f1_train=np.zeros(grid_shape+(n_clfs,))
f1_test=np.zeros(grid_shape+(n_clfs,))

recall_train=np.zeros(grid_shape+(n_clfs,))
recall_test=np.zeros(grid_shape+(n_clfs,))

precision_train=np.zeros(grid_shape+(n_clfs,))
precision_test=np.zeros(grid_shape+(n_clfs,))

# Folder on Google Drive that receives every CSV written by this experiment
out_dir='/content/drive/My Drive/Weak learners/programming/Artur/w_coordinates/'

# Metric arrays keyed by the prefix used in the output file names
metric_arrays = {'accuracy':(accuracy_train,accuracy_test),
                 'f1':(f1_train,f1_test),
                 'recall':(recall_train,recall_test),
                 'precision':(precision_train,precision_test)}

# Parameters we optimize our classifiers on (same order as `clfs` below)
parameters = [{"n_estimators":[20,30,50,100,150,200]},
              {"n_estimators":[50,100,150,200,300,400,500], "max_depth":[3,5,6,10]},
              {},
              {"n_estimators":[50,100,150,200,300,400,500], "max_depth":[3,5,6,10,15,20]},
              {'kernel':['rbf','poly','sigmoid'],'gamma':['scale','auto'],'decision_function_shape':['ovo', 'ovr']},
              {}]
clfs_names=['adaboost','xgb','catboost','rfc','svc','lr']

for j,s in enumerate(size_grid):   # j = column of s in the result arrays
  # Extract images of the needed size: (2*s)x(2*s)
  # unlabeled data used for autoencoder/NN
  sm_images1, points1_new,_ = extractor(sat_image,points1,size=s,normalize=True)
  sm_images2, points2_new,_ = extractor(sat_image,points2,size=s,normalize=True)
  unlabeled_patches = sm_images1+sm_images2
  x=len(unlabeled_patches)

  images_unlabeled = torch.tensor(np.array(unlabeled_patches),dtype=torch.float32).reshape(x,-1).to(device)
  # Scale each coordinate column by its own maximum and append both columns to the patches
  co = torch.tensor(np.array(points1_new+points2_new),dtype=torch.float32).to(device)
  co[:,0]=co[:,0]/torch.max(co[:,0])
  co[:,1]=co[:,1]/torch.max(co[:,1])
  images_unlabeled_co = torch.cat((images_unlabeled,co),1)
  # The autoencoder reconstructs its own input, so inputs and targets are the same tensor
  x_train, x_test, y_train, y_test = train_test_split(images_unlabeled_co, images_unlabeled_co,test_size=.20, shuffle=True,random_state=12)

  # labeled data used for classifiers
  patch2,coordinates2,labels2 = extractor(test2_img,test2_points,size=s,normalize=True,labeling=True)
  patch3,coordinates3,labels3 = extractor(test3_img,test3_points,size=s,normalize=True,labeling=True)
  patch4,coordinates4,labels4 = extractor(test4_img,test4_points,size=s,normalize=True,labeling=True)
  labeled_patches = patch2+patch3+patch4
  y=len(labeled_patches)

  images_labeled = np.array(labeled_patches).reshape(y,-1)
  # Force a float array: with an integer dtype the in-place division below
  # would silently truncate the normalised coordinates.
  # NOTE(review): labeled and unlabeled coordinates are scaled by different
  # maxima - confirm that this is intended.
  co = np.array(coordinates2+coordinates3+coordinates4,dtype=np.float64)
  co[:,0]=co[:,0]/np.max(co[:,0])
  co[:,1]=co[:,1]/np.max(co[:,1])
  images_labeled_co = np.concatenate((images_labeled,co),axis=1)

  labels = np.array(labels2+labels3+labels4).reshape(y,-1)
  x_label_train, x_label_test, y_label_train, y_label_test = train_test_split(images_labeled_co, labels,test_size=.20, shuffle=True, random_state=12, stratify=labels)

  # Upsample train data only; the test split keeps its original class balance
  ros = RandomOverSampler(random_state=12)
  x_label_train, y_label_train = ros.fit_resample(x_label_train, y_label_train)

  # Flattened patch length, and network input length (patch + 2 coordinates).
  # NOTE(review): the factor 8 is presumably the number of raster bands - confirm against extractor.
  patch_dim = 8*(2*s)**2
  input_dim = 2+patch_dim

  for i,b in enumerate(bottleneck_grid):   # i = row of b in the result arrays
    # Hidden width: midpoint between the patch length (without the two
    # coordinate features) and the bottleneck
    m=int(round((patch_dim+b)/2))

    # Create NN
    # input_size -> m -> b -> m -> output_size,
    # where input_size = output_size = 2+8*(2*s)^2
    encoder = nn.Sequential(

      # Between input and first hidden layer
      nn.Linear(input_dim,m),
      nn.BatchNorm1d(m),
      nn.ReLU(),

      # Between first hidden layer and bottleneck
      nn.Linear(m,b),
      nn.BatchNorm1d(b),
      nn.ReLU()
    )

    decoder = nn.Sequential(

      # Between bottleneck and second hidden layer
      nn.Linear(b,m),
      nn.BatchNorm1d(m),
      nn.ReLU(),

      # Between second hidden layer and output
      nn.Linear(m,input_dim),
      nn.Sigmoid(),
    )

    # Create optimization parameters of NN
    NN_net = AutoEncoder(encoder, decoder)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(NN_net.parameters())
    train_loader = torch_data.DataLoader(x_train,batch_size=300, shuffle=True)
    val_loader = torch_data.DataLoader(x_test,batch_size=300,shuffle=True)
    scheduler=None

    # Use 100 epochs
    lsstr, lssts = train(100, NN_net, criterion, optimizer, train_loader, val_loader, scheduler)

    # Compute the reconstructed pictures of our NN.
    # no_grad: this is pure inference on the whole data set, so building an
    # autograd graph would only waste memory.
    NN_net.eval()
    with torch.no_grad():
      y_train_pred = NN_net(x_train).cpu().numpy()
      y_test_pred = NN_net(x_test).cpu().numpy()

    # Compute the MSE between original images and the reconstructed output of the autoencoder
    mse_train[i,j]=mean_squared_error(y_train.cpu().detach().numpy(),y_train_pred)
    mse_test[i,j]=mean_squared_error(y_test.cpu().detach().numpy(),y_test_pred)

    # Save the MSE values for different bottleneck and image sizes
    # (written after every configuration so partial results survive an interrupted run)
    np.savetxt(out_dir+'mse_train.csv', mse_train, delimiter=',')
    np.savetxt(out_dir+'mse_test.csv', mse_test, delimiter=',')

    # Compress train and test data by our NN
    with torch.no_grad():
      x_label_train_pred = NN_net.encoder(torch.tensor(x_label_train,dtype=torch.float32).to(device)).cpu().numpy()
      x_label_test_pred = NN_net.encoder(torch.tensor(x_label_test,dtype=torch.float32).to(device)).cpu().numpy()

    # Initialize the classifiers we want to test
    clf1 = AdaBoostClassifier(random_state=12)
    clf2 = xgb.XGBClassifier(n_jobs=-1, random_state=12)
    clf3 = CatBoostClassifier(verbose=False,random_state=12)
    clf4 = RandomForestClassifier(n_jobs=-1, random_state=12)
    clf5 = SVC(random_state=12)
    clf6 = LogisticRegression(n_jobs=-1,random_state=12)
    clfs=[clf1,clf2,clf3,clf4,clf5,clf6]

    # Compute different measures on train and test set for every classifier.
    # k (not m) indexes the classifier so the hidden width m is not shadowed.
    for k in range(n_clfs):
      clf_temp = GridSearchCV(clfs[k], param_grid=parameters[k], cv=5, n_jobs=1, scoring='f1_macro',).fit(x_label_train_pred,y_label_train)
      pred_train = clf_temp.predict(x_label_train_pred)
      pred_test = clf_temp.predict(x_label_test_pred)
      accuracy_train[i,j,k]=accuracy_score(y_label_train,pred_train)
      accuracy_test[i,j,k]=accuracy_score(y_label_test,pred_test)
      f1_train[i,j,k]=f1_score(y_label_train,pred_train,average='macro')
      f1_test[i,j,k]=f1_score(y_label_test,pred_test,average='macro')
      recall_train[i,j,k]=recall_score(y_label_train,pred_train,average='macro')
      recall_test[i,j,k]=recall_score(y_label_test,pred_test,average='macro')
      precision_train[i,j,k]=precision_score(y_label_train,pred_train,average='macro')
      precision_test[i,j,k]=precision_score(y_label_test,pred_test,average='macro')

    # Save all results: one CSV per metric, split and classifier,
    # e.g. accuracy_train_adaboost.csv
    for k in range(n_clfs):
      for metric_name,(train_arr,test_arr) in metric_arrays.items():
        np.savetxt(out_dir+metric_name+'_train_'+clfs_names[k]+'.csv', train_arr[:,:,k], delimiter=',')
        np.savetxt(out_dir+metric_name+'_test_'+clfs_names[k]+'.csv', test_arr[:,:,k], delimiter=',')
