In [13]:
import pandas as pd
import numpy as np
import glob as glob
import re
import os
import itertools as it
from sklearn.linear_model import LogisticRegression
from PIL import Image as img

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor

from skimage.io import imread
from skimage.color import rgb2gray
from skimage.transform import resize

from utils import create_submission, rle_encoding

In [68]:
### Parameters

subset_proportion = .0001
batch_size = 5000

img_height = 512
img_width = 512
img_channels = 3
smooth = 1e-9

In [15]:
### Functions

def get_img_id(img_path):
    
    img_basename = os.path.basename(img_path)
    img_id = os.path.splitext(img_basename)[0][:-len('_sat')]
    return img_id

def image_gen(img_paths, img_size=(img_height, img_width)):

    for img_path in img_paths:
        
        img_id = get_img_id(img_path)
        mask_path = os.path.join(path_to_train, img_id + '_msk.png')
        
        img = imread(img_path) / 255.
        mask = rgb2gray(imread(mask_path))
        
        img = resize(img, img_size, preserve_range=True)
        mask = resize(mask, img_size, mode='constant', preserve_range=True)
        mask = (mask >= 0.5).astype(float)
        
        yield img, mask

def dice_coef(y_true, y_pred):
    
    y_true_f = K.flatten(y_true)
    y_pred = K.cast(y_pred, 'float32')
    y_pred_f = K.cast(K.greater(K.flatten(y_pred), 0.5), 'float32')
    intersection = y_true_f * y_pred_f
    score = 2. * (K.sum(intersection) + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
    
    return score

def image_batch_generator(img_paths, batchsize=batch_size):
    
    while True:
        
        ig = image_gen(img_paths)
        batch_img, batch_mask = [], []
        
        for img, mask in ig:

            batch_img.append(img)
            batch_mask.append(mask)

            if len(batch_img) == batchsize:
                
                yield np.stack(batch_img, axis=0), np.expand_dims(np.stack(batch_mask, axis=0),axis = -1)
                batch_img, batch_mask = [], []
        
        if len(batch_img) != 0:
            yield np.stack(batch_img, axis=0), np.expand_dims(np.stack(batch_mask, axis=0),axis = -1)
            batch_img, batch_mask = [], []

def calc_steps(data_len, batchsize):
    
    return (data_len + batchsize - 1) // batchsize

def create_submission(csv_name, predictions, image_ids):
    """
    csv_name -> string for csv ("XXXXXXX.csv")
    predictions -> numpyarray of size (num_examples, height, width)
                In this case (num_examples, 512, 512)
    image_ids -> numpyarray or list of size (num_examples,)
    
    predictions[i] should be the prediciton of road for image_id[i]
    """
    sub = pd.DataFrame()
    sub['ImageId'] = image_ids
    encodings = []
    num_images = len(image_ids)
    for i in range(num_images):
        if (i+1) % (num_images//10) == 0:
            print(i, num_images)
        encodings.append(rle_encoding(predictions[i]))
        
    sub['EncodedPixels'] = encodings
    sub.to_csv(csv_name, index=False)

In [69]:
%%time
### Loading images

path_to_train = 'comp-540-spring-2019/train'

glob_train_imgs = os.path.join(path_to_train, '*_sat.jpg')
glob_train_masks = os.path.join(path_to_train, '*_msk.png')

train_img_paths = glob.glob(glob_train_imgs)
train_mask_paths = glob.glob(glob_train_masks)

ig = image_gen(train_img_paths)
train_pixels, train_masks = [], []
neighborhoods = [1 << exponent for exponent in range(1, 9)]

count = 0
for image, mask in ig:
    
    temp_img = pd.DataFrame(np.array(image).reshape((img_width * img_height, 3)))
    temp_img.columns = ["R", "G", "B"]
    temp_img["Lightness"] = (temp_img[["R", "G", "B"]].max(axis = 1) + temp_img[["R", "G", "B"]].max(axis = 1)) / 2
    
    for neighbor in neighborhoods:
        temp_n = img.fromarray(np.uint8(image)).resize((neighbor, neighbor))
        temp_n_low_res = temp_n.resize((img_width, img_height))
        temp_n_mat = pd.DataFrame(list(temp_n_low_res.getdata()))
        temp_n_mat.columns = ["R", "G", "B"]
        temp_img[("R_" + chr(neighbor))] = temp_n_mat["R"]
        temp_img[("G_" + chr(neighbor))] = temp_n_mat["G"]
        temp_img[("B_" + chr(neighbor))] = temp_n_mat["B"]
    
    temp_img['Mask'] = np.array(mask).reshape((img_width * img_height))
    temp_img = temp_img.sample(int((temp_img.shape[0]) * subset_proportion))
    
    train_masks.append(temp_img['Mask'])
    train_pixels.append(temp_img.drop('Mask', axis = 1))
    
    if(count == batch_size):
        break
    count += 1

train_masks = np.array(np.concatenate(train_masks, axis = 0))[:, np.newaxis]
train_pixels = np.concatenate(train_pixels, axis = 0)
train_data = pd.DataFrame(np.concatenate([train_pixels, train_masks], axis = 1))

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


KeyboardInterrupt: 

In [70]:
### Building features


324

In [2]:
### Model for predicting whether image has any roads
### Input: list of images

In [None]:
### Splitting into train and validation

train_X, test_X, train_y, test_y = train_test_split(train_data[train_data.columns[0:28]], \
                                                    train_data[train_data.columns[28]], test_size = 0.5)

In [66]:
### Model for road detection in images with roads
### Input: list of images which have been predicted to have roads

rf = RandomForestRegressor(n_estimators = 300, random_state = 123)

rffit = rf.fit(train_X, train_y)


In [67]:
### Evaluation metrics

pred_y = np.round(rf.predict(test_X))

pred_y = np.round(pred_y)
conf_mat = confusion_matrix(test_y, pred_y)
print(conf_mat)
print(classification_report(test_y, pred_y))
print(1 - dice(pred_y, test_y))

[[38545   179]
 [  459   138]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     38724
         1.0       0.44      0.23      0.30       597

   micro avg       0.98      0.98      0.98     39321
   macro avg       0.71      0.61      0.65     39321
weighted avg       0.98      0.98      0.98     39321

0.30196936542669583


In [None]:
### Model for smoothing mask predictions
### Input: list of masks