In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import glob, os

# Read in files

This is pretty routine stuff.

* We get a list of jpeg files, reading them in as needed with `matplotlib.pyplot.imread`.

In [None]:
from subprocess import check_output
# List the input directory with the standard library instead of shelling out
# to `ls`, so the cell does not depend on a POSIX shell utility.
print("\n".join(sorted(os.listdir("../input"))))
# glob returns paths in arbitrary order; sort so the preview below (and any
# list derived from smjpegs) is stable between runs.
smjpegs = sorted(glob.glob("../input/train_sm/*.jpeg"))
print(smjpegs[:9])

In [None]:
# Select the files of set 175. Match the file-name prefix "set175_" rather than
# a bare substring of the whole path, so that e.g. "set1750_1.jpeg" (or a
# directory whose name contains "set175") is not picked up by accident.
set175 = [smj for smj in smjpegs if os.path.basename(smj).startswith("set175_")]
print(set175)

# Basic exploration

Just look at image dimensions, confirm it's 3 band (RGB), byte scaled (0-255).

In [None]:
# Read the first scene of set 175 and report the shape of the resulting array.
first = plt.imread('../input/train_sm/set175_1.jpeg')
dims = first.shape
print(dims)

In [None]:
# Smallest and largest sample value over the whole first image.
first.min(), first.max()

For any image-specific transform (classification, clustering, etc.) we'll want to
collapse the spatial dimensions so that we have a matrix of pixels by color channels.

In [None]:
# Collapse the two spatial axes: one row per pixel, one column per channel.
pixel_matrix = first.reshape(dims[0] * dims[1], dims[2])
print(pixel_matrix.shape)

Scatter plots are a go-to for spotting clusters and separability in the data, but they are busy and don't reveal density well, so we
switch to using 2d histograms instead. The data between bands is highly correlated, which is typical of
visible imagery and is why most satellite image analysts prefer to at least have near infrared values.

In [None]:
#plt.scatter(pixel_matrix[:,0], pixel_matrix[:,1])
# 2-D histogram (50 x 50 bins) of channel 1 against channel 2 over every pixel
# of the first image; the return value is bound to `_` to keep the cell output clean.
_ = plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))

In [None]:
# Read the fifth scene of set 175 and flatten it to a pixels-by-channels matrix.
# Note that `dims` now describes the fifth image, not the first.
fifth = plt.imread('../input/train_sm/set175_5.jpeg')
dims = fifth.shape
pixel_matrix5 = fifth.reshape(dims[0] * dims[1], dims[2])

In [None]:
# Same 50 x 50 2-D histogram of channel 1 against channel 2, for the fifth image.
_ = plt.hist2d(pixel_matrix5[:,1], pixel_matrix5[:,2], bins=(50,50))

We can look at variations between the scenes now and see that there's a significant
amount of difference, probably due to sensor angle and illumination variation. Raw band
differences will need to be scaled or thresholded for any traditional approach.

In [None]:
# Channel 2 of the first image against channel 2 of the fifth image, paired by
# flattened pixel position (both matrices must have the same number of rows).
_ = plt.hist2d(pixel_matrix[:,2], pixel_matrix5[:,2], bins=(50,50))

In [None]:
# Display the first set175 scene.
plt.imshow(first)

In [None]:
# Display the fifth set175 scene.
plt.imshow(fifth)

Without coregistering portions of the image, the naive red band subtraction for change indication
basically just shows the location shift between images.

In [None]:
# Naive single-band difference between the first and fifth scenes.
# The images are byte scaled, so subtracting them directly wraps around modulo
# 256 (e.g. 10 - 20 becomes 246); cast to a signed type first so negative
# differences stay negative. The same channel index is used on both sides
# (the original subtracted channel 1 from channel 2).
# NOTE(review): plt.imread normally returns channels in RGB order, in which
# case index 2 is blue rather than red; confirm which band is intended.
plt.imshow(first[:,:,2].astype(np.int16) - fifth[:,:,2].astype(np.int16))

In [None]:
# Load the second scene of set 175 and show its channel-2 difference against
# the first scene.
second = plt.imread('../input/train_sm/set175_2.jpeg')
# Cast to a signed type before subtracting: the images are byte scaled, so a
# direct subtraction would wrap around modulo 256 instead of going negative.
plt.imshow(first[:,:,2].astype(np.int16) - second[:,:,2].astype(np.int16))

In [None]:
# Display the second set175 scene.
plt.imshow(second)

# Initial impressions

Images aren't registered, so an image registration process between images with common overlap would probably be the first step in a traditional approach.
Using a localizer in a deep learning context would probably be the newfangled way to tackle this.

Image content and differences will be dominated by topographic and built variations
due to sensor orientation and resolution differences between scenes. Some registration error will be impossible to factor out because
the imagery hasn't been orthorectified, and some ancillary data would be required for that
to be done, e.g. georeferencing against a previously orthorectified image.

So this is essentially a general computer vision task that deep learning will be a good fit for. The usual preprocessing steps
and data expectations you'd see in remote sensing aren't fulfilled by this dataset.

In [None]:
# simple k means clustering
from sklearn import cluster

# Cluster every pixel of the first image into 5 groups by color value.
# The seed is fixed because k-means initialisation is random: without it the
# cluster numbering changes from run to run, and later cells select pixels by
# cluster label.
kmeans = cluster.KMeans(n_clusters=5, random_state=0)
clustered = kmeans.fit_predict(pixel_matrix)

# `dims` was overwritten when the fifth image was loaded, so re-derive it from
# the first image before folding the labels back into image form.
dims = np.shape(first)
clustered_img = np.reshape(clustered, (dims[0], dims[1]))
plt.imshow(clustered_img)

In [None]:
# Show the original first scene again (this cell follows the k-means label image).
plt.imshow(first)

In [None]:
# Row indices into pixel_matrix of the pixels labelled 0, 1, 2 and 3.
# NOTE(review): k-means above was fit with 5 clusters, so label 4 is not
# collected here; confirm that is intentional.
ind0, ind1, ind2, ind3 = (np.flatnonzero(clustered == label) for label in range(4))

This code doesn't run on the server.

```python
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

plot_vals = [('r', 'o', ind0),
             ('b', '^', ind1),
             ('g', '8', ind2),
             ('m', '*', ind3)]

for c, m, ind in plot_vals:
    xs = pixel_matrix[ind, 0]
    ys = pixel_matrix[ind, 1]
    zs = pixel_matrix[ind, 2]
    ax.scatter(xs, ys, zs, c=c, marker=m)

ax.set_xlabel('Blue channel')
ax.set_ylabel('green channel')
ax.set_zlabel('Red channel')
```

In [None]:
# quick look at color value histograms for pixel matrix from first image
import seaborn as sns
# One 12-bin distribution plot per color channel (columns 0, 1, 2 of the
# pixel matrix), drawn one after another in the same cell.
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm
# the installed version still provides it.
sns.distplot(pixel_matrix[:,0], bins=12)
sns.distplot(pixel_matrix[:,1], bins=12)
sns.distplot(pixel_matrix[:,2], bins=12)

In [None]:
# even subsampling is throwing memory error for me, :p
#length = np.shape(pixel_matrix)[0]
#rand_ind = np.random.choice(length, size=50000)
#sns.pairplot(pixel_matrix[rand_ind,:])

# Day 2

We'll start by considering the entire sequence of a different image set this time and look at strategies
for matching features across scenes.

In [None]:
# Select the files of set 79. Match the file-name prefix "set79_" rather than a
# bare substring of the whole path, so that e.g. "set790_1.jpeg" or
# "set791_3.jpeg" is not picked up by accident.
set79 = [smj for smj in smjpegs if os.path.basename(smj).startswith("set79_")]
print(set79)

In [None]:
# Read the five scenes of set 79 (files set79_1.jpeg .. set79_5.jpeg) and bind
# each one to its own name.
img79_1, img79_2, img79_3, img79_4, img79_5 = (
    plt.imread("../input/train_sm/set79_" + str(idx) + ".jpeg")
    for idx in (1, 2, 3, 4, 5)
)

In [None]:
img_list = (img79_1, img79_2, img79_3, img79_4, img79_5)

# `n` is the 1-based number of the scene to display. It has to be set here:
# the comprehension variable used when loading the images does not leak into
# the notebook namespace on Python 3, so the original cell raised NameError
# (and on Python 2 it printed "Image 5" while showing the first image).
n = 1
print("Image " + str(n))
plt.figure(figsize=(8,10))
plt.imshow(img_list[n - 1])
plt.show()

Tracking dimensions across image transforms is annoying, so we'll make a class to do that.
I'm also going to apply a brightness normalization transform and visualize the image that
way, which makes a good test scenario for the class.

In [None]:
class MSImage():
    """Lightweight wrapper for handling image to matrix transforms. No setters,
    main point of class is to remember image dimensions despite transforms.

    Attributes (all set once in ``__init__``):
        img:  the image exactly as passed to the constructor.
        dims: ``np.shape(img)``, indexed as (rows, cols, bands).
        mat:  ``img`` reshaped to (rows * cols, bands), one row per pixel.
    """

    def __init__(self, img):
        """Store ``img`` together with its shape and pixels-by-bands matrix.

        Assumes the color channel interleave that holds true for this set:
        ``img`` must have at least three axes, with the bands on the third.
        """
        self.img = img
        self.dims = np.shape(img)
        self.mat = np.reshape(img, (self.dims[0] * self.dims[1], self.dims[2]))

    @property
    def matrix(self):
        """The image as a (rows * cols, bands) matrix."""
        return self.mat

    @property
    def image(self):
        """The image as originally supplied."""
        return self.img

    def to_flat_img(self, derived):
        """Use dims property to reshape a derived matrix back into image form when
        derived image would only have one band.

        ``derived`` must hold rows * cols values; the result has shape
        (rows, cols).
        """
        return np.reshape(derived, (self.dims[0], self.dims[1]))

    def to_matched_img(self, derived):
        """Use dims property to reshape a derived matrix back into image form.

        ``derived`` must hold rows * cols * bands values; the result has the
        same (rows, cols, bands) shape as the wrapped image.
        """
        return np.reshape(derived, (self.dims[0], self.dims[1], self.dims[2]))

In [None]:
# Wrap the first set79 scene and confirm both views: the flattened
# pixels-by-bands matrix and the untouched image.
msi79_1 = MSImage(img79_1)
print(msi79_1.matrix.shape)
print(msi79_1.img.shape)

I initially defined a @np.vectorize function for this, but a plain loop runs faster:
`np.vectorize` is a convenience wrapper around a Python-level loop, not true vectorization.

In [None]:
# Brightness-normalise the first set79 scene: divide every pixel (row) by the
# value of its brightest band, giving float32 values.
# The per-row maximum is computed for all rows at once and broadcast, instead
# of looping in Python over a hard-coded pixel count (7219900), so the cell
# works for any image size.
row_max = msi79_1.matrix.max(axis=1, keepdims=True)
# `where` skips rows whose maximum is 0 (pure black pixels); those keep the 0
# they were initialised with rather than becoming NaN from 0/0.
bnorm = np.divide(msi79_1.matrix, row_max,
                  out=np.zeros(msi79_1.matrix.shape, dtype=np.float32),
                  where=row_max != 0)

In [None]:
# Fold the normalised pixel matrix back into the first scene's image shape.
bnorm_img = msi79_1.to_matched_img(bnorm)

# Show the brightness-normalised scene on an 8 x 10 inch figure.
plt.figure(figsize=(8,10))
plt.imshow(bnorm_img)
plt.show()

In [None]:
# Wrap the second set79 scene so its pixel matrix and dimensions travel together.
msi79_2 = MSImage(img79_2)

def bnormalize(mat):
    """Scale every pixel of ``mat`` by the value of its brightest band.

    mat: 2-D array-like of shape (pixels, bands).

    Returns a float32 array of the same shape in which each row has been
    divided by that row's maximum. Rows whose maximum is 0 (pure black
    pixels) are returned as zeros instead of NaN.
    """
    mat = np.asarray(mat)
    # Per-row maximum, kept as a column so it broadcasts across the bands.
    row_max = mat.max(axis=1, keepdims=True)
    # `where` leaves the zero-initialised output untouched for all-zero rows,
    # avoiding the 0/0 division the per-row loop used to perform.
    return np.divide(mat, row_max,
                     out=np.zeros(mat.shape, dtype=np.float32),
                     where=row_max != 0)

# Brightness-normalise the second scene's pixel matrix, then fold the result
# back into that scene's image shape.
bnorm79_2 = bnormalize(msi79_2.matrix)
bnorm79_2_img = msi79_2.to_matched_img(bnorm79_2)

In [None]:
# Show the brightness-normalised second scene on an 8 x 10 inch figure.
plt.figure(figsize=(8,10))
plt.imshow(bnorm79_2_img)
plt.show()

In [None]:
# Wrap both brightness-normalised images so their pixel matrices can be compared.
msinorm79_1 = MSImage(bnorm_img)
msinorm79_2 = MSImage(bnorm79_2_img)

# Channel 2 of normalised scene 1 against channel 2 of normalised scene 2,
# paired by flattened pixel position, as a 50 x 50 2-D histogram.
_ = plt.hist2d(msinorm79_1.matrix[:,2], msinorm79_2.matrix[:,2], bins=(50,50))

In [None]:
# Same comparison between the two normalised scenes, for channel 1.
_ = plt.hist2d(msinorm79_1.matrix[:,1], msinorm79_2.matrix[:,1], bins=(50,50))

In [None]:
# Same comparison between the two normalised scenes, for channel 0.
_ = plt.hist2d(msinorm79_1.matrix[:,0], msinorm79_2.matrix[:,0], bins=(50,50))

In [None]:
# seaborn is imported again here; an earlier cell already imports it as `sns`.
import seaborn as sns
# One 12-bin distribution plot per channel of the brightness-normalised first
# set79 scene.
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm
# the installed version still provides it.
sns.distplot(msinorm79_1.matrix[:,0], bins=12)
sns.distplot(msinorm79_1.matrix[:,1], bins=12)
sns.distplot(msinorm79_1.matrix[:,2], bins=12)