In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [None]:
!pip install py7zr
import py7zr
import os

# Make sure the extraction targets exist. `exist_ok=True` makes this a no-op
# when the directory is already there, without a separate existence check.
os.makedirs('/kaggle/train/', exist_ok=True)
os.makedirs('/kaggle/test/', exist_ok=True)

# Unpack the competition's 7z archives into the scratch directories.
with py7zr.SevenZipFile("/kaggle/input/statoil-iceberg-classifier-challenge/train.json.7z", 'r') as archive:
    archive.extractall(path="/kaggle/train")

with py7zr.SevenZipFile("/kaggle/input/statoil-iceberg-classifier-challenge/test.json.7z", 'r') as archive:
    archive.extractall(path="/kaggle/test")



In [None]:
# Load the extracted JSON files into DataFrames (train first, then test).
df_train, df_test = (
    pd.read_json(json_path)
    for json_path in (
        '/kaggle/train/data/processed/train.json',
        '/kaggle/test/data/processed/test.json',
    )
)

In [None]:
# Preview only the first rows; each row carries two 5625-element band lists.
df_train.head()

In [None]:
# Count rows whose incidence angle is the literal string "na" and report the share.
na_mask = df_train['inc_angle'] == "na"
missing = int(na_mask.sum())
missing_pct = np.around(missing / len(df_train) * 100, 1)
print(missing_pct, '% of the inc_angle data is missing')

# Reshape images to their original 75x75 size
From the Data Description: "band_1, band_2 - the flattened image data. Each band has 75x75 pixel values in the list, so the list has 5625 elements."

In [None]:
# Turn each flattened 5625-element band into a 75x75 image and stack them
# into one array per band (first axis = sample).
band_1, band_2 = (
    np.array([np.reshape(flat, (75, 75)) for flat in df_train[column]])
    for column in ('band_1', 'band_2')
)

# Composing a third band
From the background section: "you will see data with two channels: HH (transmit/receive horizontally) and HV (transmit horizontally and receive vertically)."  
Composing a third band of horizontal and vertical feedback should then be as simple as band_3 = band_1 + band_2, since our data is in decibels, which is a logarithmic unit.

In [None]:
# Two candidate third channels derived from the measured bands.
band_3 = np.add(band_1, band_2)  # element-wise sum of the two bands
mean_3 = band_3 / 2              # element-wise mean of the two bands

I really should read docs more often. https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html#matplotlib.pyplot.imshow
I struggled to understand why the images looked the way they did when I plotted them using imshow.  
As it turns out, if you give imshow 2D scalar data, it will render it as a pseudocolor image where values are mapped to colors using normalization and a colormap.  
So the plots are not representative of what I'm putting into my model.  
This processing is not done when calling imshow on RGB data, so after creating a pseudo-RGB image with the createIm function, the plots change their appearance again.

In [None]:
# Show the first training sample in each single-channel representation.
# Every panel is 2D scalar data, so imshow renders it as a pseudocolor image
# (values normalized and mapped through a colormap). Titles are set so the
# figure can be read on its own.
panels = [
    (band_1[0], 'band_1'),         # horizontal feedback
    (band_2[0], 'band_2'),         # vertical feedback
    (band_3[0], 'band_3 (sum)'),   # the two bands combined by addition
    (mean_3[0], 'mean_3 (mean)'),  # mean of the first two bands
]
fig, ax = plt.subplots(1, 4, figsize=(10, 10))
for axis, (image, title) in zip(ax, panels):
    axis.imshow(image)
    axis.set_title(title)

In [None]:
def _rescale_unit(band):
    """Min-max rescale ``band`` to the range [0, 1].

    A band with no spread (max == min) is returned as all zeros instead of
    dividing by zero.
    """
    low = band.min()
    span = band.max() - low
    if span == 0:
        # Constant image: (band - low) / span would be 0/0 -> NaN.
        return np.zeros_like(band, dtype=float)
    return (band - low) / span


def createIm(data, size=75):
    """Build pseudo-RGB images from the two flattened bands of each sample.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide ``data['band_1']`` and ``data['band_2']``, each an
        iterable of flattened images with ``size * size`` values.
    size : int, default 75
        Side length of the square image each flattened band is reshaped to.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, size, size, 3)``. The channels are
        band_1, band_2 and band_1 + band_2, each independently rescaled
        to [0, 1]. An input with no samples yields shape
        ``(0, size, size, 3)``.
    """
    imgs = []

    # Iterate over the two columns directly; only the band values are needed.
    for flat_1, flat_2 in zip(data['band_1'], data['band_2']):
        b_1 = np.asarray(flat_1, dtype=float).reshape(size, size)
        b_2 = np.asarray(flat_2, dtype=float).reshape(size, size)
        b_3 = b_1 + b_2

        # Rescale every channel on its own so each one spans 0 .. 1,
        # whatever the sign of its minimum.
        channels = tuple(_rescale_unit(band) for band in (b_1, b_2, b_3))
        imgs.append(np.dstack(channels))

    if not imgs:
        # Keep the image dimensions even when there is nothing to convert.
        return np.empty((0, size, size, 3))
    return np.array(imgs)

In [None]:
# Split the training rows by label (0 = ship, 1 = iceberg) and build the
# pseudo-RGB images for each class.
ship = df_train.loc[df_train['is_iceberg'] == 0]
iceberg = df_train.loc[df_train['is_iceberg'] == 1]
ship_img, iceberg_img = createIm(ship), createIm(iceberg)

In [None]:
# Sanity check: dimensions of the stacked ship images.
np.shape(ship_img)

In [None]:
# Preview the first composed ship image (3-channel data from createIm).
plt.imshow(ship_img[0, ...])

In [None]:
# One output folder per class. `exist_ok=True` keeps the cell re-runnable:
# without it a second run raises FileExistsError.
os.makedirs('train/ship', exist_ok=True)
os.makedirs('train/iceberg', exist_ok=True)

In [None]:
# Write every image as a PNG into its class folder, numbered by its position
# within the class.
for name_template, images in (
    ('train/ship/ship_%d.png', ship_img),
    ('train/iceberg/iceberg_%d.png', iceberg_img),
):
    for idx, image in enumerate(images):
        plt.imsave(name_template % idx, image)

In [None]:
# List the exported class folders (shell command).
# NOTE(review): the folders were created under the relative path `train/`, so
# this assumes the notebook's working directory is /kaggle/working — confirm.
!ls /kaggle/working/train

In [None]:
from fastai.imports import *
# Root directory handed to the data loaders below; the PNGs were written under
# the relative `train/` folder, so this assumes the notebook's working
# directory is /kaggle/working.
path = Path('/kaggle/working/train')
# NOTE(review): BASE_PATH is presumably used by fastai/fastcore to display
# paths relative to this root — confirm against the fastcore docs.
Path.BASE_PATH = path
# `ls` is not part of the standard pathlib API; presumably it is patched in by
# the fastai import above and lists the directory contents.
path.ls()

In [None]:
from fastai.vision.all import *
# Build the data loaders from the class folders under `path`, with a
# validation fraction of 0.2, a fixed seed, and a Resize(224) item transform.
dls = ImageDataLoaders.from_folder(
    path,
    valid_pct=0.2,
    seed=42,
    item_tfms=Resize(224),
)

In [None]:
# Create a resnet50-based learner that tracks the error rate and trains with
# a flattened cross-entropy loss.
# NOTE(review): newer fastai releases reportedly rename `cnn_learner` to
# `vision_learner` — confirm against the installed version.
learn = cnn_learner(
    dls,
    resnet50,
    loss_func=CrossEntropyLossFlat(),
    metrics=error_rate,
)
# Run the learning-rate finder and keep its two suggestions.
# NOTE(review): the number and names of the values returned by `lr_find`
# depend on the fastai version (newer releases take a `suggest_funcs`
# argument and may return a single suggestion by default) — confirm that this
# two-value unpacking matches the installed version.
lr_min, lr_steep = learn.lr_find()

In [None]:
# Fine-tune for three epochs, using the `lr_min` suggestion as the base learning rate.
learn.fine_tune(epochs=3, base_lr=lr_min)

In [None]:
# Display a sample of the trained learner's results
# (presumably predictions next to their targets — see fastai `Learner.show_results`).
learn.show_results()

In [None]:
# Build an interpretation object from the trained learner and plot the nine
# samples with the highest loss.
interpreter = ClassificationInterpretation.from_learner(learn)
interpreter.plot_top_losses(k=9, figsize=(20, 10))