In [5]:
import os
import numpy as np

In [22]:
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Loading and shaping data

In [8]:
DATA_PATH = '../../data/processed'

def load_data(year, img_h=1400, img_w=1400):

    X = []
    y = []

    data_dir = os.path.join(DATA_PATH, str(year))
    frames_dir = os.path.join(data_dir, 'frames')
    masks_dir = os.path.join(data_dir, 'masks')

    for file in sorted(os.listdir(frames_dir)):
        if file.startswith('R'):
            filename = file.split('.')[0]

            frame = np.load(os.path.join(frames_dir, filename + '.npy'))
            mask = np.load(os.path.join(masks_dir, filename + '_labels.npy'))
            X.append(frame[:img_h, :img_w, :])
            y.append(mask[:img_h, :img_w])

    X = np.array(X)
    y = np.array(y)

    return X, y

In [9]:
# Reshape data so that columns are spectral bands and rows are pixels

def reshape_data(X, y):
    X_reshaped = X.reshape(-1, X.shape[-1])
    y_reshaped = y.reshape(-1)

    return X_reshaped, y_reshaped

## Logistic regression pipeline for pixel-based classification

In [17]:
def standardizer(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

def build_pipeline():  
    pipeline = Pipeline([
        ('standarize', FunctionTransformer(standardizer, validate=False)),
        ('log_reg', LogisticRegression(solver='lbfgs', max_iter=300))])
    
    return pipeline

## Train/val split

In [45]:
X, y = load_data(2017)
X, y = reshape_data(X, y)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

## Training model

In [19]:
log_reg_pipeline = build_pipeline()
log_reg_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standarize',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function standardizer at 0x122770f28>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, pass_y='deprecated',
                                     validate=False)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=300,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

## Feature importance based on model coefficients

In [43]:
features = np.abs(log_reg_pipeline['log_reg'].coef_)
sorted_index = np.argsort(features)
sorted_index

array([[10,  9,  7,  3,  4,  0, 11, 12,  1,  2,  8,  6,  5]])

## Validating model on 2017 data

In [20]:
y_pred = log_reg_pipeline.predict(X_val)

In [21]:
# Intersection-over-union metric for image segmentation
def iou(confusion_matrix):
    tn, fp, fn, tp = confusion_matrix.ravel()
    iou = tp / (tp + fn + fp)
    return iou

In [23]:
val_confusion_matrix = confusion_matrix(y_val, y_pred)

print('Accuracy: %.2f' % accuracy_score(y_val, y_pred))
print('Intersection-over-Union: %.2f' % iou(val_confusion_matrix))
print('Confusion Matrix: \n', val_confusion_matrix)
print('Classification report:\n', classification_report(y_val, y_pred))

Accuracy: 0.92
Intersection-over-Union: 0.77
Confusion Matrix: 
 [[7798564  322616]
 [ 580054 3058766]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95   8121180
           1       0.90      0.84      0.87   3638820

    accuracy                           0.92  11760000
   macro avg       0.92      0.90      0.91  11760000
weighted avg       0.92      0.92      0.92  11760000

