In [54]:
import os
import pickle
import sys

import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading data from file

In [7]:
# Root folder of the processed dataset; load_data reads from
# <DATA_PATH>/<year>/frames and <DATA_PATH>/<year>/masks.
DATA_PATH = '../../data/processed'

def load_data(year, img_h=1400, img_w=1400):
    """ Loads all frames and label masks of one year, cropped to a common size.

    Only ``.npy`` files inside ``<DATA_PATH>/<year>/frames`` whose name starts
    with 'R' are read. The mask of a frame ``<name>.npy`` is expected at
    ``<DATA_PATH>/<year>/masks/<name>_labels.npy``. Files are visited in sorted
    name order and frame and mask are appended together, so ``X[i]`` and
    ``y[i]`` always belong to the same file.

    Args:
        year: Name of the year sub-directory (converted with ``str``).
        img_h: Number of leading rows kept from every frame and mask.
        img_w: Number of leading columns kept from every frame and mask.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: the cropped frames (third axis kept
        whole) and the cropped masks, each stacked along a new leading axis.
    """
    data_dir = os.path.join(DATA_PATH, str(year))
    frames_dir = os.path.join(data_dir, 'frames')
    masks_dir = os.path.join(data_dir, 'masks')

    X = []
    y = []

    for file in sorted(os.listdir(frames_dir)):
        # splitext keeps dots inside the stem intact and lets us skip stray
        # non-.npy files (e.g. 'R1.txt'), which would otherwise make the same
        # frame get loaded a second time.
        stem, ext = os.path.splitext(file)
        if not file.startswith('R') or ext != '.npy':
            continue

        frame = np.load(os.path.join(frames_dir, file))
        mask = np.load(os.path.join(masks_dir, stem + '_labels.npy'))
        X.append(frame[:img_h, :img_w, :])
        y.append(mask[:img_h, :img_w])

    return np.array(X), np.array(y)

## Data preprocessing

In [14]:
def reshape_data(X, y):
    """ Flattens the data into a pixel table.

    X is reshaped to 2-D, keeping its last axis (the spectral bands) as the
    columns so that every row is one pixel; y is flattened to 1-D.

    Returns:
        Tuple ``(X_reshaped, y_reshaped)``.
    """
    n_bands = X.shape[-1]

    pixels = X.reshape((-1, n_bands))
    labels = y.reshape((-1,))

    return pixels, labels

def standardizer(X):
    """ Standardizes features by removing the mean and scaling to unit variance.

    A new StandardScaler is fitted on X at every call, so the statistics
    always come from the very array that is being transformed.
    """
    return StandardScaler().fit_transform(X)

## Building model pipeline

In [60]:
# Directory into which save_model writes the pickled model.
MODEL_PATH = '../../models'

def build_pipeline():
    """ Builds logistic regression pipeline with standardization

    The scaler is a real pipeline step, so its mean and variance are learned
    once in ``fit`` and re-applied unchanged in ``predict``. Wrapping a
    fit_transform call in a FunctionTransformer would instead re-fit the
    scaler on whatever array is passed at prediction time.

    Returns:
        Unfitted ``Pipeline`` with the steps 'standarize' and 'log_reg'.
    """
    # Step names are kept as-is because other code looks them up by key.
    return Pipeline([
        ('standarize', StandardScaler()),
        ('log_reg', LogisticRegression(solver='lbfgs', max_iter=300)),
    ])

def save_model(pipeline, filename):
    """ Pickles the classifier step of a pipeline into MODEL_PATH.

    Only the 'log_reg' step is stored, not the whole pipeline, so whoever
    loads the file has to standardize the inputs before calling the model.

    Args:
        pipeline: Object exposing ``named_steps['log_reg']``.
        filename: Name of the pickle file created inside MODEL_PATH.
    """
    model = pipeline.named_steps['log_reg']

    # Create the target folder on first use instead of failing on open().
    os.makedirs(MODEL_PATH, exist_ok=True)

    # The context manager guarantees the handle is flushed and closed.
    with open(os.path.join(MODEL_PATH, filename), 'wb') as model_file:
        pickle.dump(model, model_file)

## Metrics

In [44]:
def intersection_over_union(confusion_matrix):
    """ Intersection-over-union metric for image segmentation

    The matrix is flattened with ``ravel()`` and unpacked as
    (tn, fp, fn, tp); the result is tp divided by tp + fn + fp.
    """
    _, false_pos, false_neg, true_pos = confusion_matrix.ravel()

    union = true_pos + false_neg + false_pos
    return true_pos / union

def print_metrics(y_true, y_pred):
    """ Prints accuracy, IoU, confusion matrix and classification report.

    Each metric is printed right after it is computed, in that order.
    """
    matrix = confusion_matrix(y_true, y_pred)

    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy: %.2f' % accuracy)

    iou = intersection_over_union(matrix)
    print('Intersection-over-Union: %.2f' % iou)

    print('Confusion Matrix: \n', matrix)

    report = classification_report(y_true, y_pred)
    print('Classification report:\n', report)

## Training on 2017 data 

In [57]:
if __name__ == "__main__":

    # Fixed seed so the shuffled train/validation split, and therefore the
    # reported metrics, are the same on every Restart & Run All.
    SEED = 42

    # loading and reshaping data
    X, y = load_data(year=2017)
    X, y = reshape_data(X, y)

    # train/test split (20% of the pixels are held out for validation)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=SEED)

    # training
    log_reg_pipeline = build_pipeline()
    log_reg_pipeline.fit(X_train, y_train)

    # save model
    save_model(log_reg_pipeline, 'log_reg.pkl')

    # predicting
    y_pred = log_reg_pipeline.predict(X_val)

    # results
    print_metrics(y_val, y_pred)

Accuracy: 0.92
Intersection-over-Union: 0.77
Confusion Matrix: 
 [[7798704  322010]
 [ 580550 3058736]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95   8120714
           1       0.90      0.84      0.87   3639286

    accuracy                           0.92  11760000
   macro avg       0.92      0.90      0.91  11760000
weighted avg       0.92      0.92      0.92  11760000

