This notebook prepares the training data for CNN model training.

In [68]:
import collections
import glob
import os
import bz2
import pickle
import _pickle as cPickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load any compressed pickle file
def read_compressed_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = cPickle.load(data)
    return data

In [69]:
data_folder = '../deep_learning_trainer/generated_data_1s/'

files = glob.glob(os.path.join(data_folder, 'generate_*.pbz2'))

## Using only triggering ratio as input images.

This is only using the triggering ratio as input images, you can set use_n_images to control how many channels you want. If use_n_images == 3, means that every 3 images in time sequence will be used as the input to the model. It also has a parameter n_steady_threshold, which can reduce the cells which have less steady phones in it. 

In [79]:
X = []
y = []
meta = []

count_no_trig = 0

# Note, use only odd number of images, because I am taking np.median(labels)
# as the label of the instance, so if we are using 3 images, then [0, 0, 1]
# will have a label 0, and [0, 1, 1] will have a label 1.
use_n_images = 3
# this controls how to move the window, if it is 1, then we will move the 
# window for every image, and use_n_images controls the window length.
window_moving_step = 1

n_steady_threshold = 50

# whether we require a minimum ratio to be consider, 0.1 means anything
# smaller than 0.1 will be replaced with 0. This means, we only consider
# the ones with 0.1 triggering ratio
bottom_ratio_cap = 0.

for f in files:
    
    df_features, steady_stats, simulation_results, mat_ix = read_compressed_pickle(f)
    
    # get the cell has less than 50 steady phones
    ix_steady_mask = steady_stats['steady_phone_stats'] < n_steady_threshold
    
    for i in range(0, len(df_features) - use_n_images, window_moving_step):
        rows = df_features.iloc[i:i+use_n_images]
        
        # The last batch may contain less than use_n_images
        if len(rows) != use_n_images:
            break
        
        tmp_ratio = []
        tmp_label = []
        for ix, row in rows.iterrows():
            # If there is no trigger at all, we just skip it
            if np.all((row['trig_ratio_stats'] == 0)):
                count_no_trig += 1
                continue
            class_label = row['class_label']
            trig_ratio_stats = np.nan_to_num(row['trig_ratio_stats'])
            # Set the ratio to 0, if the steady phone in the cell less than n_steady_threshold
            trig_ratio_stats[ix_steady_mask] = 0
            
            trig_ratio_stats[trig_ratio_stats < bottom_ratio_cap] = 0
            # if there is no cell larger than bottom_ratio_cap, then 
            # we set it to zero, else, it remain the same. This basically
            # will set any earthquake triggers but ratio less than bottom_ratio_cap
            # to ignore.
            if trig_ratio_stats.max() < bottom_ratio_cap:
                class_label = 0
            tmp_ratio.append(trig_ratio_stats)
            tmp_label.append(class_label)
            
        meta.append(mat_ix)
        X.append(tmp_ratio)
        y.append(np.median(tmp_label))
#np.savez_compressed('training_small.npz', X=X, y=y, meta=meta, n_quake_trigs=n_quake_trigs)

In [80]:
np.array(X).shape

(1893859, 3, 30, 30)

In [81]:
collections.Counter(y)

Counter({0.0: 1555814, 1.0: 338045})

In [82]:
np.savez_compressed(f'training_small_nImg_{use_n_images}_nSteady_{n_steady_threshold}_1sWin_1imgStep.npz', X=X, y=y,
                    meta=meta, use_n_images=use_n_images,
                    n_steady_threshold=n_steady_threshold)

In [50]:
list(np.arange(3.5, 7.6, 0.5))

[3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5]