In [1]:
# Import packages
import os
import re  # regular expressions
import warnings
import matplotlib.pyplot as plt
import numpy as np
import numpy.ma as ma
import rasterio as rio
from rasterio.plot import plotting_extent
import geopandas as gpd
import earthpy as et
import earthpy.plot as ep
import earthpy.spatial as es
import earthpy.mask as em

import rioxarray as rxr

from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

from sklearn.datasets import dump_svmlight_file

from glob import glob

from queue import Queue

warnings.simplefilter('ignore')

# Set working directory
os.chdir(os.path.join(et.io.HOME, 'BD', 'BA_DATA'))

In [2]:
# Constants
d_limit = 1200
tem_conversion = 0.02 # convert to Kelvin temperature
vege_fillvalue = -3000
vege_lowerest = -2000
NE_fillvalue = 32767
tem_fillvalue = 0
radius = 50

vege_len = 457
tem_len = 914
NE_len = 914
thermal_len = 914

In [3]:
vege_fs84 = glob('VegeData_16day/*h08v04*')
vege_fs85 = glob('VegeData_16day/*h08v05*')

In [4]:
tem_fs84 = glob('TemData_8day/*h08v04*')
tem_fs85 = glob('TemData_8day/*h08v05*')

In [5]:
NE_fs84 = glob('NE_Data_8day/*h08v04*')
NE_fs85 = glob('NE_Data_8day/*h08v05*')

In [6]:
thermal_fs84 = glob('ThermalData_8day/*h08v04*')
thermal_fs85 = glob('ThermalData_8day/*h08v05*')

In [7]:
len(NE_fs85)

914

In [8]:
len(NE_fs84)

914

In [9]:
len(tem_fs85)

914

### Mask function

Pick any location, divide the neighborhood into 10 regions of interest in various directions, each region is within 50 km, 36 degree and 50 km sector as a region. 

In [10]:
def sector_mask(shape,centre,radius,angle_range):
    """
    Return a boolean mask for a circular sector. The start/stop angles in  
    `angle_range` should be given in clockwise order.
    """

    x,y = np.ogrid[:shape[0],:shape[1]]
    cx,cy = centre
    tmin,tmax = np.deg2rad(angle_range)

    # ensure stop angle > start angle
    if tmax < tmin:
            tmax += 2*np.pi

    # convert cartesian --> polar coordinates
    r2 = (x-cx)*(x-cx) + (y-cy)*(y-cy)
    theta = np.arctan2(x-cx,y-cy) - tmin

    # wrap angles between 0 and 2*pi
    theta %= (2*np.pi)

    # circular mask
    circmask = r2 <= radius*radius

    # angular mask
    anglemask = theta <= (tmax-tmin)

    return circmask*anglemask

In [11]:
def compute_feature_set(matrix, fill_value, new_value, row, colmn):
    """
    Return a feature set of a single pixel with row number ROW and column number COLUMN in the 1200 * 1200 matrix MATRIX.
    Feature set is computed based on the average value in 10 regions in one time frame from FILENAMES.
    Fill_value will be replaced by new_value in the feature matrix.
    """
    feature_set = []
    for i in range(10):
        mask = sector_mask(matrix.shape, (row, column), radius, (i, i + 36))
        sector = matrix[mask]
        sector[:][sector[:] == fill_value] = new_value
        mean_value = np.mean(sector)
        feature_set.append(mean_value)
    return feature_set

In [12]:
def compute_vege_feature_set(matrix, row, colmn):
    """
    Return a vegetation indices feature set of a single pixel with row number ROW and column number COLUMN 
    in the 1200 * 1200 matrix MATRIX.
    Feature set is computed based on the average value in 10 regions in one time frame from FILENAMES.
    """
    feature_set = []
    for i in range(10):
        mask = sector_mask(matrix.shape, (row, column), radius, (i, i + 36))
        sector = matrix[mask]
        sector[:][sector[:] == vege_fillvalue] = vege_lowerest # change fillvalue
        mean_value = np.mean(sector)
        feature_set.append(mean_value)
    return feature_set

In [13]:
def compute_thermal_feature_set(matrix, row, colmn):
    """
    Return a thermal anomalies feature set of a single pixel with row number ROW and column number COLUMN 
    in the 1200 * 1200 matrix MATRIX.
    Feature set is computed based on the sum of value in 10 regions in one time frame from FILENAMES.
    """
    feature_set = []
    for i in range(10):
        mask = sector_mask(matrix.shape, (row, column), radius, (i, i + 36))
        sector = matrix[mask]
        sector[:][sector[:] < 7] = 0
        sector[:][sector[:] >= 7] = 1
        sum_value = np.sum(sector)
        feature_set.append(sum_value)
    return feature_set

In [14]:
def compute_tem_feature_set(matrix, row, column):
    """
    Return a surface temperature feature set of a single pixel with row number ROW and column number COLUMN 
    in the 1200 * 1200 matrix MATRIX.
    Feature set is computed based on the mean of value in 10 regions in one time frame from FILENAMES.
    """
    feature_set = [] # use array instead
    for i in range(10):
        mask = sector_mask(matrix.shape, (row, column), radius, (i, i + 36))
        sector = matrix[mask]
        #sector = sector[sector[:] != 0] # drop fillvalue
        mean_value = np.mean(sector)
        feature_set.append(mean_value)
    return feature_set

In [15]:
def compute_NE_feature_set(matrix, row, column):
    """
    Return a NE feature set of a single pixel with row number ROW and column number COLUMN 
    in the 1200 * 1200 matrix MATRIX.
    Feature set is computed based on the mean of value in 10 regions in one time frame from FILENAMES.
    """
    feature_set = []
    for i in range(10):
        mask = sector_mask(matrix.shape, (row, column), radius, (i, i + 36))
        sector = matrix[mask]
        mean_value = np.mean(sector)
        feature_set.append(mean_value)
    return feature_set

### Process 16-day hdf file

16-day for vegetation_data

In [16]:
def vege_features(file, sds, fill_value, new_value, radius):
    """
    Return feature sets of all pixels in the file FILE.
    SDS is NDVI or EVI.
    
    """
    all_bands = []
    with rio.open(file) as dataset:
        for name in dataset.subdatasets:
            if re.search(sds, name):
                with rio.open(name) as subdataset:
                    modis_meta = subdataset.profile
                    all_bands.append(subdataset.read(1))
    vege_modis = np.stack(all_bands)
    vege_matrix = vege_modis[0].reshape(d_limit,d_limit)
    feature_sets = np.zeros((d_limit, d_limit))
    for r in range(d_limit):
        for c in range(d_limit):
            temp = compute_feature_set(vege_martrix, fill_value, new_value, r, c)
            feature_sets[r][c] = temp

In [17]:
def compute_vege_features(filenames, num_file, sds, fill_value, new_value, radius):
    """
    Return vegetation indice feature sets for all files in FILENAMES
    """
    all_feature_sets = np.zeros((num_file, d_limit, d_limit))
    
    index = 0
    
    for file in filenames:
        temp = vege_features(file, sds, file_value, new_value, radius)
        all_feature_sets[index] = temp
        index += 1
    return all_feature_sets

### Reduce dimension

In [18]:
def rebin(m, shape):
    """
    Reshape the input matrix A to the shape SHAPE.
    """
    sh = shape[0],m.shape[0]//shape[0],shape[1],m.shape[1]//shape[1]
    return m.reshape(sh).mean(-1).mean(1)

### Process 8-day files

8-day for thermal_data, tem_data and NE_data

In [19]:
NE_fs85[0:4]

['NE_Data_8day\\MOD16A2.A2001001.h08v05.006.2017068145443.hdf',
 'NE_Data_8day\\MOD16A2.A2001009.h08v05.006.2017068172948.hdf',
 'NE_Data_8day\\MOD16A2.A2001017.h08v05.006.2017068204420.hdf',
 'NE_Data_8day\\MOD16A2.A2001025.h08v05.006.2017068232911.hdf']

In [20]:
NE_fs85[4:8]

['NE_Data_8day\\MOD16A2.A2001033.h08v05.006.2017069023644.hdf',
 'NE_Data_8day\\MOD16A2.A2001041.h08v05.006.2017069060452.hdf',
 'NE_Data_8day\\MOD16A2.A2001049.h08v05.006.2017069091436.hdf',
 'NE_Data_8day\\MOD16A2.A2001057.h08v05.006.2017069124001.hdf']

In [21]:
def compute_features_8day(filenames, num_file, radius, feature_type):
    """
    Combine every two 8-day files to compute feature sets. There are in total NUM_FILES in filenames. Used for 8-day datasets.
    Use a queue to help iterate through every two files.
    Input filenames is the result from using glob command
    Result is a 3d np array with all the SDS features using 10-region method
    """
    print(1)
    sds1 = None
    sds2 = None
    if feature_type == "thermal":
        sds1 = "FireMask"
    elif feature_type == "tem":
        sds1 = "LST_Day"
    elif feature_type == "NE":
        sds1 = ":ET_500m"
        sds2 = "PET"
    else:
        print("No such feature for 8-day")
        return
    print(2)
    file_count = num_file // 2
    all_feature_sets = np.zeros((file_count, 1080, 1080, 10))
    q = Queue(1000)
    print(3)
    for filename in filenames:
        q.put(filename)
    print(4)
    for i in range (file_count):
        f1 = q.get()
        f2 = q.get()
        f1_bands = []
        f2_bands = []
        f1_bands_se = []
        f2_bands_se = []
        # open two files in a round
        with rio.open(f1) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f1_bands.append(subdataset.read(1))
                if feature_type == "NE":
                    if re.search(sds2, name):
                        with rio.open(name) as subdataset:
                            modis_meta = subdataset.profile
                            f1_bands_se.append(subdataset.read(1))
        f1_modis = np.stack(f1_bands)
        f1_matrix = f1_modis[0].reshape(d_limit,d_limit)
        if feature_type == "NE":
            f1_modis_se = np.stack(f1_bands_se)
            f1_matrix_se = f1_modis_se[0].reshape(d_limit,d_limit)
        print(5)
        with rio.open(f2) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f2_bands.append(subdataset.read(1))
                if feature_type == "NE":
                    if re.search(sds2, name):
                        with rio.open(name) as subdataset:
                            modis_meta = subdataset.profile
                            f2_bands_se.append(subdataset.read(1))
        f2_modis = np.stack(f2_bands)
        f2_matrix = f2_modis[0].reshape(d_limit,d_limit)
        if feature_type == "NE":
            f2_modis_se = np.stack(f2_bands_se)
            f2_matrix_se = f2_modis_se[0].reshape(d_limit,d_limit)
        # combine two matrices
        if feature_type == "NE":
            f1_matrix = f1_matrix_se - f1_matrix
            f2_matrix = f2_matrix_se - f2_matrix
        combined_matrix = (f1_matrix + f2_matrix) / 2
        if feature_type == "tem":
            combined_matrix = combined_matrix * tem_conversion
        feature_sets = np.zeros((1080, 1080))
        print(6)
        for r in range(60, 1140):
            for c in range(60, 1140):
                if feature_type == "thermal":
                    temp = compute_thermal_feature_set(combined_matrix, r, c)
                elif feature_type == "tem":
                    temp = compute_tem_feature_set(combined_matrix, r, c)
                    print(temp)
                elif feature_type == "vege":
                    temp = compute_vege_feature_set(combined_matrix, r, c)
                else:
                    temp = compute_NE_feature_set(combined_matrix, r, c)
                feature_sets[r][c] = np.copy(temp)
        
        all_feature_sets[i] = np.copy(feature_sets)
        return all_feature_sets

In [28]:
def compute_TEMfeatures_8day(filenames, num_file, radius):
    """
    Combine every two 8-day files to compute feature sets. There are in total NUM_FILES in filenames. Used for 8-day datasets.
    Use a queue to help iterate through every two files.
    Input filenames is the result from using glob command
    Result is a 3d np array with all the SDS features using 10-region method
    """
    print(1)
    sds1 = "LST_Day"
    print(2)
    file_count = num_file // 2
    all_feature_sets = np.zeros((file_count, 1080, 1080, 10))
    q = Queue(1000)
    print(3)
    for filename in filenames:
        q.put(filename)
    print(4)
    for i in range (file_count):
        f1 = q.get()
        f2 = q.get()
        f1_bands = []
        f2_bands = []
        f1_bands_se = []
        f2_bands_se = []
        # open two files in a round
        with rio.open(f1) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f1_bands.append(subdataset.read(1))
        f1_modis = np.stack(f1_bands)
        f1_matrix = f1_modis[0].reshape(d_limit,d_limit)
        print(5)
        with rio.open(f2) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f2_bands.append(subdataset.read(1))
        f2_modis = np.stack(f2_bands)
        f2_matrix = f2_modis[0].reshape(d_limit,d_limit)
        # combine two matrices
        combined_matrix = (f1_matrix + f2_matrix) / 2
        combined_matrix = combined_matrix * tem_conversion
        feature_sets = np.zeros((1080, 1080, 10))
        print(6)
        for r in range(60, 1140):
            for c in range(60, 1140):
                print((r-60,c-60))
                temp = compute_tem_feature_set(combined_matrix, r, c)
                feature_sets[r-60][c-60] = np.copy(temp)
        
        all_feature_sets[i] = np.copy(feature_sets)
        return all_feature_sets

In [29]:
def compute_THERMALfeatures_8day(filenames, num_file, radius):
    """
    Combine every two 8-day files to compute feature sets. There are in total NUM_FILES in filenames. Used for 8-day datasets.
    Use a queue to help iterate through every two files.
    Input filenames is the result from using glob command
    Result is a 3d np array with all the SDS features using 10-region method
    """
    print(1)
    sds1 = "FireMask"
    print(2)
    file_count = num_file // 2
    all_feature_sets = np.zeros((file_count, 1080, 1080, 10))
    q = Queue(file_count)
    print(3)
    for filename in filenames:
        q.put(filename)
    print(4)
    for i in range (file_count):
        f1 = q.get()
        f2 = q.get()
        f1_bands = []
        f2_bands = []
        f1_bands_se = []
        f2_bands_se = []
        # open two files in a round
        with rio.open(f1) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f1_bands.append(subdataset.read(1))
        f1_modis = np.stack(f1_bands)
        f1_matrix = f1_modis[0].reshape(d_limit,d_limit)
        print(5)
        with rio.open(f2) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f2_bands.append(subdataset.read(1))
        f2_modis = np.stack(f2_bands)
        f2_matrix = f2_modis[0].reshape(d_limit,d_limit)
        # combine two matrices
        combined_matrix = (f1_matrix + f2_matrix) / 2
        feature_sets = np.zeros((1080, 1080))
        print(6)
        for r in range(60, 1140):
            for c in range(60, 1140):
                print((r-60,c-60))
                temp = compute_thermal_feature_set(combined_matrix, r, c)
                feature_sets[r-60][c-60] = np.copy(temp)
        
        all_feature_sets[i] = np.copy(feature_sets)
        return all_feature_sets

In [30]:
def compute_NEfeatures_8day(filenames, num_file, radius):
    """
    Combine every two 8-day files to compute feature sets. There are in total NUM_FILES in filenames. Used for 8-day datasets.
    Use a queue to help iterate through every two files.
    Input filenames is the result from using glob command
    Result is a 3d np array with all the SDS features using 10-region method
    """
    print(1)
    sds1 = ":ET_500m"
    sds2 = "PET"
    print(2)
    file_count = num_file // 2
    all_feature_sets = np.zeros((file_count, 1080, 1080, 10))
    q = Queue(num_file)
    print(3)
    for filename in filenames:
        q.put(filename)
    print(4)
    for i in range (file_count):
        f1 = q.get()
        f2 = q.get()
        f1_bands = []
        f2_bands = []
        f1_bands_se = []
        f2_bands_se = []
        # open two files in a round
        with rio.open(f1) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f1_bands.append(subdataset.read(1))
                if re.search(sds2, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f1_bands_se.append(subdataset.read(1))
        f1_modis = np.stack(f1_bands)
        f1_matrix = f1_modis[0].reshape(d_limit,d_limit)
        f1_modis_se = np.stack(f1_bands_se)
        f1_matrix_se = f1_modis_se[0].reshape(d_limit,d_limit)
        print(5)
        with rio.open(f2) as dataset:
            for name in dataset.subdatasets:
                if re.search(sds1, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f2_bands.append(subdataset.read(1))
                if re.search(sds2, name):
                    with rio.open(name) as subdataset:
                        modis_meta = subdataset.profile
                        f2_bands_se.append(subdataset.read(1))
        f2_modis = np.stack(f2_bands)
        f2_matrix = f2_modis[0].reshape(d_limit,d_limit)
        f2_modis_se = np.stack(f2_bands_se)
        f2_matrix_se = f2_modis_se[0].reshape(d_limit,d_limit)
        # combine two matrices
        f1_matrix = f1_matrix_se - f1_matrix
        f2_matrix = f2_matrix_se - f2_matrix
        combined_matrix = (f1_matrix + f2_matrix) / 2
        feature_sets = np.zeros((1080, 1080))
        print(6)
        for r in range(60, 1140):
            for c in range(60, 1140):
                print((r-60,c-60))
                temp = compute_NE_feature_set(combined_matrix, r, c)
                feature_sets[r-60][c-60] = np.copy(temp)
        
        all_feature_sets[i] = np.copy(feature_sets)
        return all_feature_sets

Within a region, exact a small set of features, and ask is there any fire observed in the region in the past 16 days. Consider the intensity of fire, number of pixels with fire (a pixel with fire for 3 days are considered as 3 fire), for a region in the past 16 days.

In [23]:
len(tem_fs85)

914

In [24]:
len(vege_fs85)

457

In [None]:
Tem = compute_TEMfeatures_8day(tem_fs85[0:4], 4, 50)

1
2
3
4
5
6
(0, 0)
(0, 1)
(0, 2)
(0, 3)
(0, 4)
(0, 5)
(0, 6)
(0, 7)
(0, 8)
(0, 9)
(0, 10)
(0, 11)
(0, 12)
(0, 13)
(0, 14)
(0, 15)
(0, 16)
(0, 17)
(0, 18)
(0, 19)
(0, 20)
(0, 21)
(0, 22)
(0, 23)
(0, 24)
(0, 25)
(0, 26)
(0, 27)
(0, 28)
(0, 29)
(0, 30)
(0, 31)
(0, 32)
(0, 33)
(0, 34)
(0, 35)
(0, 36)
(0, 37)
(0, 38)
(0, 39)
(0, 40)
(0, 41)
(0, 42)
(0, 43)
(0, 44)
(0, 45)
(0, 46)
(0, 47)
(0, 48)
(0, 49)
(0, 50)
(0, 51)
(0, 52)
(0, 53)
(0, 54)
(0, 55)
(0, 56)
(0, 57)
(0, 58)
(0, 59)
(0, 60)
(0, 61)
(0, 62)
(0, 63)
(0, 64)
(0, 65)
(0, 66)
(0, 67)
(0, 68)
(0, 69)
(0, 70)
(0, 71)
(0, 72)
(0, 73)
(0, 74)
(0, 75)
(0, 76)
(0, 77)
(0, 78)
(0, 79)
(0, 80)
(0, 81)
(0, 82)
(0, 83)
(0, 84)
(0, 85)
(0, 86)
(0, 87)
(0, 88)
(0, 89)
(0, 90)
(0, 91)
(0, 92)
(0, 93)
(0, 94)
(0, 95)
(0, 96)
(0, 97)
(0, 98)
(0, 99)
(0, 100)
(0, 101)
(0, 102)
(0, 103)
(0, 104)
(0, 105)
(0, 106)
(0, 107)
(0, 108)
(0, 109)
(0, 110)
(0, 111)
(0, 112)
(0, 113)
(0, 114)
(0, 115)
(0, 116)
(0, 117)
(0, 118)
(0, 119)
(0, 120)
(0, 121)


### Time Series Features

In [54]:
X = np.ones((3,4))
Y = np.zeros(3)

In [55]:
X

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [56]:
Y.shape

(3,)

In [57]:
dump_svmlight_file(X,Y,os.path.join("test_sample1"),zero_based=False)

In [53]:
XX = np.ones((5,4))
YY = np.ones(5)
dump_svmlight_file(XX,YY,os.path.join("test_sample2"),zero_based=False)

In [86]:
a = np.append(Y, [4, 5, 6, 7, 8, 9])

In [92]:
b = np.zeros((3,3,2))

In [120]:
c = [1.0, 2.1]

In [93]:
b[0][0]

array([0., 0.])

In [121]:
b[0][0] = np.copy(c)

In [122]:
b

array([[[1. , 2.1],
        [0. , 0. ],
        [0. , 0. ]],

       [[0. , 0. ],
        [0. , 0. ],
        [0. , 0. ]],

       [[0. , 0. ],
        [0. , 0. ],
        [0. , 0. ]]])