# Connect to drive, libraries

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

import os
# Project root. The trailing slash matters: later cells build paths by plain
# string concatenation (e.g. f'{directory}S2_BPWW_UTM/cloudfree/').
directory = '/content/drive/MyDrive/ColabNotebooks/autoencoders/'
# Local (non-Colab) alternative:
#directory = 'C:/Users/student/shelby'
# Make the project root the working directory.
os.chdir(directory)

Mounted at /content/drive


In [None]:
!pip install rioxarray

Collecting rioxarray
  Downloading rioxarray-0.9.1.tar.gz (47 kB)
[?25l[K     |███████                         | 10 kB 24.1 MB/s eta 0:00:01[K     |██████████████                  | 20 kB 13.0 MB/s eta 0:00:01[K     |████████████████████▉           | 30 kB 8.9 MB/s eta 0:00:01[K     |███████████████████████████▉    | 40 kB 4.8 MB/s eta 0:00:01[K     |████████████████████████████████| 47 kB 2.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting rasterio
  Downloading rasterio-1.2.10-cp37-cp37m-manylinux1_x86_64.whl (19.3 MB)
[K     |████████████████████████████████| 19.3 MB 5.2 MB/s 
[?25hCollecting pyproj>=2.2
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 49.3 MB/s 
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting affine
  Download

In [None]:
##############################################################################
### Packages and fun
import math
import os
from random import random
from random import seed

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import rioxarray as rxr
import xarray as xr
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split

# Functions

In [None]:
##############################################################################
### Define functions
# Leap-year test (Gregorian rules).
def is_leap_year(year):
    """Return True when ``year`` is a leap year, otherwise False.

    Century years are leap years only when divisible by 400; every other
    year is a leap year when divisible by 4.
    """
    divisor = 400 if year % 100 == 0 else 4
    return year % divisor == 0

# Calendar date -> day of year; depends on is_leap_year.
def get_doy(Y, M, D):
    """Return the day of the year for year ``Y``, month ``M`` and day ``D``.

    Formula from Jean Meeus, "Astronomical Algorithms", 2nd ed. (1998),
    chapter 7, with K = 1 in a leap year and K = 2 otherwise.
    1 January evaluates to 1.
    """
    K = 1 if is_leap_year(Y) else 2
    month_term = int((275 * M) / 9.0)
    leap_term = K * int((M + 9) / 12.0)
    return month_term - leap_term + D - 30

def cumulative_doy(image, begin_y=2017):
    """Return an image's day count measured from 1 January of ``begin_y``.

    Parameters
    ----------
    image : str
        File name whose ``YYYY-MM-DD`` date is extracted by ``filedate``.
    begin_y : int, optional
        First year of the time series (default 2017). The length of every
        full year from ``begin_y`` up to, but not including, the image's
        year is added to the image's day of year.

    Returns
    -------
    tuple
        ``(cum_doy, Y)``: the cumulative day count and the image's year.
        For an image dated in ``begin_y`` (or earlier) the plain day of
        year is returned, because no full year precedes it.
    """
    Y, M, D = splitter(filedate(image))
    doy = get_doy(Y, M, D)
    # Year lengths come from is_leap_year instead of a hand-maintained list,
    # so the result stays correct for any year and agrees with get_doy.
    elapsed_days = sum(366 if is_leap_year(year) else 365
                       for year in range(begin_y, Y))
    return doy + elapsed_days, Y

# Day of year -> cyclical (sine, cosine) encoding. Returns 2 values.
def cyclical_doy(doy, Y):
    """Encode ``doy`` as a point on the unit circle.

    The period is 366 days when ``Y`` is a leap year (per ``is_leap_year``)
    and 365 days otherwise. Returns ``(doy_sin, doy_cos)``.
    """
    days_in_year = 366. if is_leap_year(Y) else 365.
    angle = 2 * np.pi * doy / days_in_year
    return np.sin(angle), np.cos(angle)

def splitter(date):
    """Split a hyphen-separated ``'Y-M-D'`` string and return ``(y, m, d)`` as ints."""
    fields = date.split('-')
    y, m, d = (int(fields[position]) for position in range(3))
    return y, m, d

def one_step(image):
    """Derive all date features of an image file name in one call.

    Returns ``(y, dofy, doy_sin, doy_cos)``: the year, the day of year, and
    the sine/cosine encoding of that day of year.
    """
    y, m, d = splitter(filedate(image))
    dofy = get_doy(y, m, d)
    return (y, dofy) + tuple(cyclical_doy(dofy, y))

# Extract the date stamp from a file name that follows the project's naming
# scheme: the ten characters before the 4-character extension hold Y-M-D.
def filedate(x):
    """Return characters -14 up to (not including) -4 of ``x``,
    e.g. ``'2017-04-01'`` for ``'33UWP_122_2017-04-01.tif'``."""
    date_chars = slice(-14, -4)
    return x[date_chars]

# Pathways

In [None]:
##############################################################################
### Define directories
# Project root (data_dir) and the folder holding the cloud-free .tif images (cf_dir).
data_dir = '{}'.format(directory)
cf_dir = '{}S2_BPWW_UTM/cloudfree/'.format(directory)

# Get pixel info

In [None]:
##############################################################################
### Get 'clean' pixel data to work with
# Reference raster: it is opened only to convert map coordinates into
# array indices.
image = f'{cf_dir}33UWP_122_2017-04-01.tif'

# Pick pixel from image, only considering pixels with a LC class
csv_clean = f'{data_dir}S2_BPWW_UTM/33UWP_122_2019-06-30.csv'
clean_data = pd.read_csv(csv_clean)

# Only look at pixels with LC class = 4 (needle leaf)
needle_leaf = clean_data.loc[clean_data['LC'] == 4]
pixel_vals = needle_leaf.iloc[5000]  # arbitrarily chosen row 5000
# The value in the first CSV column is used as the pixel id. `.iloc[0]` makes
# the positional lookup explicit; plain `pixel_vals[0]` on a label-indexed
# Series relies on a deprecated positional fallback.
pixel = int(pixel_vals.iloc[0])

# Get easting, northing values
east = int(pixel_vals.x)
north = int(pixel_vals.y)

# Get the array indices of the chosen pixel. index(x, y) returns (row, col).
# The context manager closes the dataset once the lookup is done.
with rasterio.open(image) as pic:
    row, col = pic.index(east, north)

# DOY data

In [None]:
##############################################################################
### get date information for cumulative doy
# Map every .tif file name in the cloud-free folder to its cumulative day count.
test_dict = {}
for image in os.listdir(cf_dir):
    if not image.endswith('.tif'):
        continue
    doy, year = cumulative_doy(image)
    test_dict.update({image: doy})

In [None]:
##############################################################################
### Pixel loop in folder
# Build one feature row per cloud-free image for the chosen pixel:
# [cumulative doy, doy_sin, doy_cos, 10 band values].
count = 0
pixel_list = []

# sorted(): os.listdir returns names in arbitrary order. A fixed order keeps
# the row order, and therefore the seeded train/test split, reproducible.
for image in sorted(os.listdir(cf_dir)):
    if not image.endswith('.tif'):
        continue
    count += 1
    with rxr.open_rasterio(os.path.join(cf_dir, image)) as src:
        # `row, col` come from rasterio's index(x, y) -> (row, col): rows run
        # along the y dimension and columns along x. The original call had
        # the two swapped and sampled a different location.
        pix = src.isel(y=row, x=col)
        band_values = pix.to_numpy().tolist()

    # Keep layers 0-7, 9 and 10 (ten values, matching the ten band columns
    # of the table built later).
    # NOTE(review): layer 8 is skipped; presumably a band that is not wanted
    # in the feature set. Confirm against the raster's band order.
    features = band_values[0:8] + [band_values[9], band_values[10]]

    cum_doy, year = cumulative_doy(image)
    # The cyclical encoding must use the within-year day of year. Feeding it
    # the cumulative count shifts the phase once a leap year is involved.
    _, _, doy_sin, doy_cos = one_step(image)

    pixel_list.append([cum_doy, doy_sin, doy_cos] + features)

    print(image, count)

33UWP_122_2017-04-01.tif 1
33UWP_122_2017-06-20.tif 2
33UWP_122_2017-08-29.tif 3
33UWP_122_2017-09-08.tif 4
33UWP_122_2018-04-21.tif 5
33UWP_122_2018-05-06.tif 6
33UWP_122_2018-08-09.tif 7
33UWP_122_2018-08-29.tif 8
33UWP_122_2018-09-13.tif 9
33UWP_122_2018-09-18.tif 10
33UWP_122_2018-09-28.tif 11
33UWP_122_2018-10-13.tif 12
33UWP_122_2019-04-01.tif 13
33UWP_122_2019-04-16.tif 14
33UWP_122_2019-04-21.tif 15
33UWP_122_2019-06-30.tif 16
33UWP_122_2019-07-25.tif 17
33UWP_122_2020-04-05.tif 18
33UWP_122_2020-08-08.tif 19
33UWP_122_2020-09-12.tif 20
33UWP_79_2017-05-28.tif 21
33UWP_79_2017-08-01.tif 22
33UWP_79_2017-08-31.tif 23
33UWP_79_2017-09-30.tif 24
33UWP_79_2017-10-15.tif 25
33UWP_79_2018-04-08.tif 26
33UWP_79_2018-07-02.tif 27
33UWP_79_2018-08-21.tif 28
33UWP_79_2018-09-30.tif 29
33UWP_79_2018-10-05.tif 30
33UWP_79_2018-10-10.tif 31
33UWP_79_2018-10-30.tif 32
33UWP_79_2019-08-31.tif 33
33UWP_79_2019-09-15.tif 34
33UWP_79_2020-04-02.tif 35
33UWP_79_2020-04-07.tif 36
33UWP_79_2020-04-

# Create table from pixel band values

In [None]:
# Column layout of each row in pixel_list: three date features, then ten bands.
band_cols = ['B2_blue','B3_green','B4_red','B5_RE1','B6_RE2','B7_RE3',
             'B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2']
data_cols = ['doy','doy_sin','doy_cos'] + band_cols
pixel_df = pd.DataFrame(data=pixel_list, columns=data_cols)
# Store the band values as integers.
pixel_df[band_cols] = pixel_df[band_cols].astype(int)


pixel_df.head()

# Save the table as pixel_<id>.csv in the project folder (row index not written).
df_name = f'pixel_{pixel}.csv'
pixel_df.to_csv(os.path.join(data_dir, df_name), index=False)

In [None]:
# Read the saved pixel table back from disk.
check_path = os.path.join(data_dir, df_name)
check = pd.read_csv(check_path)
#check

In [None]:
##############################################################################
### Split data - train and test
# Hold out a fraction `test_size` of the rows; random_state fixes the shuffle.
test_size = 0.1
train, test = train_test_split(pixel_df, random_state=10, test_size=test_size)
for label, part in (("Train shape: ", train), ("Test shape: ", test)):
    print(label, part.shape)

Train shape:  (36, 13)
Test shape:  (5, 13)


In [None]:
##############################################################################
### Create new tables
# Row labels for the triplet tables: the same 12 features repeated three
# times, once per image of a triplet (36 labels in total).
features_per_image = ['doy_sin','doy_cos','B2_blue','B3_green','B4_red',
                      'B5_RE1','B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2',
                      'B11_SWI1','B12_SWI2']
_index = features_per_image * 3
# Empty tables; the triplet cells insert one column per triplet.
train_df, test_df = pd.DataFrame(), pd.DataFrame()

In [None]:
# Display the held-out rows produced by the train/test split.
test

Unnamed: 0,doy,doy_sin,doy_cos,B2_blue,B3_green,B4_red,B5_RE1,B6_RE2,B7_RE3,B8_NIR1,B8A_NIR2,B11_SWI1,B12_SWI2
30,648,-0.987349,0.158559,158,284,294,897,2055,2647,2462,3133,1507,667
12,821,0.999991,0.004304,310,461,642,1146,1636,1902,2062,2313,2261,1430
33,988,-0.963471,-0.267814,174,359,190,754,2753,3384,3486,3866,1708,750
2,241,-0.845249,-0.534373,130,307,175,685,2687,3478,3530,3710,1658,719
20,148,0.559589,-0.82877,204,569,257,1175,3906,4786,4727,5010,2232,1058


# Training triplets

In [None]:
# Build `num_of_t` unique training triplets. Each triplet is a randomly drawn
# anchor image plus two other images within `window` days of it, sorted by
# day. Every triplet becomes one 36-value column of train_df
# (12 features x 3 images); the triplet's days form the column name.
num_of_t = 79                     # number of unique triplets to collect
window = 30                       # max day distance between anchor and neighbours
max_attempts = 100000             # hard stop so the cell cannot spin forever
rng = np.random.RandomState(10)   # seeded so the drawn triplets are reproducible
num = 0
attempts = 0

# The original guard `len(train_df) < 1000` counted rows (always 36) and so
# never triggered; an explicit attempt counter bounds the loop instead.
while num < num_of_t and attempts < max_attempts:
    attempts += 1
    r1 = train.sample(random_state=rng)
    # r1 is a one-row frame: take the scalar explicitly (int() on a
    # one-element Series is deprecated in recent pandas).
    anchor_doy = int(r1['doy'].iloc[0])
    crit = train.loc[train['doy'].between(anchor_doy - window, anchor_doy + window) &
                     (train['doy'] != anchor_doy)]
    if len(crit) < 2:
        continue  # anchor has fewer than two neighbours; draw another one

    r2 = crit.sample(2, random_state=rng)
    stack = pd.concat([r1, r2]).sort_values('doy')
    # Plain ints keep the column name (and the CSV header) independent of how
    # NumPy prints its scalar types.
    doys = tuple(int(d) for d in stack['doy'])
    # Drop the 'doy' column and flatten row by row: image 1's 12 features,
    # then image 2's, then image 3's.
    new_col = pd.Series(stack.drop(columns='doy').to_numpy().ravel(), name=doys)
    # `if`, not `while ... else`: the old `else` branch ran on every pass and
    # printed a misleading message even after a successful insert.
    if new_col.name not in train_df.columns:
        train_df.insert(0, new_col.name, new_col, allow_duplicates=True)
        num += 1

if num < num_of_t:
    print('Stopped after {} attempts with only {} of {} unique triplets'.format(
        attempts, num, num_of_t))

All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations reached:  67
All possible combinations

In [None]:
#train_df

In [None]:
# Add the feature labels as the first column, named 'index'. With
# index=False below it becomes the first CSV column, and read_csv(index_col=0)
# turns it back into the row index.
# The guard makes the cell safe to re-run: insert() raises ValueError when
# the column already exists. The former `train_df.set_index(...)` call was
# dropped because its result was discarded, so it never changed train_df.
if 'index' not in train_df.columns:
    train_df.insert(0, 'index', _index)

# Save as csv
csv_name = 'train_pixel_{}.csv'.format(pixel)

train_df.to_csv(os.path.join(data_dir, csv_name), index=False)

##############################################################################
### Check csv
checker_train = pd.read_csv(os.path.join(data_dir, csv_name), index_col=0)

# Testing triplets

In [None]:
# Start from an empty table; the next cell inserts one column per test triplet.
test_df = pd.DataFrame()

In [None]:
# Build `num_of_t` unique testing triplets. The anchor image is drawn from
# the held-out `test` rows; its two neighbours (within `window` days) are
# drawn from the full table `pixel_df`.
# NOTE(review): neighbours can therefore be training rows. Confirm that this
# mixing is intended.
num_of_t = 25                     # number of unique triplets to collect
window = 30                       # max day distance between anchor and neighbours
max_attempts = 100000             # hard stop so the cell cannot spin forever
rng = np.random.RandomState(10)   # seeded so the drawn triplets are reproducible
num = 0
attempts = 0

# The original guard `len(test_df) < 1000` counted rows (always 36) and so
# never triggered; an explicit attempt counter bounds the loop instead.
while num < num_of_t and attempts < max_attempts:
    attempts += 1
    r1 = test.sample(random_state=rng)
    # r1 is a one-row frame: take the scalar explicitly (int() on a
    # one-element Series is deprecated in recent pandas).
    anchor_doy = int(r1['doy'].iloc[0])
    crit = pixel_df.loc[pixel_df['doy'].between(anchor_doy - window, anchor_doy + window) &
                        (pixel_df['doy'] != anchor_doy)]
    if len(crit) < 2:
        continue  # anchor has fewer than two neighbours; draw another one

    r2 = crit.sample(2, random_state=rng)
    stack = pd.concat([r1, r2]).sort_values('doy')
    # Plain ints keep the column name (and the CSV header) independent of how
    # NumPy prints its scalar types.
    doys = tuple(int(d) for d in stack['doy'])
    # Drop the 'doy' column and flatten row by row: image 1's 12 features,
    # then image 2's, then image 3's.
    new_col = pd.Series(stack.drop(columns='doy').to_numpy().ravel(), name=doys)
    # `if`, not `while ... else`: the old `else` branch ran on every pass and
    # printed a misleading message even after a successful insert.
    if new_col.name not in test_df.columns:
        test_df.insert(0, new_col.name, new_col, allow_duplicates=True)
        num += 1

if num < num_of_t:
    print('Stopped after {} attempts with only {} of {} unique triplets'.format(
        attempts, num, num_of_t))

All possible combinations reached:  1
All possible combinations reached:  2
All possible combinations reached:  3
All possible combinations reached:  3
All possible combinations reached:  4
All possible combinations reached:  5
All possible combinations reached:  6
All possible combinations reached:  6
All possible combinations reached:  6
All possible combinations reached:  6
All possible combinations reached:  6
All possible combinations reached:  7
All possible combinations reached:  7
All possible combinations reached:  8
All possible combinations reached:  9
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  10
All possible combinations reached:  11
A

In [None]:
# Display the test triplet table: one column per triplet, named by its days.
test_df

Unnamed: 0,"(636, 648, 668)","(621, 648, 668)","(636, 643, 648)","(643, 648, 668)","(626, 638, 648)","(621, 638, 648)","(636, 638, 648)","(638, 643, 648)","(626, 648, 651)","(643, 648, 651)",...,"(638, 648, 668)","(213, 241, 243)","(626, 636, 648)","(241, 243, 251)","(621, 643, 648)","(621, 636, 648)","(621, 648, 651)","(821, 836, 841)","(626, 643, 648)","(626, 648, 668)"
0,-0.99888,-0.953681,-0.99888,-0.997325,-0.976011,-0.953681,-0.99888,-0.999917,-0.976011,-0.997325,...,-0.999917,-0.501242,-0.976011,-0.845249,-0.953681,-0.953681,-0.953681,0.999991,-0.976011,-0.976011
1,-0.047321,-0.30082,-0.047321,0.073095,-0.217723,-0.30082,-0.047321,-0.01291,-0.217723,0.073095,...,-0.01291,-0.865307,-0.217723,-0.534373,-0.30082,-0.30082,-0.30082,0.004304,-0.217723,-0.217723
2,98.0,152.0,98.0,163.0,147.0,152.0,98.0,135.0,147.0,163.0,...,135.0,214.0,147.0,130.0,152.0,152.0,152.0,310.0,147.0,147.0
3,270.0,325.0,270.0,335.0,290.0,325.0,270.0,304.0,290.0,335.0,...,304.0,414.0,290.0,307.0,325.0,325.0,325.0,461.0,290.0,290.0
4,143.0,178.0,143.0,188.0,162.0,178.0,143.0,167.0,162.0,188.0,...,167.0,223.0,162.0,175.0,178.0,178.0,178.0,642.0,162.0,162.0
5,631.0,662.0,631.0,813.0,601.0,662.0,631.0,792.0,601.0,813.0,...,792.0,829.0,601.0,685.0,662.0,662.0,662.0,1146.0,601.0,601.0
6,2225.0,2437.0,2225.0,2380.0,2331.0,2437.0,2225.0,2508.0,2331.0,2380.0,...,2508.0,3103.0,2331.0,2687.0,2437.0,2437.0,2437.0,1636.0,2331.0,2331.0
7,2995.0,3350.0,2995.0,3059.0,3076.0,3350.0,2995.0,3172.0,3076.0,3059.0,...,3172.0,4046.0,3076.0,3478.0,3350.0,3350.0,3350.0,1902.0,3076.0,3076.0
8,3357.0,3274.0,3357.0,3216.0,3096.0,3274.0,3357.0,3182.0,3096.0,3216.0,...,3182.0,3994.0,3096.0,3530.0,3274.0,3274.0,3274.0,2062.0,3096.0,3096.0
9,3295.0,3571.0,3295.0,3300.0,3332.0,3571.0,3295.0,3460.0,3332.0,3300.0,...,3460.0,4481.0,3332.0,3710.0,3571.0,3571.0,3571.0,2313.0,3332.0,3332.0


In [None]:
# Add the feature labels as the first column, named 'index'. With
# index=False below it becomes the first CSV column, and read_csv(index_col=0)
# turns it back into the row index.
# The guard makes the cell safe to re-run: insert() raises ValueError when
# the column already exists. The former `test_df.set_index(...)` call was
# dropped because its result was discarded, so it never changed test_df.
if 'index' not in test_df.columns:
    test_df.insert(0, 'index', _index)

# Save as csv
csv_name = 'test_pixel_{}.csv'.format(pixel)

test_df.to_csv(os.path.join(data_dir, csv_name), index=False)

##############################################################################
### Check csv
checker_test = pd.read_csv(os.path.join(data_dir, csv_name), index_col=0)

In [None]:
# checker_train
# checker_test

# Prep data for autoencoder

In [None]:
# File names of the triplet tables for this pixel
csv_name = f'train_pixel_{pixel}.csv'
csv_name1 = f'test_pixel_{pixel}.csv'

# Load each table (first CSV column = row labels) and transpose it
train_data = pd.read_csv(os.path.join(data_dir, csv_name), index_col=0).T
test_data = pd.read_csv(os.path.join(data_dir, csv_name1), index_col=0).T

# Convert the training table to an array; `train` is rebound to that array
train_data_array = np.array(train_data)
print('data shape:', train_data_array.shape)
train = train_data_array

# Convert the testing table to an array; `test` is rebound to that array
test_data_array = np.array(test_data)
print('data shape:', test_data_array.shape)
test = test_data_array

# Column names of the transposed training table
bands = train_data.columns

data shape: (79, 36)
data shape: (25, 36)


In [None]:
# Report the shapes of the training and testing arrays
for label, arr in (('training shape', train), ('testing shape', test)):
    print(label, arr.shape)

training shape (79, 36)
testing shape (25, 36)


In [None]:
### Scaling
# Uses `pp` (sklearn.preprocessing) and `joblib` from the notebook's import cell.

# NOTE(review): `mf_ae` (path prefix for saved models) and `idi` (run id) are
# not defined in any visible cell, which is one cause of the NameError this
# cell raised. The fallbacks below are assumptions (project folder and pixel
# id) and apply only when the names are missing. Confirm the intended values
# and move them to the configuration cell.
if 'mf_ae' not in globals():
    mf_ae = data_dir
if 'idi' not in globals():
    idi = pixel

scaler_path = f'{mf_ae}scale_{idi}.mod'

# training: fit the scaler on the training data only, then save it
scaler = pp.StandardScaler()
train_scaled = np.array(scaler.fit_transform(train))
joblib.dump(scaler, scaler_path)

# testing: reload the saved scaler so the test data is transformed by the
# same file that later runs will load
scaler = joblib.load(scaler_path)
test_scaled = np.array(scaler.transform(test))

NameError: ignored

In [None]:
# NOTE(review): presumably autoencoder hyperparameters consumed by later
# cells that are not visible here; confirm how each value is used.
n_hidden_neurons = 16
optimizer = 'Adam'
batch_size = None