<a href="https://colab.research.google.com/github/scorning95/github-slideshow/blob/main/Prepare_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal:
Select 100 random pixels from each of the six land-cover classes.

Extract values for these 600 pixels from available images.

For each pixel, select 10 image triplets for training and 5 image triplets for testing (600 × 10 = 6000 training rows and 600 × 5 = 3000 testing rows).

## Connect to drive

In [2]:
import os

from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Project root; later paths are built from it by plain string concatenation,
# so the trailing slash matters.
# For a local run, point this at the local folder instead,
# e.g. 'C:/Users/student/shelby'.
directory = '/content/drive/MyDrive/ColabNotebooks/autoencoders/'
os.chdir(directory)

Mounted at /content/drive


In [3]:
!pip install rioxarray

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rioxarray
  Downloading rioxarray-0.9.1.tar.gz (47 kB)
[K     |████████████████████████████████| 47 kB 2.8 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting rasterio
  Downloading rasterio-1.2.10-cp37-cp37m-manylinux1_x86_64.whl (19.3 MB)
[K     |████████████████████████████████| 19.3 MB 6.8 MB/s 
Collecting pyproj>=2.2
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 53.7 MB/s 
Collecting affine
  Downloading affine-2.3.1-py2.py3-none-any.whl (16 kB)
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-

## Connect to libraries

In [4]:
##############################################################################
### Packages and fun
import matplotlib.pyplot as plt
import numpy as np
import rioxarray as rxr
import xarray as xr
import rasterio
import os
import pandas as pd
from random import random
from random import seed
from sklearn.model_selection import train_test_split
import math
import random
from rasterio.plot import show
import rasterio.plot

## Define pathways

In [5]:
##############################################################################
### Define directories
# Paths are built by plain concatenation, so `directory` must end with '/'.
data_dir = f'{directory}data/'
cf_dir = f'{data_dir}S2_BPWW_UTM/cloudfree/'  # folder scanned for the .tif images

### Create/define folder
# Output folder for the per-image CSV tables. exist_ok=True makes the cell safe
# to re-run and removes the check-then-create race of os.path.exists + os.mkdir;
# makedirs also creates data_dir itself if it is missing.
tables_dir = f'{data_dir}tables/'
os.makedirs(tables_dir, exist_ok=True)

## Date functions

In [6]:
##############################################################################
# functions to get date and doy from files
def filedate(x):
    """Return the 10 characters that precede a 4-character extension in ``x``.

    Example: ``'33UWP_122_2017-04-01.tif'`` -> ``'2017-04-01'``.
    """
    date_start, ext_start = -14, -4
    return x[date_start:ext_start]

def is_leap_year(year):
    """Return True when ``year`` is a leap year (divisible by 4, centuries only if divisible by 400)."""
    divisible_by_4 = year % 4 == 0
    is_century = year % 100 == 0
    # Century years are leap only when they are also divisible by 400.
    return year % 400 == 0 if is_century else divisible_by_4

# Date to DOY; uses is_leap_year to choose the leap-year correction factor
def get_doy(Y,M,D):
    """Return the day of year for the calendar date Y-M-D.

    Formula from Astronomical Algorithms, Jean Meeus, 2d ed, 1998, chap 7.
    """
    # K is 1 in leap years and 2 otherwise.
    K = 1 if is_leap_year(Y) else 2
    month_term = int((275 * M) / 9.0)
    correction = K * int((M + 9) / 12.0)
    return month_term - correction + D - 30

def cumulative_doy(image, begin_y=2017):
    """Return the day count of an image's date, counted from 1 January of ``begin_y``.

    Parameters
    ----------
    image : str
        File name whose last 14 characters are ``YYYY-MM-DD`` followed by a
        4-character extension (parsed with ``filedate`` and ``splitter``).
    begin_y : int, optional
        First year of the series. Dates in this year keep their plain day of
        year. Defaults to 2017.

    Returns
    -------
    tuple
        ``(cumulative_day, year)``. For a year earlier than ``begin_y`` no
        offset is added, so the plain day of year is returned.
    """
    Y, M, D = splitter(filedate(image))
    doy = get_doy(Y, M, D)
    # Add the length of every full year from begin_y up to (not including) Y.
    # is_leap_year replaces a hard-coded list of leap years (1956-2048), so the
    # offset stays correct outside that range and agrees with get_doy.
    offset = sum(366 if is_leap_year(year) else 365 for year in range(begin_y, Y))
    return doy + offset, Y

def cyclical_doy(doy, Y):
    """Encode ``doy`` as a point on the unit circle.

    Returns ``(sin, cos)`` of ``2 * pi * doy / L``, where ``L`` is 366 when
    ``Y`` is a leap year and 365 otherwise.
    """
    year_length = 366. if is_leap_year(Y) else 365.
    angle = 2 * np.pi * doy/year_length
    return np.sin(angle), np.cos(angle)

def splitter(date):
    """Split a ``'YYYY-MM-DD'`` string into integer year, month and day."""
    parts = date.split('-')
    # Only the first three dash-separated fields are used.
    y, m, d = (int(parts[position]) for position in range(3))
    return y, m, d

def one_step(image):
  """Return (year, day of year, doy sine, doy cosine) parsed from an image file name."""
  y, m, d = splitter(filedate(image))
  dofy = get_doy(y, m, d)
  # cyclical_doy yields the (sin, cos) pair, unpacked into the result tuple
  return (y, dofy, *cyclical_doy(dofy, y))

## Random pixels

Land-cover class codes (column `LC`):

1 = farmland

2 = grassland

3 = broadleaf

4 = needleleaf

5 = builtup

6 = waterbody

In [7]:
# Load the CSV that links each pixel location to its landcover class
pixel_lc_path = os.path.join(data_dir, '_pixel_LC.csv')
pix_df = pd.read_csv(pixel_lc_path)

In [None]:
# Get 100 random samples from each lc class
# Step 1: split the pixel table by landcover code (1 to 6).
class1, class2, class3, class4, class5, class6 = (
    pix_df.loc[pix_df['LC'] == lc_code] for lc_code in range(1, 7)
)
# Step 2: draw 100 rows from every class, always with the same fixed seed.
lc_df1, lc_df2, lc_df3, lc_df4, lc_df5, lc_df6 = (
    lc_class.sample(n=100, random_state=10)
    for lc_class in (class1, class2, class3, class4, class5, class6)
)


In [None]:
# Stack the six per-class samples row-wise into one table (class 1 first, class 6 last)
class_samples = [lc_df1, lc_df2, lc_df3, lc_df4, lc_df5, lc_df6]
data = pd.concat(class_samples, axis=0)

Unnamed: 0,pixel,LC,x,y
1661188,4738520,1.0,586595.0,5345255.0
7807242,13881631,1.0,596845.0,5323885.0
3068976,6811956,1.0,572655.0,5340405.0
850259,3561655,1.0,582445.0,5348005.0
3792632,7966518,1.0,567675.0,5337705.0
...,...,...,...,...
3954712,8207617,6.0,582985.0,5337145.0
714189,3306755,6.0,600245.0,5348605.0
7111885,12969574,6.0,588415.0,5326015.0
3960509,8216196,6.0,583215.0,5337125.0


## Create CSV for each image

In [None]:
def create_csv(image):
  """Write a CSV with the band values of every sampled pixel for one image.

  Reads the module-level ``data`` table (one row per pixel, read positionally
  as pixel, LC, x, y), opens ``image`` from ``cf_dir``, looks up each pixel by
  its exact x/y coordinates and writes the result to ``tables_dir`` under the
  image name with '.tif' replaced by '.csv'. The CSV name is printed when done.

  Parameters
  ----------
  image : str
      File name of a raster inside ``cf_dir``.
  """
  # Defined once, outside the loop, so an empty `data` still gives a
  # header-only CSV instead of a NameError.
  columns = ['pixel','lc','2','3','4','5','6','7','8','8a','11','12']
  lister = []

  src = rxr.open_rasterio(os.path.join(cf_dir, image))
  try:
    for _, pix_row in data.iterrows():  # for all pixels in the data dataframe
      # .iloc keeps the access positional; plain pix_row[0] on a labelled
      # Series is deprecated in recent pandas.
      pix, lc_type = int(pix_row.iloc[0]), int(pix_row.iloc[1])
      easting, northing = pix_row.iloc[2], pix_row.iloc[3]

      # Exact coordinate match: no nearest-neighbour lookup is requested.
      pix_array = src.sel(y=northing, x=easting)
      band_data = list(pix_array.values)

      # keep bands 2,3,4,5,6,7,8,8A,11,12 -> positions 0-7, 9 and 10
      # (position 8 is dropped; assumes the raster stores the bands in that
      # order - TODO confirm against the image metadata)
      lister.append([pix, lc_type] + band_data[0:8] + [band_data[9], band_data[10]])
  finally:
    # Release the raster file handle even when a lookup fails.
    src.close()

  image_df = pd.DataFrame(lister, columns=columns)

  csv_name = image.replace('.tif', '.csv')
  image_df.to_csv(os.path.join(tables_dir, csv_name), index=False)
  print(csv_name)

In [None]:
# Export one CSV per .tif image found in the cloud-free folder
for image in os.listdir(cf_dir):
  if not image.endswith('.tif'):
    continue  # skip anything that is not a .tif raster
  create_csv(image)

33UWP_122_2017-04-01.csv
33UWP_122_2017-06-20.csv
33UWP_122_2017-08-29.csv
33UWP_122_2017-09-08.csv
33UWP_122_2018-04-21.csv
33UWP_122_2018-05-06.csv
33UWP_122_2018-08-09.csv
33UWP_122_2018-08-29.csv
33UWP_122_2018-09-13.csv
33UWP_122_2018-09-18.csv
33UWP_122_2018-09-28.csv
33UWP_122_2018-10-13.csv
33UWP_122_2019-04-01.csv
33UWP_122_2019-04-16.csv
33UWP_122_2019-04-21.csv
33UWP_122_2019-06-30.csv
33UWP_122_2019-07-25.csv
33UWP_122_2020-04-05.csv
33UWP_122_2020-08-08.csv
33UWP_122_2020-09-12.csv
33UWP_79_2017-05-28.csv
33UWP_79_2017-08-01.csv
33UWP_79_2017-08-31.csv
33UWP_79_2017-09-30.csv
33UWP_79_2017-10-15.csv
33UWP_79_2018-04-08.csv
33UWP_79_2018-07-02.csv
33UWP_79_2018-08-21.csv
33UWP_79_2018-09-30.csv
33UWP_79_2018-10-05.csv
33UWP_79_2018-10-10.csv
33UWP_79_2018-10-30.csv
33UWP_79_2019-08-31.csv
33UWP_79_2019-09-15.csv
33UWP_79_2020-04-02.csv
33UWP_79_2020-04-07.csv
33UWP_79_2020-04-12.csv
33UWP_79_2020-04-22.csv
33UWP_79_2020-07-31.csv
33UWP_79_2020-09-09.csv
33UWP_79_2020-10-04.

## Create triplets (train, test)

In [None]:
##############################################################################
# initial list of image doy values (cumulative)
# os.listdir returns entries in arbitrary order. The listing is sorted so that
# doy_list is always built in the same order; otherwise the seeded train/test
# split of doy_list could differ between machines or runs.
doy_list = []
doy_image_dict = {}  # cumulative doy -> image file name
for image in sorted(os.listdir(cf_dir)):
    if image.endswith('.tif'):
        doy, year = cumulative_doy(image)
        doy_list.append(doy)
        # NOTE: two images with the same date would share a key here and the
        # later one would replace the earlier one.
        doy_image_dict[doy] = image

In [None]:
##############################################################################
# split into train-test sets
test_size = 0.1  # fraction of the images held out as test set
random_state = 10  # fixed seed so the held-out images stay the same between runs
split_options = {'test_size': test_size, 'random_state': random_state}
train, test = train_test_split(doy_list, **split_options)

In [None]:
##############################################################################
# check sizes and totals
# recorded run: train 36, test 5
for label, subset in (('train: ', train), ('test: ', test)):
    print(label, len(subset))
print(test) # 648, 821, 988, 241, 148

train:  36
test:  5
[648, 821, 988, 241, 148]


In [None]:
##############################################################################
# create a list of random triplets using only training images
N_TRAIN_TRIPLETS = 79      # number of distinct triplets to collect
TRAIN_WINDOW_DAYS = 30     # neighbours must lie within +/- this many days of the anchor
MAX_TRAIN_DRAWS = 100000   # hard stop so an unreachable target cannot loop forever

# Dedicated seeded generator: re-running the cell reproduces the same triplets
# and the global `random` state is left untouched.
train_rng = random.Random(10)

# For every training date, the other training dates inside its window.
train_neighbours = {
    anchor: [x for x in train
             if anchor - TRAIN_WINDOW_DAYS <= x <= anchor + TRAIN_WINDOW_DAYS
             and x != anchor]
    for anchor in train
}
# Only dates with at least two neighbours can anchor a triplet.
train_anchors = [anchor for anchor in train if len(train_neighbours[anchor]) >= 2]
if not train_anchors:
    raise RuntimeError('no training image has two other training images '
                       'within the date window')

train_stack = []       # accepted triplets, in the order they were drawn
train_seen = set()     # same triplets, for fast duplicate checks
train_draws = 0
while len(train_stack) < N_TRAIN_TRIPLETS:
    train_draws += 1
    if train_draws > MAX_TRAIN_DRAWS:
        raise RuntimeError('could not collect enough distinct training '
                           'triplets; lower N_TRAIN_TRIPLETS or widen the window')
    r1 = train_rng.choice(train_anchors)
    r2 = train_rng.sample(train_neighbours[r1], k=2)
    triplet = tuple(sorted([r1, r2[0], r2[1]]))  # ascending order, hashable
    if triplet not in train_seen:  # keep each triplet only once
        train_seen.add(triplet)
        train_stack.append(triplet)

In [None]:
##############################################################################
# create a list of random triplets anchored on test images
# The anchor date always comes from `test`; its two neighbours are drawn from
# ALL images (doy_list), so a test triplet can contain training images.
# NOTE(review): presumably intended because the few test dates rarely fall
# within 30 days of each other - confirm this overlap with training is wanted.
N_TEST_TRIPLETS = 25      # number of distinct triplets to collect
TEST_WINDOW_DAYS = 30     # neighbours must lie within +/- this many days of the anchor
MAX_TEST_DRAWS = 100000   # hard stop so an unreachable target cannot loop forever

# Dedicated seeded generator: re-running the cell reproduces the same triplets
# and the global `random` state is left untouched.
test_rng = random.Random(10)

# For every test date, the other image dates inside its window.
test_neighbours = {
    anchor: [x for x in doy_list
             if anchor - TEST_WINDOW_DAYS <= x <= anchor + TEST_WINDOW_DAYS
             and x != anchor]
    for anchor in test
}
# Only dates with at least two neighbours can anchor a triplet.
test_anchors = [anchor for anchor in test if len(test_neighbours[anchor]) >= 2]
if not test_anchors:
    raise RuntimeError('no test image has two other images within the date window')

test_stack = []       # accepted triplets, in the order they were drawn
test_seen = set()     # same triplets, for fast duplicate checks
trial = 0             # number of accepted triplets
test_draws = 0
while len(test_stack) < N_TEST_TRIPLETS:
    test_draws += 1
    if test_draws > MAX_TEST_DRAWS:
        raise RuntimeError('could not collect enough distinct test triplets; '
                           'lower N_TEST_TRIPLETS or widen the window')
    r1 = test_rng.choice(test_anchors)
    r2 = test_rng.sample(test_neighbours[r1], k=2)
    triplet = tuple(sorted([r1, r2[0], r2[1]]))  # ascending order, hashable
    if triplet not in test_seen:  # keep each triplet only once
        test_seen.add(triplet)
        test_stack.append(triplet)
        trial += 1

## Select triplets for each pixel

In [None]:
# Seeded generator so each pixel receives the same triplets on every run,
# without touching the global `random` state.
assign_rng = random.Random(10)

train_dic = {}  # pixel id -> 10 training triplets
test_dic = {}   # pixel id -> 5 testing triplets
for _, pix_row in data.iterrows(): # for all pixels in the data dataframe
  # First column is read as the pixel id; .iloc keeps the access positional
  # (plain [0] on a labelled Series is deprecated in recent pandas).
  pixel = int(pix_row.iloc[0])

  train_dic[pixel] = assign_rng.sample(train_stack, k=10)
  test_dic[pixel] = assign_rng.sample(test_stack, k=5)

## Create training table with variables

In [None]:
from pandas.io.parsers.readers import read_csv

# One row per (pixel, triplet): pixel id, the triplet of dates, then for each of
# the three dates the doy sine/cosine followed by the ten band values.
cols = ['pixel','dates','doy_sin','doy_cos','B2_blue','B3_green','B4_red',
        'B5_RE1','B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2',
        'doy_sin','doy_cos','B2_blue','B3_green','B4_red','B5_RE1',
        'B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2',
        'doy_sin','doy_cos','B2_blue','B3_green','B4_red','B5_RE1',
        'B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2']

# Each image is loaded once and reused for every pixel and triplet, instead of
# re-reading its CSV for every (pixel, triplet, date) combination.
# key: cumulative doy -> (doy_sin, doy_cos, band table indexed by pixel)
train_image_cache = {}
training_rows = []

for pixel, keys in train_dic.items():  #look at dictionary
  for triplet in keys:  # iterate through each triplet in a pixel
    my_list = [pixel, triplet]
    for date in triplet:
      if date not in train_image_cache:
        image = doy_image_dict[date]  # get image
        # The cyclical encoding needs the within-year day of year. Feeding the
        # cumulative doy into cyclical_doy shifts the phase once a leap year
        # follows whole non-leap years (e.g. 2020 counted from 2017).
        year, dofy, doy_sin, doy_cos = one_step(image)

        csv_name = image.replace('.tif','.csv')  # get csv name
        csv = pd.read_csv(os.path.join(tables_dir,csv_name)).set_index('pixel')
        train_image_cache[date] = (doy_sin, doy_cos, csv)

      doy_sin, doy_cos, csv = train_image_cache[date]
      my_list.append(doy_sin)
      my_list.append(doy_cos)
      # .iloc[1:] skips the first remaining column ('lc'), keeping the bands
      my_list.extend(csv.loc[pixel].iloc[1:])
    training_rows.append(my_list)

# Build the frame in one go rather than growing it one .loc row at a time.
training_table = pd.DataFrame(training_rows, columns=cols)

In [None]:
# Write the training table into the data folder, without the row index
name = 'training_table.csv'
training_path = os.path.join(data_dir, name)
training_table.to_csv(training_path, index=False)

## Create testing table with variables


In [None]:
from pandas.io.parsers.readers import read_csv

# One row per (pixel, triplet): pixel id, the triplet of dates, then for each of
# the three dates the doy sine/cosine followed by the ten band values.
cols = ['pixel','dates','doy_sin','doy_cos','B2_blue','B3_green','B4_red',
        'B5_RE1','B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2',
        'doy_sin','doy_cos','B2_blue','B3_green','B4_red','B5_RE1',
        'B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2',
        'doy_sin','doy_cos','B2_blue','B3_green','B4_red','B5_RE1',
        'B6_RE2','B7_RE3','B8_NIR1','B8A_NIR2','B11_SWI1','B12_SWI2']

# Each image is loaded once and reused for every pixel and triplet, instead of
# re-reading its CSV for every (pixel, triplet, date) combination.
# key: cumulative doy -> (doy_sin, doy_cos, band table indexed by pixel)
test_image_cache = {}
testing_rows = []

for pixel, keys in test_dic.items():  #look at dictionary
  for triplet in keys:  # iterate through each triplet in a pixel
    my_list = [pixel, triplet]
    for date in triplet:
      if date not in test_image_cache:
        image = doy_image_dict[date]  # get image
        # The cyclical encoding needs the within-year day of year. Feeding the
        # cumulative doy into cyclical_doy shifts the phase once a leap year
        # follows whole non-leap years (e.g. 2020 counted from 2017).
        year, dofy, doy_sin, doy_cos = one_step(image)

        csv_name = image.replace('.tif','.csv')  # get csv name
        csv = pd.read_csv(os.path.join(tables_dir,csv_name)).set_index('pixel')
        test_image_cache[date] = (doy_sin, doy_cos, csv)

      doy_sin, doy_cos, csv = test_image_cache[date]
      my_list.append(doy_sin)
      my_list.append(doy_cos)
      # .iloc[1:] skips the first remaining column ('lc'), keeping the bands
      my_list.extend(csv.loc[pixel].iloc[1:])
    testing_rows.append(my_list)

# Build the frame in one go rather than growing it one .loc row at a time.
testing_table = pd.DataFrame(testing_rows, columns=cols)

In [None]:
# Write the testing table into the data folder, without the row index
name = 'testing_table.csv'
testing_path = os.path.join(data_dir, name)
testing_table.to_csv(testing_path, index=False)

## end of code