In [13]:
# This script is used for testing all adaptations and changes made
## comparison should be 1 and 2 (muestreado1 == 1 & muestreado2 == 1)
## potentially need to change data from wide format (Metadatatable) to long format (initial diagnostics table)
## less changes for this script would be needed this way

In [2]:
import pandas as pd
import numpy as np

from scipy import linalg
from scipy import stats

In [3]:
# Load data for change detection as a pandas data frame.
infys_df =  pd.read_csv("../data/iMAD_Data_12_F.csv") #need to change the dataset 
infys_df.head()

Unnamed: 0.1,Unnamed: 0,File,Cluster_ID,AvgDbh,AvgCrownDiameter,AvgCrownHeight,AvgCrownArea,SpeciesCount,AvgTreeHeight,TreeCount,J,X,Y
0,1,1,77652,16.103571,3.937736,3.367925,15.409993,8.0,8.555357,56,0.430012,-92.433389,15.588935
1,2,1,28451,10.380952,2.209524,,3.95692,2.0,2.909524,21,0.276195,-99.546778,26.072916
2,3,1,51197,12.280889,3.19901,5.3605,9.575256,32.0,9.168889,225,0.769687,-88.688611,20.654722
3,4,1,53152,12.208163,3.172527,3.315385,8.666242,7.0,5.419388,98,0.649813,-102.826388,20.825555
4,5,1,55242,14.996429,4.439683,4.557937,17.199388,5.0,5.458333,84,0.52917,-104.218713,20.403546


In [4]:
# Create new ids by concatenating 'Conglomerado' and 'Sitio'.
    ## don't think I need to do that
    ## Use existing ID of Metadataset called Cluster_ID -> should be able to leave that one out


# As we want to compare cycle 1 vs cycle 2 lets separate them.   
## Stays the same for long dataformat
## might figure out how to 
infys_df_c1 = infys_df[infys_df['File'] == 1]
infys_df_c2 = infys_df[infys_df['File'] == 2]

In [5]:
# Find intersection of new ID variable: 'cng_sit', so obs. that are present in both cycles.
s1 = infys_df_c1['Cluster_ID']
s2 = infys_df_c2['Cluster_ID']
cluster_id_inter = pd.Series(list(set(s1).intersection(set(s2))))
## might have already done this in R by using my filter(muestreado1 == 1 & muestreado2 == 1)



In [6]:
# Filter out obs. that dont appear in cng_sit_inter.
infys_df_c1 = infys_df_c1[infys_df_c1['Cluster_ID'].isin(cluster_id_inter)]
infys_df_c2 = infys_df_c2[infys_df_c2['Cluster_ID'].isin(cluster_id_inter)]

In [7]:
# Select only variables to be utilized in change detection.
## probably lose H and add tree counts and species counts
vars = ["SpeciesCount", "TreeCount", "J", "AvgTreeHeight", "AvgDbh", "AvgCrownDiameter", "AvgCrownHeight", "AvgCrownArea"]
nvars = len(vars)
infys_df_c1_c = infys_df_c1[infys_df_c1.columns.intersection(vars)]
infys_df_c2_c = infys_df_c2[infys_df_c2.columns.intersection(vars)]

c1_c_nans = infys_df_c1_c.isna().any(axis=1).to_numpy()
c2_c_nans = infys_df_c2_c.isna().any(axis=1).to_numpy()

In [8]:
# Create change data matrix, missing values mask and initial change weights.
## think this can stay the same
dm = np.zeros((2 * nvars, infys_df_c1_c.shape[0]))
dm[0:nvars] = np.transpose(infys_df_c1_c.to_numpy())
dm[nvars:] = np.transpose(infys_df_c2_c.to_numpy())

nodataidx = c1_c_nans | c2_c_nans
gooddataidx = nodataidx == False
dm = dm[:, gooddataidx]
ngood = np.sum(gooddataidx)

In [9]:
# Change detection iterations.
## also think this can stay -> can't: have to manage NAs for empty plots without throwing them out completely
# iteration of MAD   
wt = np.ones(int(ngood))
delta = 1.0
oldrho = np.zeros(nvars)
iter = 0
 
while (delta > 0.01) and (iter < 37): 
    print(iter)
    # Weighted covariance matrices and means.
    sumw = np.sum(wt)
    means = np.average(dm,axis=1, weights=wt)
    dmc = dm - means[:,np.newaxis]
    dmc = np.multiply(dmc,np.sqrt(wt))
    sigma = np.dot(dmc,dmc.T)/sumw
   
    s11 = sigma[0:nvars, 0:nvars]
    s22 = sigma[nvars:, nvars:]
    s12 = sigma[0:nvars, nvars:]
    s21 = sigma[nvars:, 0:nvars]
    
    # Solution of generalized eigenproblems.
    aux_1 = linalg.solve(s22, s21)
    lama, a = linalg.eig(np.dot(s12, aux_1), s11)
    aux_2 = linalg.solve(s11, s12)
    lamb, b = linalg.eig(np.dot(s21, aux_2), s22)
    
    # Sort a.  
    idx = np.argsort(lama)
    a = a[:, idx]
    
    # Sort b.        
    idx = np.argsort(lamb)
    b = b[:, idx]
    
    # Canonical correlations.        
    rho = np.sqrt(np.real(lamb[idx]))
    
    # Normalize dispersions.  
    tmp1 = np.dot(np.dot(a.T,s11), a)
    tmp2 = 1. / np.sqrt(np.diag(tmp1))
    tmp3 = np.tile(tmp2, (nvars, 1))
    a = np.multiply(a, tmp3)
    b = np.mat(b)
    tmp1 = np.dot(np.dot(b.T,s22), b)
    tmp2 = 1. / np.sqrt(np.diag(tmp1))
    tmp3 = np.tile(tmp2, (nvars, 1))
    b = np.multiply(b, tmp3)
        
    # Assure positive correlation
    tmp = np.diag(np.dot(np.dot(a.T,s12), b))
    b = np.dot(b,np.diag(tmp / np.abs(tmp)))

    # Canonical and MAD variates
    U = np.dot(a.T , (dm[0:nvars, :] - means[0:nvars, np.newaxis]))    
    V = np.dot(b.T , (dm[nvars:, :] - means[nvars:, np.newaxis]))          
    MAD = U - V  
        
    # New weights.        
    var_mad = np.tile(np.mat(2 * (1 - rho)).T, (1, ngood))    
    chisqr = np.sum(np.multiply(MAD, MAD) / var_mad, 0)
    wt = np.squeeze(1 - np.array(stats.chi2._cdf(chisqr, nvars)))
        
    # Continue iteration.        
    delta = np.sum(np.abs(rho - oldrho))
    oldrho = rho
    iter += 1
    
# reshape to original image size, by including nodata pixels    
MADout = np.zeros((int(nvars), infys_df_c1_c.shape[0]))
MADout[0:nvars, gooddataidx] = MAD
#MADout[nvars:(nvars + 1), gooddataidx] = chisqr

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36


In [10]:
# TODO add coordinates to table
# Pandas copied R maybe read a bit on python data frame manipulation
## should also already work - especially with the single source coordinates from the metadata
MADout = np.transpose(MADout).copy()
MADout_df = pd.DataFrame(MADout, columns=vars)

In [11]:
MADout_df["X"] = infys_df_c1["X"]
MADout_df["Y"] = infys_df_c1["Y"]

In [12]:
MADout_df.to_csv('iMAD_results_12_F.csv', index=False)