In [1]:
import pandas as pd
import numpy as np

### Area and accuracy estimation for sampling with inclusion probabilities weighted by pixel area, for sampling with replacement, from Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"

In [2]:
#Both input tables are tab-delimited text files.
#Sample table: final sample interpretation, one row per sample pixel.
#Reference and Map values == 1 for a sample represent the forest loss due to fire class.
#Map values correspond to the final map adjusted to match the sample-based estimate.
sample_table_path = 'Sample_data.txt'
#Strata table: strata info, including strata sizes and the sample size in each stratum.
strata_table_path = 'Strata_info.txt'

data = pd.read_csv(sample_table_path, sep = '\t')
strata = pd.read_csv(strata_table_path, sep = '\t')

In [3]:
#Preview the first rows of the sample interpretation table
data.head()

Unnamed: 0,ID,Region,Stratum,Reference,Pixarea,Map
0,1,SEA-AUS,10,1,0.000638,1
1,2,EUR,2,0,0.00043,0
2,3,AFR,1,0,0.000749,0
3,4,LAM,3,0,0.000733,0
4,5,LAM,8,1,0.000744,1


In [4]:
#Preview the first rows of the strata info table
strata.head()

Unnamed: 0,Region,Stratum,Area_km2,Count,Sample_size
0,AFR,1,595255.0128,788889906,134
1,EUR,2,332992.9026,781490883,100
2,LAM,3,946369.3351,1280971777,213
3,NAM,4,486272.5356,859622001,109
4,SEA-AUS,5,669855.4746,909836775,150


In [5]:
#Attach stratum area and stratum sample size to every sample row by joining the
#sample table (data) with the strata info table (strata) on the common column "Stratum".
#The join key is spelled out explicitly, and validate='many_to_one' makes the merge fail
#loudly if a stratum ID occurs more than once in the strata table (which would otherwise
#silently duplicate sample rows and bias every estimate below).
data = data.merge(strata[['Stratum', 'Area_km2', 'Sample_size']], on = 'Stratum', validate = 'many_to_one')
#Rename to the notation used by the estimator functions:
#Ah = stratum area (km2), nh = number of sample pixels in the stratum, au = sample pixel area
data = data.rename(columns = {'Area_km2':'Ah', 'Sample_size':'nh', 'Pixarea':'au'})

In [6]:
#Preview the merged sample table, which now carries the renamed "au", "Ah" and "nh" columns
data.head()

Unnamed: 0,ID,Region,Stratum,Reference,au,Map,Ah,nh
0,1,SEA-AUS,10,1,0.000638,1,161635.388,100
1,8,SEA-AUS,10,1,0.000626,1,161635.388,100
2,10,SEA-AUS,10,1,0.000769,1,161635.388,100
3,26,SEA-AUS,10,1,0.00067,1,161635.388,100
4,30,SEA-AUS,10,1,0.000649,1,161635.388,100


In [7]:
def estimate_area(df: pd.DataFrame) -> float:
    """
    Estimate the area of the target class from sample reference values.

    Input dataframe has one row per sample pixel and the following columns:
    "Stratum" (strata IDs 1 - nstrata)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "Ah" (stratum area, km2)
    "nh" (number of sample pixels in stratum h)

    Returns the estimated class area in km2.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 1 and 6
    """
    # Per-sample contribution: Ah when the pixel is of the target class in the reference, 0 otherwise
    is_target = df['Reference'].astype(bool)
    per_sample = pd.DataFrame({
        'Stratum': df['Stratum'],
        'Ah': df['Ah'],
        'nh': df['nh'],
        'ForArea': is_target * df['Ah'],
    })

    grouped = per_sample.groupby('Stratum')

    # Equation 6: target class area within each stratum.
    # nh is constant within a stratum, so the median simply recovers that value.
    stratum_area = grouped['ForArea'].sum() / grouped['nh'].median()

    # Equation 1: total over all strata
    return stratum_area.sum()

In [8]:
#Global area estimate (km2), computed from all sample pixels at once
estimate_area(data)

1227725.1907783307

In [9]:
#Area estimates (km2) by region: the estimator is applied separately to each "Region" group
data.groupby(by = ["Region"]).apply(estimate_area).reset_index(name='area')

Unnamed: 0,Region,area
0,AFR,16854.619667
1,EUR,572411.271631
2,LAM,128598.22941
3,NAM,385702.259811
4,SEA-AUS,124158.81026


In [24]:
def estimate_area_SE(df: pd.DataFrame) -> float:
    """
    Estimate the Standard Error of the target class area estimated from sample reference values.

    Input dataframe has one row per sample pixel and the following columns:
    "Stratum" (strata IDs 1 - nstrata)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "Ah" (stratum area, km2)
    "nh" (number of sample pixels in stratum h)

    Returns the estimated SE of the target class area in km2.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 4 and 7
    """
    # Per-sample value: Ah when the pixel is of the class being estimated, 0 otherwise
    per_sample = pd.DataFrame({
        'Stratum': df['Stratum'],
        'nh': df['nh'],
        'ForVar': df['Reference'].astype(bool) * df['Ah'],
    })

    grouped = per_sample.groupby('Stratum')

    # Equation 7: sample variance of the per-sample values within each stratum, divided by nh
    stratum_variance = grouped['ForVar'].var() / grouped['nh'].median()

    # Equation 4: strata variances add up; the SE is the square root of their sum
    return np.sqrt(stratum_variance.sum())

In [11]:
#Global estimate of the Standard Error of the area (km2)
estimate_area_SE(data)

31527.132247399582

In [12]:
#Standard Error of the area (km2) estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_area_SE).reset_index(name='Area SE')

Unnamed: 0,Region,Area SE
0,AFR,6300.36232
1,EUR,21875.332734
2,LAM,14285.62742
3,NAM,10635.113566
4,SEA-AUS,12591.684909


In [22]:
def estimate_OA (df: pd.DataFrame) -> float:
    """
    Estimate the Overall Accuracy of the map.

    Input dataframe has one row per sample pixel and the following columns:
    "Stratum" (strata IDs 1 - nstrata)
    "Map" (1 if the sample pixel was mapped as target class, and 0 otherwise)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "Ah" (stratum area, km2)
    "nh" (number of sample pixels in stratum h)

    Returns the estimated Overall Accuracy expressed as a proportion (0-1) of the total study area.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 1 and 6, modified for Overall Accuracy computation:
    Y^h is defined as the area of correctly classified pixels within a stratum
    """
    # Per-sample value: Ah when map and reference labels agree, 0 when they disagree
    labels_agree = df['Reference'] == df['Map']
    per_sample = df[['Stratum', 'Ah', 'nh']].assign(Correct = labels_agree * df['Ah'])

    grouped = per_sample.groupby('Stratum')

    # Equation 6 per stratum, then Equation 1 summed across strata: correctly classified area
    correct_area = (grouped['Correct'].sum() / grouped['nh'].median()).sum()

    # Total study area = sum of the stratum areas (Ah is constant within a stratum)
    study_area = grouped['Ah'].median().sum()

    return correct_area / study_area

In [23]:
#Global estimate of Overall Accuracy (proportion, 0-1)
estimate_OA(data)

0.9443361626185658

In [15]:
#Overall Accuracy (proportion, 0-1) estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_OA).reset_index(name='Overall Accuracy')

Unnamed: 0,Region,Overall Accuracy
0,AFR,0.977111
1,EUR,0.933049
2,LAM,0.933761
3,NAM,0.967779
4,SEA-AUS,0.922604


In [25]:
def estimate_OA_SE(df: pd.DataFrame) -> float:
    """
    Estimate the Standard Error of the map Overall Accuracy.

    Input dataframe has one row per sample pixel and the following columns:
    "Stratum" (strata IDs 1 - nstrata)
    "Map" (1 if the sample pixel was mapped as target class, and 0 otherwise)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "Ah" (stratum area, km2)
    "nh" (number of sample pixels in stratum h)

    Returns the estimated SE of the Overall Accuracy expressed as a proportion (0-1) of the total study area.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 4 and 7, modified for Overall Accuracy computation:
    u = Ah if the sample pixel is classified correctly and u = 0 if the sample pixel is not classified correctly.
    """
    # Per-sample value: Ah when map and reference labels agree, 0 when they disagree
    labels_agree = df['Reference'] == df['Map']
    per_sample = df[['Stratum', 'Ah', 'nh']].assign(Correct = labels_agree * df['Ah'])

    grouped = per_sample.groupby('Stratum')

    # Equation 7 per stratum, then Equation 4 summed across strata
    variance_sum = (grouped['Correct'].var() / grouped['nh'].median()).sum()

    # Total study area = sum of the stratum areas (Ah is constant within a stratum)
    study_area = grouped['Ah'].median().sum()

    # SE of the correctly classified area, rescaled to a proportion of the study area
    return np.sqrt(variance_sum) / study_area

In [26]:
#Global estimate of the Standard Error of Overall Accuracy
estimate_OA_SE(data)

0.006215359496848278

In [27]:
#Standard Error of Overall Accuracy estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_OA_SE).reset_index(name='OA SE')

Unnamed: 0,Region,OA SE
0,AFR,0.010327
1,EUR,0.016209
2,LAM,0.012732
3,NAM,0.011804
4,SEA-AUS,0.014622


In [28]:
def estimate_UA (df: pd.DataFrame) -> float:
    """
    Estimate the User's Accuracy of the target class.

    Input dataframe has one row per sample pixel and the following columns:
    "Map" (1 if the sample pixel was mapped as target class, and 0 otherwise)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "Ah" (stratum area, km2)
    "nh" (number of sample pixels in stratum h)
    "au" (area of sampled pixel in km2)

    Returns the estimated User's Accuracy of the target class as a proportion (0-1):
    the ratio of the estimated correctly mapped target area to the estimated mapped target area.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 8 - 11
    """
    mapped_as_target = df['Map'] == 1
    correct_target = (df['Reference'] == 1) & mapped_as_target

    # yu = au if the pixel is correctly classified as target class, 0 otherwise
    y_u = correct_target * df['au']
    # zu = au if the pixel is mapped as target class, 0 otherwise
    z_u = mapped_as_target * df['au']

    # Equation 11: inclusion probability of each sampled pixel
    inclusion_prob = df['nh'] * df['au'] / df['Ah']

    # Equations 9 and 10: inverse-probability weighted totals
    y_total = (y_u / inclusion_prob).sum()
    z_total = (z_u / inclusion_prob).sum()

    # Equation 8: ratio estimator
    return y_total / z_total

In [29]:
#Global estimate of User's Accuracy of the target class
estimate_UA(data)

0.8997915745423283

In [30]:
#User's Accuracy estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_UA).reset_index(name ='UA')

Unnamed: 0,Region,UA
0,AFR,0.621951
1,EUR,0.937984
2,LAM,0.74359
3,NAM,0.947917
4,SEA-AUS,0.74026


In [38]:
def estimate_PA (df: pd.DataFrame) -> float:
    """
    Estimate the Producer's Accuracy of the target class.

    Input dataframe has one row per sample pixel and the following columns:
    "Map" (1 if the sample pixel was mapped as target class, and 0 otherwise)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "Ah" (stratum area, km2)
    "nh" (number of sample pixels in stratum h)
    "au" (area of sample pixel in km2)

    Returns the estimated Producer's Accuracy of the target class as a proportion (0-1):
    the ratio of the estimated correctly mapped target area to the estimated reference target area.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 8 - 11
    """
    reference_is_target = df['Reference'] == 1
    correct_target = reference_is_target & (df['Map'] == 1)

    # yu = au if the pixel is correctly classified as target class, 0 otherwise
    y_u = correct_target * df['au']
    # zu = au if the pixel is of the target class in the reference classification, 0 otherwise
    z_u = reference_is_target * df['au']

    # Equation 11: inclusion probability of each sampled pixel
    inclusion_prob = df['nh'] * df['au'] / df['Ah']

    # Equations 9 and 10: inverse-probability weighted totals
    y_total = (y_u / inclusion_prob).sum()
    z_total = (z_u / inclusion_prob).sum()

    # Equation 8: ratio estimator
    return y_total / z_total

In [39]:
#Global estimate of Producer's Accuracy of the target class
estimate_PA(data)

0.8981668852260704

In [40]:
#Producer's Accuracy estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_PA).reset_index(name ='PA')

Unnamed: 0,Region,PA
0,AFR,0.438489
1,EUR,0.943558
2,LAM,0.666173
3,NAM,0.978495
4,SEA-AUS,0.742051


In [31]:
def estimate_UA_SE(df: pd.DataFrame) -> float:
    """
    Estimate the Standard Error of User's Accuracy of the target class.

    Input dataframe has one row per sample pixel and the following columns:
    "Map" (1 if the sample pixel was mapped as target class, and 0 otherwise)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "au" (area of sample pixel in km2)
    "nh" (number of sample pixels in stratum h)
    "Ah" (stratum area, km2)
    "ID" and "Stratum" columns may be present but are not used by the computation and
    are no longer required (same required columns as estimate_UA).

    Returns the estimated SE of User's Accuracy of the target class, on the same
    proportion (0-1) scale as the User's Accuracy itself.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 8 - 13
    """
    mapped_as_target = df['Map'] == 1
    # yu = au if the pixel is correctly classified as target class, 0 otherwise
    correct = ((df['Reference'] == 1) & mapped_as_target) * df['au']
    # zu = au if the pixel is mapped as target class, 0 otherwise
    mapped = mapped_as_target * df['au']

    # Equation 11: inclusion probability of each sampled pixel
    pi_u = df['nh'] * df['au'] / df['Ah']

    # Equations 8-10: ratio estimator of User's Accuracy
    y_total = (correct / pi_u).sum()
    z_total = (mapped / pi_u).sum()
    ua = y_total / z_total

    # Equation 13: per-pixel contribution to the variance, built from the
    # residual (yu - UA * zu); multiplication order kept as in the reference implementation
    residual = correct - ua * mapped
    var_u = (1 - pi_u) * residual * residual / (pi_u * pi_u)
    variance = np.sum(var_u) / z_total ** 2

    return np.sqrt(variance)

In [32]:
%%time
#Global estimate of the Standard Error of User's Accuracy (cell is timed by the %%time magic)
estimate_UA_SE(data)

Wall time: 18 ms


0.014586350037767162

In [33]:
#Standard Error of User's Accuracy estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_UA_SE).reset_index(name ='UA_SE')

Unnamed: 0,Region,UA_SE
0,AFR,0.053548
1,EUR,0.021235
2,LAM,0.049441
3,NAM,0.022678
4,SEA-AUS,0.049971


In [34]:
def estimate_PA_SE (df: pd.DataFrame) -> float:
    """
    Estimate the Standard Error of Producer's Accuracy of the target class.

    Input dataframe has one row per sample pixel and the following columns:
    "Map" (1 if the sample pixel was mapped as target class, and 0 otherwise)
    "Reference" (1 if the sample pixel was identified as target class in reference classification, and 0 otherwise)
    "au" (area of sample pixel in km2)
    "nh" (number of sample pixels in stratum h)
    "Ah" (stratum area, km2)
    "ID" and "Stratum" columns may be present but are not used by the computation and
    are no longer required (same required columns as estimate_PA).

    Returns the estimated SE of Producer's Accuracy of the target class, on the same
    proportion (0-1) scale as the Producer's Accuracy itself.

    From Tyukavina et al. (in review) "Global trends of forest loss due to fire, 2001-2019"
    Supplementary Information, equations 8 - 13
    """
    reference_is_target = df['Reference'] == 1
    # yu = au if the pixel is correctly classified as target class, 0 otherwise
    correct = (reference_is_target & (df['Map'] == 1)) * df['au']
    # zu = au if the pixel is of the target class in the reference classification, 0 otherwise
    ref = reference_is_target * df['au']

    # Equation 11: inclusion probability of each sampled pixel
    pi_u = df['nh'] * df['au'] / df['Ah']

    # Equations 8-10: ratio estimator of Producer's Accuracy
    y_total = (correct / pi_u).sum()
    z_total = (ref / pi_u).sum()
    pa = y_total / z_total

    # Equation 13: per-pixel contribution to the variance, built from the
    # residual (yu - PA * zu); multiplication order kept as in the reference implementation
    residual = correct - pa * ref
    var_u = (1 - pi_u) * residual * residual / (pi_u * pi_u)
    variance = np.sum(var_u) / z_total ** 2

    return np.sqrt(variance)

In [35]:
#Global estimate of the Standard Error of Producer's Accuracy
estimate_PA_SE(data)

0.017090040550356903

In [36]:
#Standard Error of Producer's Accuracy estimated separately for each "Region" group
data.groupby(by = ["Region"]).apply(estimate_PA_SE).reset_index(name ='PA_SE')

Unnamed: 0,Region,PA_SE
0,AFR,0.167205
1,EUR,0.019609
2,LAM,0.073371
3,NAM,0.015042
4,SEA-AUS,0.069025


In [41]:
#Evaluate every estimator on the full sample (all regions together) and show them as one table row
functions = [estimate_area, estimate_area_SE, estimate_OA, estimate_OA_SE, estimate_UA, estimate_UA_SE, estimate_PA, estimate_PA_SE]
names = ['area','area_SE', 'OA', 'OA_SE', 'UA','UA_SE', 'PA','PA_SE']
#One single-element column per estimator, keyed by its display name
values = dict(zip(names, ([fn(data)] for fn in functions)))
results = pd.DataFrame(values)

print (results.round(2).to_string(index=False))

       area   area_SE    OA  OA_SE   UA  UA_SE   PA  PA_SE
 1227725.19  31527.13  0.94   0.01  0.9   0.01  0.9   0.02


In [42]:
#Evaluate every estimator separately for each region and combine them into one table (one row per region)
functions = [estimate_area, estimate_area_SE, estimate_OA, estimate_OA_SE, estimate_UA, estimate_UA_SE, estimate_PA, estimate_PA_SE]
names = ['area','area_SE', 'OA', 'OA_SE', 'UA','UA_SE', 'PA','PA_SE']
#Each apply() yields a Series indexed by Region; naming it gives the column label after concatenation
results = pd.concat(
    [data.groupby(by = ["Region"]).apply(fn).rename(nm) for fn, nm in zip(functions, names)],
    axis = 1,
).reset_index()

print (results.round(2).to_string(index=False))

  Region       area   area_SE    OA  OA_SE    UA  UA_SE    PA  PA_SE
     AFR   16854.62   6300.36  0.98   0.01  0.62   0.05  0.44   0.17
     EUR  572411.27  21875.33  0.93   0.02  0.94   0.02  0.94   0.02
     LAM  128598.23  14285.63  0.93   0.01  0.74   0.05  0.67   0.07
     NAM  385702.26  10635.11  0.97   0.01  0.95   0.02  0.98   0.02
 SEA-AUS  124158.81  12591.68  0.92   0.01  0.74   0.05  0.74   0.07
