In [1]:
import pandas as pd
import numpy as np

### Area and accuracy estimation for sampling of units with equal area, strata weighted by number of units, from Tyukavina et al. (in review) "Options for global sampling of geographic data"

#### Example of input data

In [2]:
#Read strata info table (tab-separated file) with columns:
#"Stratum" - stratum ID, 1 - nstrata;
#"Area_km2" - stratum area in km2 or any other area units; needs to be consistent with the pixel size area units in the data table;
#"Count" - total number of units (pixels, polygons) in each stratum
strata = pd.read_csv('2-3.strata_info.txt', sep = '\t')

In [3]:
#Preview the first rows of the strata table without the index column.
#Styler.hide(axis='index') replaces Styler.hide_index(), which was deprecated in pandas 1.4 and removed in pandas 2.0.
strata.head().style.hide(axis='index')

Stratum,Area_km2,Count
1,595255.0128,788889906
2,332992.9026,781490883
3,946369.3351,1280971777
4,486272.5356,859622001
5,669855.4746,909836775


In [4]:
#Read sample interpretation table (tab-separated file) with columns:
#"Stratum" - stratum ID, 1 - nstrata;
#"Map" (for accuracy assessment only) - proportion of target class from the map (0-1) for each sample unit;
#"Reference" - proportion of target class from reference sample classification for each sample unit;
#allowed values are from -1 to 1 for area estimation, and from 0 to 1 for map accuracy assessment.
#(optional) "RefType" - type labels, if the area of target class needs to be estimated separately for multiple sub-types
#(optional) "Correct" - proportion of sample unit (0-1), which is correctly mapped.
#This column is necessary for maps with more than two classes.
#For a map with two classes it is computed in the function "estimate_OA_two_classes" directly from "Map" and "Reference" columns

data = pd.read_csv('2-3.Sample_data.txt', sep = '\t')

#Attach the stratum area and unit count to every sample unit.
#The join key is stated explicitly so that no other shared column name can silently become part of the key;
#validate='many_to_one' makes pandas raise a MergeError if a stratum ID is listed more than once in the strata table
#(which would otherwise silently duplicate sample units).
data = data.merge(strata, on = 'Stratum', validate = 'many_to_one')
#Short names used by the estimation functions: Ah - stratum area, Nh - number of units in the stratum
data = data.rename(columns = {'Area_km2':'Ah', 'Count':'Nh'})

In [5]:
#Preview the first rows of the merged sample table without the index column.
#Styler.hide(axis='index') replaces Styler.hide_index(), which was deprecated in pandas 1.4 and removed in pandas 2.0.
data.head().style.hide(axis='index')

Stratum,Map,Reference,RefType,Correct,Ah,Nh
10,1.0,1.0,Type1,1.0,161635.388,234157691
10,1.0,1.0,Type1,1.0,161635.388,234157691
10,1.0,1.0,Type1,1.0,161635.388,234157691
10,1.0,1.0,Type1,1.0,161635.388,234157691
10,1.0,1.0,Type1,1.0,161635.388,234157691


#### Functions to estimate class area and its standard error

In [6]:
def estimate_area(df: pd.DataFrame) -> float:
    """
    Estimate the target class area from sample reference values
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification;
    values in the range from -1 to 1 may be used to compute net change area of a target class,
    in which case negative proportions mean net loss and positive proportions mean net gain)
    "Ah" (stratum area, km² <or any other area unit>)
    "Nh" (number of units in stratum h)
    ~~~
    Returns the estimated target class area in the units of Ah;
    a negative area in net change computations means overall net loss of the target class.
    Only strata that occur in the input rows contribute to the totals of Nh and Ah.
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 3 and 4
    """
    per_stratum = df.groupby('Stratum')

    #Equation 3: stratum sample means weighted by the share of units in each stratum
    #(Nh is repeated on every row of a stratum; the median collapses it to one value per stratum)
    unit_counts = per_stratum['Nh'].median()
    mean_reference = per_stratum['Reference'].mean()
    weighted_proportion = (unit_counts * mean_reference / unit_counts.sum()).sum()

    #Equation 4: scale the overall proportion by the total area of the strata
    total_area = per_stratum['Ah'].median().sum()

    return weighted_proportion * total_area

In [7]:
def estimate_area_SE(df: pd.DataFrame) -> float:
    """
    Estimate the standard error (SE) of the target class area from sample reference values
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification;
    values in the range from -1 to 1 may be used to compute net change area of a target class,
    in which case negative proportions mean net loss and positive proportions mean net gain)
    "Ah" (stratum area, km² <or any other area unit>)
    "Nh" (number of units in stratum h)
    ~~~
    Returns the estimated SE of the target class area in the units of Ah;
    the SE is always a non-negative number, even if the estimated target class area is negative
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 6 and 7
    """
    strata_groups = df.groupby('Stratum')

    #Equation 6: per-stratum sample variance with finite population correction
    unit_counts = strata_groups['Nh'].median()
    sample_sizes = strata_groups['Reference'].count()
    sample_variance = strata_groups['Reference'].var(ddof=1)
    #NOTE(review): a stratum with a single sample unit has an undefined (NaN) sample variance,
    #which the sum below appears to skip silently - confirm this is the intended handling
    stratum_terms = (sample_variance / sample_sizes) * (1 - sample_sizes / unit_counts) * unit_counts**2
    total_units = unit_counts.sum()
    proportion_variance = stratum_terms.sum() / total_units / total_units

    #Equation 7: convert the SE of the proportion to area units
    total_area = strata_groups['Ah'].median().sum()

    return np.sqrt(proportion_variance) * total_area

In [8]:
#Estimate target class area (in the units of Ah, i.e. km² for the example data)
estimate_area(data)

1564753.3915654607

In [9]:
#Estimate standard error of the target class area (same units as the area estimate)
estimate_area_SE(data)

38348.157702564364

In [10]:
#Estimate target class area for each unique type from the column RefType

functions = [estimate_area, estimate_area_SE]
names = ['area, km²','area SE, km²']

#Collect one record per type and build the result table once,
#instead of growing a DataFrame with pd.concat inside the loop
records = []
for classtype in data['RefType'].unique():
    #Copy of the data where reference values are set to zero if class type is different from the current class type
    datacopy = data.assign(Reference = np.where(data['RefType'] == classtype, data['Reference'], 0))

    record = {"Estimate": classtype}
    record.update({nm: fn(datacopy) for fn, nm in zip(functions, names)})
    records.append(record)

#Rows are listed in reverse order of first appearance of each type (last type first)
results = pd.DataFrame(records[::-1])

#Styler.hide(axis='index') replaces Styler.hide_index(), which was removed in pandas 2.0
results.style.hide(axis='index').format({name: '{:.4f}' for name in names})


Estimate,"area, km²","area SE, km²"
Type3,499503.8249,45035.4896
Type2,769153.3741,50682.092
Type0,0.0,0.0
Type1,296096.1926,37117.3059


#### Functions to estimate map accuracy (overall, user's, producer's) and its standard error

In [11]:
def estimate_OA_two_classes(df: pd.DataFrame) -> float:
    """
    Estimate overall accuracy of a map that has two classes (target class vs. no target class)
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Map" (0-1 - proportion of the sample unit identified as target class in the map)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated overall accuracy of the map expressed as percent of the total study area
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 3 and 5
    """
    #Proportion of each sample unit that is correctly classified (as either of the two map classes):
    #min("Map", "Reference") + min(1 - "Map", 1 - "Reference")
    agreement = (np.minimum(df['Map'], df['Reference'])
                 + np.minimum(1 - df['Map'], 1 - df['Reference']))
    #Work on a fresh frame so that the caller's dataframe is left untouched
    scored = df[['Stratum', 'Nh']].assign(Correct=agreement)

    per_stratum = scored.groupby('Stratum')

    #Equation 3: stratum means of "Correct" weighted by the share of units in each stratum
    unit_counts = per_stratum['Nh'].median()
    weighted = unit_counts * per_stratum['Correct'].mean() / unit_counts.sum()

    #Equation 5: overall accuracy as a percentage
    return weighted.sum() * 100

In [12]:
#Estimate overall accuracy (%) of the two-class map from the "Map" and "Reference" columns
estimate_OA_two_classes(data)

91.41793783017692

In [13]:
def estimate_OA_multiple_classes(df: pd.DataFrame) -> float:
    """
    Estimate overall accuracy of a map that has multiple classes
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Correct" (0-1 - proportion of the sample unit which is correctly mapped)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated overall accuracy of the map expressed as percent of the total study area
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 3 and 5
    """
    per_stratum = df.groupby('Stratum')

    #Equation 3: stratum means of "Correct" weighted by the share of units in each stratum
    unit_counts = per_stratum['Nh'].median()
    mean_correct = per_stratum['Correct'].mean()
    weighted = unit_counts * mean_correct / unit_counts.sum()

    #Equation 5: overall accuracy as a percentage
    return weighted.sum() * 100

In [14]:
#Estimate overall accuracy (%) from the precomputed "Correct" column
estimate_OA_multiple_classes(data)

91.41793783017692

In [15]:
def estimate_SE_OA_two_classes(df: pd.DataFrame) -> float:
    """
    Estimate the standard error of overall accuracy of a map that has two classes
    (target class vs. no target class) for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Map" (0-1 - proportion of the sample unit identified as target class in the map)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated standard error of the overall accuracy of the map expressed as percent of the total study area
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 6 and 7
    """
    #Proportion of each sample unit that is correctly classified (as either of the two map classes):
    #min("Map", "Reference") + min(1 - "Map", 1 - "Reference")
    agreement = (np.minimum(df['Map'], df['Reference'])
                 + np.minimum(1 - df['Map'], 1 - df['Reference']))
    #Work on a fresh frame so that the caller's dataframe is left untouched
    scored = df[['Stratum', 'Nh']].assign(Correct=agreement)

    per_stratum = scored.groupby('Stratum')

    #Equation 6: per-stratum sample variance of "Correct" with finite population correction
    unit_counts = per_stratum['Nh'].median()
    sample_sizes = per_stratum['Correct'].count()
    sample_variance = per_stratum['Correct'].var(ddof=1)
    stratum_terms = (sample_variance / sample_sizes) * (1 - sample_sizes / unit_counts) * unit_counts**2
    total_units = unit_counts.sum()
    accuracy_variance = stratum_terms.sum() / total_units / total_units

    #Equation 7: SE as a percentage
    return np.sqrt(accuracy_variance) * 100

In [16]:
#Estimate standard error (%) of the overall accuracy of the two-class map
estimate_SE_OA_two_classes(data)

0.858039217405162

In [17]:
def estimate_SE_OA_multiple_classes(df: pd.DataFrame) -> float:
    """
    Estimate the standard error of overall accuracy of a map that has multiple classes
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Correct" (0-1 - proportion of the sample unit which is correctly mapped)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated standard error of the overall accuracy of the map expressed as percent of the total study area
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 6 and 7
    """
    per_stratum = df.groupby('Stratum')

    #Equation 6: per-stratum sample variance of "Correct" with finite population correction
    unit_counts = per_stratum['Nh'].median()
    sample_sizes = per_stratum['Correct'].count()
    sample_variance = per_stratum['Correct'].var(ddof=1)
    stratum_terms = (sample_variance / sample_sizes) * (1 - sample_sizes / unit_counts) * unit_counts**2
    total_units = unit_counts.sum()
    accuracy_variance = stratum_terms.sum() / total_units / total_units

    #Equation 7: SE as a percentage
    return np.sqrt(accuracy_variance) * 100

In [18]:
#Estimate standard error (%) of the overall accuracy from the precomputed "Correct" column
estimate_SE_OA_multiple_classes(data)

0.858039217405162

In [19]:
def estimate_UA(df: pd.DataFrame) -> float:
    """
    Estimate user's accuracy of the target class
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Map" (0-1 - proportion of the sample unit identified as target class in the map)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated user's accuracy of the target class expressed as percentage
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equation 9
    """
    #"Correct": proportion of the sample unit that is correctly classified as target class, min("Map", "Reference")
    correct = np.minimum(df['Map'], df['Reference'])
    scored = df[['Stratum', 'Nh', 'Map']].assign(Correct=correct)

    per_stratum = scored.groupby('Stratum')

    #Equation 9: ratio of the correctly mapped target class total to the mapped target class total
    unit_counts = per_stratum['Nh'].median()
    correct_total = (unit_counts * per_stratum['Correct'].mean()).sum()
    mapped_total = (unit_counts * per_stratum['Map'].mean()).sum()

    return correct_total / mapped_total * 100

In [20]:
#Estimate user's accuracy (%) of the target class
estimate_UA(data)

82.59206405937368

In [21]:
def estimate_PA(df: pd.DataFrame) -> float:
    """
    Estimate producer's accuracy of the target class
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Map" (0-1 - proportion of the sample unit identified as target class in the map)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated producer's accuracy of the target class expressed as percentage
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equation 9
    """
    #"Correct": proportion of the sample unit that is correctly classified as target class, min("Map", "Reference")
    correct = np.minimum(df['Map'], df['Reference'])
    scored = df[['Stratum', 'Nh', 'Reference']].assign(Correct=correct)

    per_stratum = scored.groupby('Stratum')

    #Equation 9: ratio of the correctly mapped target class total to the reference target class total
    unit_counts = per_stratum['Nh'].median()
    correct_total = (unit_counts * per_stratum['Correct'].mean()).sum()
    reference_total = (unit_counts * per_stratum['Reference'].mean()).sum()

    return correct_total / reference_total * 100

In [22]:
#Estimate producer's accuracy (%) of the target class
estimate_PA(data)

95.76568315876409

In [23]:
def estimate_UA_SE(df: pd.DataFrame) -> float:
    """
    Estimate the standard error (SE) of user's accuracy of the target class
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Map" (0-1 - proportion of the sample unit identified as target class in the map)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated SE of user's accuracy of the target class expressed as percentage
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 9-11
    """
    #y = "Correct": proportion of the sample unit correctly classified as target class, min("Map", "Reference");
    #x = "Map"; "XY" is their per-unit product, needed for the sample covariance
    correct = np.minimum(df['Map'], df['Reference'])
    scored = df[['Nh', 'Stratum', 'Map']].assign(Correct=correct, XY=df['Map'] * correct)

    per_stratum = scored.groupby('Stratum')
    unit_counts = per_stratum['Nh'].median()
    y_mean = per_stratum['Correct'].mean()
    x_mean = per_stratum['Map'].mean()

    #Equation 11: estimated total of the denominator variable
    x_total = (unit_counts * x_mean).sum()

    #Equation 9: ratio estimate (user's accuracy as a fraction)
    y_total = (unit_counts * y_mean).sum()
    ratio = y_total / x_total

    #Equation 10: variance of the ratio estimator
    sample_sizes = per_stratum['Correct'].count()
    y_var = per_stratum['Correct'].var(ddof=1)
    x_var = per_stratum['Map'].var(ddof=1)
    xy_cov = (per_stratum['XY'].sum() - sample_sizes * y_mean * x_mean) / (sample_sizes - 1)
    residual_var = y_var + ratio**2 * x_var - 2 * ratio * xy_cov
    stratum_terms = unit_counts**2 * (1 - sample_sizes / unit_counts) * residual_var / sample_sizes
    ratio_variance = stratum_terms.sum() / x_total / x_total

    return np.sqrt(ratio_variance) * 100

In [24]:
#Estimate standard error (%) of user's accuracy of the target class
estimate_UA_SE(data)

1.9330022817204517

In [25]:
def estimate_PA_SE(df: pd.DataFrame) -> float:
    """
    Estimate the standard error (SE) of producer's accuracy of the target class
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Map" (0-1 - proportion of the sample unit identified as target class in the map)
    "Reference" (0-1 - proportion of the sample unit identified as target class in the reference classification)
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated SE of producer's accuracy of the target class expressed as percentage
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 9-11
    """
    #y = "Correct": proportion of the sample unit correctly classified as target class, min("Map", "Reference");
    #x = "Reference"; "XY" is their per-unit product, needed for the sample covariance
    correct = np.minimum(df['Map'], df['Reference'])
    scored = df[['Nh', 'Stratum', 'Reference']].assign(Correct=correct, XY=df['Reference'] * correct)

    per_stratum = scored.groupby('Stratum')
    unit_counts = per_stratum['Nh'].median()
    y_mean = per_stratum['Correct'].mean()
    x_mean = per_stratum['Reference'].mean()

    #Equation 11: estimated total of the denominator variable
    x_total = (unit_counts * x_mean).sum()

    #Equation 9: ratio estimate (producer's accuracy as a fraction)
    y_total = (unit_counts * y_mean).sum()
    ratio = y_total / x_total

    #Equation 10: variance of the ratio estimator
    sample_sizes = per_stratum['Correct'].count()
    y_var = per_stratum['Correct'].var(ddof=1)
    x_var = per_stratum['Reference'].var(ddof=1)
    xy_cov = (per_stratum['XY'].sum() - sample_sizes * y_mean * x_mean) / (sample_sizes - 1)
    residual_var = y_var + ratio**2 * x_var - 2 * ratio * xy_cov
    stratum_terms = unit_counts**2 * (1 - sample_sizes / unit_counts) * residual_var / sample_sizes
    ratio_variance = stratum_terms.sum() / x_total / x_total

    return np.sqrt(ratio_variance) * 100

In [26]:
#Estimate standard error (%) of producer's accuracy of the target class
estimate_PA_SE(data)

0.9336026822407393

#### Functions to estimate % of class1 from class2

In [27]:
def estimate_class_percent(df: pd.DataFrame) -> float:
    """
    Estimate percent of class1 from class2, both estimated from the sample,
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Class1" (0-1 - proportion of the sample unit identified as class 1 (percentage numerator))
    "Class2" (0-1 - proportion of the sample unit identified as class 2 (percentage denominator))
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated percent of class1 from the area of class2
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equation 9
    """
    per_stratum = df.groupby('Stratum')

    #Equation 9: ratio of the two estimated class totals
    unit_counts = per_stratum['Nh'].median()
    numerator_total = (unit_counts * per_stratum['Class1'].mean()).sum()
    denominator_total = (unit_counts * per_stratum['Class2'].mean()).sum()

    return numerator_total / denominator_total * 100

In [28]:
def estimate_class_percent_SE(df: pd.DataFrame) -> float:
    """
    Estimate the standard error (SE) of percent of class1 from class2, both estimated from the sample,
    for sampling of units (pixels/polygons) with equal area.
    Strata are weighted by their respective unit counts (Nh).
    ~~~
    Input dataframe has one row per sample pixel/polygon and the following columns:
    "Stratum" (stratum IDs, 1 - nstrata)
    "Class1" (0-1 - proportion of the sample unit identified as class 1 (percentage numerator))
    "Class2" (0-1 - proportion of the sample unit identified as class 2 (percentage denominator))
    "Nh" (number of units in stratum h)
    ~~~
    Returns estimated SE of percent of class1 from class2
    ~~~
    From Tyukavina et al. (in review) "Options for global sampling of geographic data"
    Appendix, equations 9-11
    """
    #Add the per-unit product of the two classes (needed for the sample covariance) to a copy of the input,
    #so that the caller's dataframe is left untouched
    scored = df.assign(XY=df['Class1'] * df['Class2'])

    per_stratum = scored.groupby('Stratum')
    unit_counts = per_stratum['Nh'].median()
    y_mean = per_stratum['Class1'].mean()
    x_mean = per_stratum['Class2'].mean()

    #Equation 11: estimated total of the denominator class
    x_total = (unit_counts * x_mean).sum()

    #Equation 9: ratio estimate (class1 as a fraction of class2)
    y_total = (unit_counts * y_mean).sum()
    ratio = y_total / x_total

    #Equation 10: variance of the ratio estimator
    sample_sizes = per_stratum['Class1'].count()
    y_var = per_stratum['Class1'].var(ddof=1)
    x_var = per_stratum['Class2'].var(ddof=1)
    xy_cov = (per_stratum['XY'].sum() - sample_sizes * y_mean * x_mean) / (sample_sizes - 1)
    residual_var = y_var + ratio**2 * x_var - 2 * ratio * xy_cov
    stratum_terms = unit_counts**2 * (1 - sample_sizes / unit_counts) * residual_var / sample_sizes
    ratio_variance = stratum_terms.sum() / x_total / x_total

    return np.sqrt(ratio_variance) * 100

In [29]:
#Example of use: estimate % of Types 0-3 from the overall area of the target class (Reference>0).

functions = [estimate_class_percent, estimate_class_percent_SE]
names = ['% from target class','SE']

#Collect one record per type and build the result table once,
#instead of growing a DataFrame with pd.concat inside the loop
records = []
for classtype in data['RefType'].unique():
    datacopy = data.assign(
        #Class1: reference values, set to zero if class type is different from the current class type
        Class1 = np.where(data['RefType'] == classtype, data['Reference'], 0),
        #Class2: equal to Reference (total area of target class)
        Class2 = data['Reference'],
    )

    record = {"Estimate": classtype}
    record.update({nm: fn(datacopy) for fn, nm in zip(functions, names)})
    records.append(record)

#Rows are listed in reverse order of first appearance of each type (last type first)
results = pd.DataFrame(records[::-1])

#Styler.hide(axis='index') replaces Styler.hide_index(), which was removed in pandas 2.0
results.style.hide(axis='index').format({name: '{:.2f}' for name in names})


Estimate,% from target class,SE
Type3,31.92,2.79
Type2,49.15,2.98
Type0,0.0,0.0
Type1,18.92,2.33
