### Partial imputation experiments on ESA landcover data

#### Imputation based on latitude and elevation. Random downscaling

In [197]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_absolute_error as mae

In [198]:
# Partial imputation algorithm as a function
def partial_imputation(observation, cluster_average):
    """Fill the missing mass of an observation using a cluster average.

    The gap ``1 - sum(observation)`` is distributed over the components in
    which the observation lies below ``cluster_average``, in proportion to
    how far below it lies.

    Parameters
    ----------
    observation : array-like
        Component values of one observation; anything that is not already
        an ``np.ndarray`` is converted with ``np.array``.
    cluster_average : array-like
        Reference values, element-wise aligned with ``observation``.

    Returns
    -------
    np.ndarray
        The imputed observation. The (converted) observation is returned
        unchanged when no component is below the cluster average or when
        the cluster average contains NaN.
    """
    obs = observation if isinstance(observation, np.ndarray) else np.array(observation)
    avg = (
        cluster_average
        if isinstance(cluster_average, np.ndarray)
        else np.array(cluster_average)
    )

    # Only components where the observation falls short of the average count
    shortfall = np.maximum(avg - obs, 0)
    shortfall_total = np.sum(shortfall)

    # Nothing to distribute, or no usable reference: leave the observation as is
    if shortfall_total == 0 or np.isnan(avg).any():
        return obs

    share = shortfall / shortfall_total
    return obs + share * (1 - np.sum(obs))

In [199]:
# Load the raw table; `dataset0` is kept as read so its other columns stay available
dataset0 = pd.read_csv('ESA_landcover_data.csv')

# Columns holding the vegetation-type fractions
columns_to_select = ['e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e9', 'e11', 'e13']

# Work on a float32 copy of the table, restricted to the vegetation-type columns
esa = dataset0.copy().astype(np.float32).loc[:, columns_to_select]


In [200]:
# ESA landcover fractions: display the selected vegetation-type columns
# (the notebook renders the table that follows as this cell's output)
esa

Unnamed: 0,e1,e2,e3,e4,e5,e6,e7,e9,e11,e13
0,0.15900,0.03183,0.00189,0.00000,0.03622,0.04039,0.02871,0.09058,0.16102,0.0
1,0.02589,0.02934,0.00516,0.00000,0.02031,0.03032,0.01938,0.06882,0.43829,0.0
2,0.04600,0.01767,0.00394,0.00000,0.01335,0.01932,0.01009,0.03914,0.32218,0.0
3,0.04120,0.02608,0.00322,0.00000,0.02367,0.02795,0.02021,0.06767,0.27331,0.0
4,0.62481,0.01201,0.00000,0.00000,0.05531,0.04435,0.01203,0.03684,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...
52292,0.35223,0.00000,0.00000,0.00000,0.12222,0.13089,0.10284,0.14722,0.10067,0.0
52293,0.12890,0.00000,0.00000,0.00000,0.10955,0.00000,0.00000,0.00000,0.00000,0.0
52294,0.05928,0.00000,0.00000,0.00000,0.07672,0.07795,0.06861,0.08156,0.07085,0.0
52295,0.49759,0.01806,0.00393,0.00393,0.10594,0.11033,0.07869,0.10671,0.07476,0.0


In [201]:
# Independent NumPy copies of the latitude and elevation columns of the raw
# table, so later reassignment never touches `dataset0`
latitude = np.array(dataset0['lat'], copy=True)
elevation = np.array(dataset0['elevation'], copy=True)

In [202]:
# Class labels assigned to the elevation bands, lowest band first
values = [1, 2, 3, 4, 5]
# Band edges; pd.cut treats them as right-closed intervals by default
bins = [-np.inf, 500, 1000, 2000, 3000, np.inf]

# Zero-based band index of every elevation value
elevation_bins = pd.cut(elevation, bins=bins, labels=False)

# Translate band indices into class labels and overwrite `elevation` with them.
# Entries matching no band fall back to np.select's default of 0.
elevation = elevation_values = np.select(
    condlist=[elevation_bins == band for band in range(len(values))],
    choicelist=values,
)

In [203]:
# Boolean mask of ESA rows whose fractions, each rounded to 2 decimals, add up to 1
# NOTE(review): this is an exact equality test on a floating-point sum; confirm
# that rows are not dropped because of rounding noise.
index_unity = esa.round(2).sum(axis=1).eq(1)

# Keep only the complete observations
esa_complete = esa.loc[index_unity]

In [204]:
# Latitude (rounded to whole numbers) and elevation class of the complete observations
latitudes_complete = latitude[index_unity].round()
elevation_complete = elevation[index_unity]

# Complete observations with their latitude/elevation cluster keys appended
lat_el = esa_complete.assign(lat=latitudes_complete, elevation=elevation_complete)

In [205]:
# A function for random downscaling of the fractions in complete observations
def make_incomplete(random_seed, rows, vtypes, data=None):
    """Randomly downscale every entry of a table of complete observations.

    Each entry is multiplied by its own factor drawn uniformly from [0, 1).

    Parameters
    ----------
    random_seed : int
        Seed of the random factors; the same seed gives the same factors.
    rows, vtypes : int
        Shape of the factor matrix; it has to match the shape of the data.
    data : pandas.DataFrame, optional
        Observations to downscale. Defaults to the module-level
        ``esa_complete``, which was the only supported input before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        Element-wise product of the data and the random factors.
    """
    if data is None:
        data = esa_complete

    # A private legacy generator produces exactly the stream that
    # np.random.seed(random_seed) followed by np.random.rand(...) would,
    # but leaves the process-wide NumPy random state untouched.
    rng = np.random.RandomState(random_seed)
    incompleteness = rng.rand(rows, vtypes)

    return data * incompleteness

In [206]:
# A function for imputing incomplete observations
def impute_incomplete(percent):
    """Impute every row of ``percent`` whose values do not sum to 1.

    Row ``i`` of ``percent`` is matched by position to row ``i`` of the
    module-level ``lat_el`` table, which supplies its latitude and
    elevation. The cluster average for that row is the column-wise mean of
    the rows of ``esa_complete`` that share this latitude and elevation and
    whose row sum, rounded to 3 decimals, equals 1. The row and its cluster
    average are then handed to ``partial_imputation``.

    Parameters
    ----------
    percent : pandas.DataFrame
        Observations to impute; must be positionally aligned with ``lat_el``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``percent`` in which every row not summing to 1 (to
        3 decimals) has been replaced by its imputed version.
    """
    # NOTE(review): the cluster average is taken over ``esa_complete``, which
    # also contains the row at the position being imputed - confirm this is
    # intended when the result is used to estimate the imputation error.
    percent_new = percent.copy()

    # A cluster average depends only on (latitude, elevation), never on
    # ``percent``, so it is computed once per cluster and reused instead of
    # being rebuilt from the whole table for every single row.
    cluster_means = {}

    for i in range(len(percent)):
        impute = percent.iloc[i]

        # Rows that already sum to 1 need no imputation and no cluster lookup
        if round(impute.sum(axis=0), 3) == 1:
            continue

        keys = lat_el.iloc[i]
        lat_obs = keys['lat']
        el_obs = keys['elevation']

        cluster_key = (lat_obs, el_obs)
        if cluster_key not in cluster_means:
            in_cluster = np.logical_and(
                lat_el['lat'] == lat_obs, lat_el['elevation'] == el_obs
            )
            cluster = esa_complete[in_cluster]
            cluster_means[cluster_key] = cluster[
                np.round(cluster.sum(axis=1), 3) == 1
            ].mean(axis=0)

        percent_new.iloc[i] = partial_imputation(impute, cluster_means[cluster_key])

    return percent_new

In [207]:
# For seeds 1..1000: downscale the complete observations at random, impute them,
# and record the mean absolute error against the originals, scaled by 100
mae_errors = []
for i in range(1, 1001):
    degraded = make_incomplete(i, *esa_complete.shape)
    restored = impute_incomplete(degraded)
    mae_errors.append(mae(esa_complete, restored) * 100)

# Average error over all seeds, rounded to 2 decimals
round(np.mean(mae_errors), 2)

1.94