In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("workshop_cemeai_spatinterp_data.csv")
data.head()

Unnamed: 0,Nr,ID,POINT_X,POINT_Y,Sand,Silt,Clay,Ca,set
0,440,A1,756995.5,7520304.0,53,10,37,35,validation
1,893,B1,756995.5,7520304.0,64,4,32,19,validation
2,182,A3,756106.75,7519687.5,14,17,69,42,validation
3,642,B3,756106.75,7519687.5,36,28,36,33,validation
4,331,A5,755907.5,7519786.0,34,17,49,67,cal_candidate


In [3]:
data_a = data[data['ID'].str.startswith('A')]
data_b = data[data['ID'].str.startswith('B')]

len(data_a), len(data_b)

(458, 452)

In [4]:
data_a.to_csv('data_group_a.csv', index=False)
data_b.to_csv('data_group_b.csv', index=False)

In [5]:
def haversine_dist(lat1, lon1, lat2, lon2):
    R = 6373.0  # aproximacao do raio da terra

    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # deltas
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Haversine formula
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

def haversine_dmatrix(dataset):
    dist = np.zeros((len(dataset), len(dataset)))
    tmp = dataset.loc[:, ['POINT_X', 'POINT_Y']].values
    
    for i in range(len(dataset)):
        dist[i, :] = haversine_dist(
            tmp[i, 0], tmp[i, 1],
            tmp[:, 0], tmp[:, 1]
        )
    
    return dist

In [6]:
cdata_a, vdata_a = data_a[data_a['set'] == 'cal_candidate'], data_a[data_a['set'] == 'validation']
cdata_b, vdata_b = data_b[data_b['set'] == 'cal_candidate'], data_b[data_b['set'] == 'validation']
                                                                    
cdata_a.to_csv('calibration_data_group_a.csv', index=False)
cdata_b.to_csv('calibration_data_group_b.csv', index=False)

vdata_a.to_csv('validation_data_group_a.csv', index=False)
vdata_b.to_csv('validation_data_group_b.csv', index=False)

In [7]:
# k-NN que prediz o valor mediano entre os vizinhos: baseline
def knn_median_pred(trd, vld, k=3):
    preds = np.zeros(len(vld))

    for i, vsample in enumerate(vld):
        dists = haversine_dist(trd[:, 0], trd[:, 1], vsample[0], vsample[1])

        pos = np.argsort(dists)
        preds[i] = np.median(trd[pos, 2][:k])
    
    return vld[:, -1], preds

In [8]:
# Grupo A

trd = cdata_a.loc[:, ['POINT_X', 'POINT_Y', 'Ca']].values
vld = vdata_a.loc[:, ['POINT_X', 'POINT_Y', 'Ca']].values

y, y_pred = knn_median_pred(trd, vld, 10)

print('MAE:', np.mean(np.abs(y - y_pred)))
print('RMSE:', np.sqrt(np.mean((y - y_pred) ** 2)))

MAE: 8.809734513274336
RMSE: 13.985691170697423


In [9]:
# Grupo B

trd = cdata_b.loc[:, ['POINT_X', 'POINT_Y', 'Ca']].values
vld = vdata_b.loc[:, ['POINT_X', 'POINT_Y', 'Ca']].values

y, y_pred = knn_median_pred(trd, vld, 10)

print('MAE:', np.mean(np.abs(y - y_pred)))
print('RMSE:', np.sqrt(np.mean((y - y_pred) ** 2)))

MAE: 7.973684210526316
RMSE: 12.024645743769664
