In [None]:
!pip install scikit-learn==1.0
!pip install xgboost==1.4.2
!pip install catboost==0.26.1
!pip install pandas==1.3.3
!pip install radiant-mlhub==0.3.0
!pip install rasterio==1.2.8
!pip install numpy==1.21.2
!pip install pathlib==1.0.1
!pip install tqdm==4.62.3
!pip install joblib==1.0.1
!pip install matplotlib==3.4.3
!pip install Pillow==8.3.2
!pip install torch==1.9.1
!pip install plotly==5.3.1

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from joblib import Parallel,delayed
import gc

In [None]:
# Read the three per-field input tables from the working directory.
train_df_mean = pd.read_csv('train_mean.csv')
train_df_coordinates = pd.read_csv('train_coordinates_lat_lon.csv')
test_df_coordinates = pd.read_csv('test_coordinates_lat_lon.csv')

# Left-join the train_mean table onto the training coordinates by field_id and
# keep only the id, position and label columns.
train_df_coordinates = train_df_coordinates.merge(
    train_df_mean,
    how='left',
    on=['field_id'],
)[['field_id', 'lat', 'long', 'label']]

In [None]:
# Keep only the training rows whose label is one of 1..9; rows with any other
# (or a missing) label are dropped.
train_df_coordinates = train_df_coordinates.loc[
    train_df_coordinates['label'].isin(range(1, 10))
]


In [None]:
# Stack the training rows on top of the test rows. `pd.concat` is used instead
# of `DataFrame.append`, which is deprecated and removed in pandas 2.0; for two
# DataFrames the result is the same (original row index kept, no column sorting).
merge_df = pd.concat([train_df_coordinates, test_df_coordinates], sort=False)
# Replace every missing value with 10. Presumably the test frame has no `label`
# column, so 10 acts as the "unlabelled" class -- confirm.
# NOTE(review): fillna is applied to ALL columns, so a missing lat/long would
# silently become 10 as well.
merge_df = merge_df.fillna(10)
merge_df.shape

(122368, 4)

In [None]:
# field_id -> label lookup for every row of merge_df.
field_label_dict = {
    field_id: label
    for field_id, label in zip(merge_df['field_id'].values, merge_df['label'].values)
}
# Positional row number in merge_df (0-based) -> field_id.
field_range_dict = dict(enumerate(merge_df['field_id'].values))
len(field_label_dict)

122368

### Nearest points within radius of 0.25

In [None]:
# Search radius handed to the haversine NearestNeighbors model below.
# NOTE(review): scikit-learn's haversine metric expects [lat, lon] in radians
# and measures distance in radians; presumably these columns are in degrees --
# confirm the units of both the coordinates and the radius.
radius = 0.25
# (n_rows, 2) array of [lat, long] for every train and test field.
vals = merge_df[['lat', 'long']].to_numpy()

In [None]:
# Radius-based neighbour index over all train and test coordinates.
neigh = NearestNeighbors(
    radius=radius,
    metric='haversine',
    n_jobs=-1,
)
neigh.fit(vals)

In [None]:
def get_frequency(closest_indices):
    """Count how many of the given rows carry each label 1..10.

    Args:
        closest_indices: iterable of positional row numbers; each is mapped to
            a field_id through the module-level ``field_range_dict`` and then
            to a label through ``field_label_dict``.

    Returns:
        A list of 10 integers; element ``k`` is the number of rows whose label
        equals ``k + 1``.

    Raises:
        KeyError: if a row number, field_id or label is not present in the
            respective lookup.
    """
    tally = dict.fromkeys(range(1, 11), 0)
    for row_idx in closest_indices:
        field_id = field_range_dict[row_idx]
        tally[field_label_dict[field_id]] += 1
    return [tally[label] for label in range(1, 11)]

def get_nearest(point,radius):
    """Return the label counts of all fitted rows within ``radius`` of ``point``.

    Args:
        point: a single coordinate pair; it is wrapped in a list so the
            module-level ``neigh`` model receives a one-row query.
        radius: search radius forwarded to ``neigh.radius_neighbors``.

    Returns:
        The 10-element count list produced by ``get_frequency`` for the
        neighbouring row indices.
    """
    # Only the neighbour indices are used, so skip building the distance
    # arrays (return_distance=False yields just the index arrays).
    indices = neigh.radius_neighbors([point], radius=radius, return_distance=False)[0]
    return get_frequency(indices)




In [None]:
# One task per row of `vals`; each returns that row's 10 neighbour-label counts.
# Every query row is also part of the set `neigh` was fitted on, so a row's own
# label is included in its counts.
# NOTE(review): for training rows that means the target label feeds its own
# feature -- confirm this is acceptable downstream.
outs_train = Parallel(
    n_jobs=-1,
    timeout=100000,
    backend="multiprocessing",
    verbose=1,
)(
    delayed(get_nearest)(point=row, radius=radius)
    for row in vals
)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 708 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 1408 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 2308 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 3408 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 4708 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 6208 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 7908 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done 9808 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 11908 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 14208 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 16708 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 22308 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done

In [None]:
# One count column per label 1..10, suffixed with the current radius.
column_names = [f'Crop_{i}_{radius}' for i in range(1,11)]

# Rows of `outs_train` are in the same order as `merge_df`, so the field ids
# can be attached positionally.
nearest_data = pd.DataFrame(outs_train, columns=column_names).assign(
    field_id=merge_df['field_id'].tolist()
)
nearest_data

Unnamed: 0,Crop_1_0.6,Crop_2_0.6,Crop_3_0.6,Crop_4_0.6,Crop_5_0.6,Crop_6_0.6,Crop_7_0.6,Crop_8_0.6,Crop_9_0.6,Crop_10_0.6,field_id
0,5716,9882,2299,20022,1894,4919,7492,1071,45,22413,3020
1,5565,9714,2247,20017,1841,4815,7244,1040,41,22142,7478
2,5650,9823,2289,20025,1886,4887,7414,1057,44,22315,15902
3,5530,9676,2243,19996,1834,4794,7186,1037,38,22090,38846
4,5592,9735,2256,20024,1852,4820,7279,1042,41,22180,42856
...,...,...,...,...,...,...,...,...,...,...,...
122363,6376,11083,2736,17440,2419,6286,9348,1321,303,23695,95767
122364,6357,11050,2729,17404,2415,6279,9338,1320,303,23682,101421
122365,6411,10972,2696,17806,2418,6196,9263,1316,279,23598,105889
122366,6390,10987,2695,17619,2412,6215,9288,1316,288,23623,115157


In [None]:
# Total number of neighbours per row, kept as its own column.
nearest_data[f'count_{radius}'] = nearest_data[column_names].sum(axis=1)
# Turn each per-label count into a percentage of that row's total.
nearest_data[column_names] = (
    nearest_data[column_names].div(nearest_data[f'count_{radius}'], axis=0) * 100
)

In [None]:
# Persist the per-field neighbour features for this radius (row index omitted).
nearest_data.to_csv(
    f'full_nearest_radius_{radius}.csv',
    index=False,
)

### Nearest points within radius of 0.4

In [None]:
# Search radius for this section's haversine NearestNeighbors model.
# NOTE(review): scikit-learn's haversine metric expects [lat, lon] in radians
# and measures distance in radians; presumably these columns are in degrees --
# confirm the units of both the coordinates and the radius.
radius = 0.4
# (n_rows, 2) array of [lat, long] for every train and test field.
vals = merge_df[['lat', 'long']].to_numpy()

In [None]:
# Rebuild the radius-based neighbour index with the new radius.
neigh = NearestNeighbors(
    radius=radius,
    metric='haversine',
    n_jobs=-1,
)
neigh.fit(vals)

In [None]:
def get_frequency(closest_indices):
    """Count how many of the given rows carry each label 1..10.

    This re-defines the helper of the same name declared earlier in the
    notebook with identical behaviour.

    Args:
        closest_indices: iterable of positional row numbers; each is mapped to
            a field_id through the module-level ``field_range_dict`` and then
            to a label through ``field_label_dict``.

    Returns:
        A list of 10 integers; element ``k`` is the number of rows whose label
        equals ``k + 1``.

    Raises:
        KeyError: if a row number, field_id or label is not present in the
            respective lookup.
    """
    tally = dict.fromkeys(range(1, 11), 0)
    for row_idx in closest_indices:
        field_id = field_range_dict[row_idx]
        tally[field_label_dict[field_id]] += 1
    return [tally[label] for label in range(1, 11)]

def get_nearest(point,radius):
    """Return the label counts of all fitted rows within ``radius`` of ``point``.

    This re-defines the helper of the same name declared earlier in the
    notebook; it reads whichever ``neigh`` model is current at call time.

    Args:
        point: a single coordinate pair; it is wrapped in a list so the
            module-level ``neigh`` model receives a one-row query.
        radius: search radius forwarded to ``neigh.radius_neighbors``.

    Returns:
        The 10-element count list produced by ``get_frequency`` for the
        neighbouring row indices.
    """
    # Only the neighbour indices are used, so skip building the distance
    # arrays (return_distance=False yields just the index arrays).
    indices = neigh.radius_neighbors([point], radius=radius, return_distance=False)[0]
    return get_frequency(indices)




In [None]:
# One task per row of `vals`; each returns that row's 10 neighbour-label counts.
# Every query row is also part of the set `neigh` was fitted on, so a row's own
# label is included in its counts.
# NOTE(review): for training rows that means the target label feeds its own
# feature -- confirm this is acceptable downstream.
outs_train = Parallel(
    n_jobs=-1,
    timeout=100000,
    backend="multiprocessing",
    verbose=1,
)(
    delayed(get_nearest)(point=row, radius=radius)
    for row in vals
)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 708 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 1408 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 2308 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 3408 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 4708 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 6208 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 7908 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 9808 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 11908 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 14208 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 16708 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 19408 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 22308 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done

In [None]:
# One count column per label 1..10, suffixed with the current radius.
column_names = [f'Crop_{i}_{radius}' for i in range(1,11)]

# Rows of `outs_train` are in the same order as `merge_df`, so the field ids
# can be attached positionally.
nearest_data = pd.DataFrame(outs_train, columns=column_names).assign(
    field_id=merge_df['field_id'].tolist()
)
nearest_data

Unnamed: 0,Crop_1_0.8,Crop_2_0.8,Crop_3_0.8,Crop_4_0.8,Crop_5_0.8,Crop_6_0.8,Crop_7_0.8,Crop_8_0.8,Crop_9_0.8,Crop_10_0.8,field_id
0,6890,11876,2974,20106,2711,6435,9473,1336,361,25670,3020
1,6842,11703,2919,20102,2678,6313,9391,1325,296,25472,7478
2,6874,11818,2960,20103,2698,6384,9446,1332,331,25611,15902
3,6836,11637,2901,20102,2671,6298,9375,1322,291,25416,38846
4,6854,11741,2926,20103,2680,6330,9407,1325,303,25503,42856
...,...,...,...,...,...,...,...,...,...,...,...
122363,7340,12921,3529,20239,3450,7238,10236,1457,1024,27651,95767
122364,7339,12919,3529,20238,3445,7237,10236,1457,1022,27643,101421
122365,7325,12892,3491,20223,3344,7221,10164,1450,979,27493,105889
122366,7328,12897,3508,20223,3371,7227,10178,1455,997,27523,115157


In [None]:
# Total number of neighbours per row, kept as its own column.
nearest_data[f'count_{radius}'] = nearest_data[column_names].sum(axis=1)
# Turn each per-label count into a percentage of that row's total.
nearest_data[column_names] = (
    nearest_data[column_names].div(nearest_data[f'count_{radius}'], axis=0) * 100
)

In [None]:
# Show the table after the counts were converted to percentages; the
# per-row neighbour total is in the trailing count column.
nearest_data

Unnamed: 0,Crop_1_0.8,Crop_2_0.8,Crop_3_0.8,Crop_4_0.8,Crop_5_0.8,Crop_6_0.8,Crop_7_0.8,Crop_8_0.8,Crop_9_0.8,Crop_10_0.8,field_id,count_0.8
0,7.844521,13.521268,3.386010,22.891429,3.086574,7.326487,10.785363,1.521086,0.411012,29.226250,3020,87832
1,7.860663,13.445388,3.353592,23.094863,3.076711,7.252904,10.789168,1.522271,0.340070,29.264370,7478,87041
2,7.850886,13.497493,3.380655,22.959900,3.081421,7.291250,10.788401,1.521295,0.378039,29.250660,15902,87557
3,7.871133,13.399118,3.340280,23.145920,3.075453,7.251667,10.794598,1.522182,0.335064,29.264586,38846,86849
4,7.862616,13.468774,3.356582,23.061304,3.074382,7.261506,10.791309,1.519983,0.347589,29.255954,42856,87172
...,...,...,...,...,...,...,...,...,...,...,...,...
122363,7.719409,13.588894,3.711416,21.285166,3.628333,7.612137,10.765105,1.532313,1.076931,29.080297,95767,95085
122364,7.719981,13.589649,3.712197,21.288592,3.623836,7.612686,10.767370,1.532636,1.075054,29.077999,101421,95065
122365,7.744603,13.630501,3.690977,21.381447,3.535556,7.634645,10.746231,1.533061,1.035081,29.067899,105889,94582
122366,7.737548,13.617790,3.704056,21.353226,3.559399,7.630904,10.746830,1.536317,1.052720,29.061210,115157,94707


In [None]:
# Persist the per-field neighbour features for this radius (row index omitted).
nearest_data.to_csv(
    f'full_nearest_radius_{radius}.csv',
    index=False,
)

In [None]:
# train_nearest = nearest_data[:train_df_coordinates.shape[0]]
# test_nearest  = nearest_data[train_df_coordinates.shape[0]:]

# # train_nearest.to_csv('nearest_data_train.csv',index=False)
# # test_nearest.to_csv('nearest_data_test.csv',index=False)
# train_nearest['label'] = train_df_coordinates['label'].values.tolist()
# train_nearest.shape,test_nearest.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


((87073, 13), (35295, 12))