In [None]:
!pip install scikit-learn==1.0
!pip install xgboost==1.4.2
!pip install catboost==0.26.1
!pip install pandas==1.3.3
!pip install radiant-mlhub==0.3.0
!pip install rasterio==1.2.8
!pip install numpy==1.21.2
!pip install pathlib==1.0.1
!pip install tqdm==4.62.3
!pip install joblib==1.0.1
!pip install matplotlib==3.4.3
!pip install Pillow==8.3.2
!pip install torch==1.9.1
!pip install plotly==5.3.1


In [None]:
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings('RuntimeWarning')

from radiant_mlhub import Collection
import tarfile
import os
from pathlib import Path
import json
from tqdm import tqdm
from joblib import Parallel,delayed
import datetime
import rasterio
import numpy as np
import pandas as pd

import gc

gc.collect()

0

In [None]:
# Load the asset index (later cells read its 'datetime', 'tile_id', 'asset' and
# 'file_path' columns). NOTE(review): despite the `train` in the variable name,
# the file read here is the *test* Sentinel-2 index.
competition_train_df = pd.read_csv('test_data_sentinel2.csv')

In [None]:
def get_date_format(month,day):
    '''
        Build the zero-padded date label 'month_MM_day_DD'.

        Parameters
        ----------
        month, day : number or missing value
            Calendar month and day. Floats are truncated with int(), so 3.0 -> '03'.

        Returns
        -------
        str
            'month_MM_day_DD' with both parts padded to two digits, or the
            string 'nan' when either part is missing (NaN, None, NaT, pd.NA).
    '''
    for part in (month, day):
        # pd.isna covers NaN/None/NaT/pd.NA; the str() comparison keeps the
        # original handling of a literal 'nan' string.
        if pd.isna(part) or str(part) == 'nan':
            return 'nan'
    return f'month_{int(month):02d}_day_{int(day):02d}'
    

In [None]:
# Derive calendar month/day from the acquisition timestamp. The column is parsed
# once and reused instead of being re-parsed for every derived column.
acquisition_times = pd.to_datetime(competition_train_df['datetime'])
competition_train_df['month'] = acquisition_times.dt.month.values
competition_train_df['day']   = acquisition_times.dt.day.values

# Label every row via get_date_format. Iterating the two columns directly avoids
# building a full row Series per record, which is what apply(axis=1) does.
competition_train_df['dates'] = [
    get_date_format(month, day)
    for month, day in zip(competition_train_df['month'], competition_train_df['day'])
]

# Distinct date labels in order of first appearance, without the 'nan' placeholder.
unique_dates = competition_train_df['dates'].unique()
unique_dates = np.array([z for z in unique_dates if 'nan' not in z])

print(f'Length of unique dates {len(unique_dates)}')

Length of unique dates 76


In [None]:
# Map every distinct raw timestamp to its date label. Both values of a pair are
# taken from the same row, so the mapping stays correct even when several
# timestamps fall on one calendar day; zipping two independently de-duplicated
# sequences would silently shift every later pair in that case.
timestamp_labels = (
    competition_train_df[['datetime', 'dates']]
    .dropna(subset=['datetime'])
    .drop_duplicates(subset='datetime')
)
date_dict = dict(zip(timestamp_labels['datetime'], timestamp_labels['dates']))
date_dict = dict(sorted(date_dict.items(), key=lambda item: item[1]))

# Column order for the per-date matrices: every date label once, ascending.
# Identical to list(date_dict.values()) whenever timestamps and labels are 1:1.
date_order_to_consider = np.array(sorted(set(date_dict.values())))
tile_ids_train = competition_train_df['tile_id'].unique()

In [None]:
def get_bands(tile_date_times,tile_df,band,date_dict,max_dates=1):
    '''
        Build the pixel-by-date matrix of one band for a single tile.

        Parameters
        ----------
        tile_date_times : sequence
            Timestamps to read; each must occur in tile_df['datetime'] and be a
            key of date_dict.
        tile_df : pd.DataFrame
            Asset rows of the tile with 'datetime', 'asset' and 'file_path' columns.
        band : str
            Value of the 'asset' column selecting the raster to read.
        date_dict : dict
            Maps a timestamp to its date label; the label is looked up in the
            module-level date_order_to_consider to find the target column.
        max_dates : int or None, optional
            How many leading entries of tile_date_times are read. The default of 1
            reproduces the previous behaviour, which only ever read the first one.
            NOTE(review): that limit looks like a debugging leftover -- pass None
            to fill the column of every timestamp.

        Returns
        -------
        np.ndarray
            Float array of shape (256 * 256, len(date_order_to_consider)); columns
            of dates that were not read stay NaN.
    '''
    # One column per known date label (76 for the data this was written against).
    # Rasters are expected to hold 256 * 256 pixels.
    X_tile = np.full((256 * 256, len(date_order_to_consider)), np.nan)

    selected_times = tile_date_times if max_dates is None else tile_date_times[:max_dates]

    for date_time in selected_times:
        asset_rows = tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']==band)]

        # Close the dataset as soon as the band has been read.
        with rasterio.open(asset_rows['file_path'].values[0]) as source:
            pixels = source.read(1).ravel()

        # Position of this date's label among the ordered labels.
        column = np.where(date_order_to_consider==date_dict[date_time])[0][0]

        X_tile[:,column] = pixels

    return X_tile

In [None]:
def get_dataframe(data_dict_band,band,y,field_ids):
    '''
        Aggregate per-pixel band values into per-field statistics.

        Parameters
        ----------
        data_dict_band : dict
            Values are arrays whose elements reshape to
            (-1, len(date_order_to_consider)); they are stacked in dict order.
        band : str
            Prefix of the generated column names ('<band>_<date label>').
        y : any
            Unused; kept so existing callers keep working.
        field_ids : array-like
            Field id of every stacked row, in the same order.

        Returns
        -------
        tuple of pd.DataFrame
            (mean_df, low_df, up_df, med_df): per-field mean, 25% quantile,
            75% quantile and median, each with 'field_id' as a regular column.
    '''
    n_dates = len(date_order_to_consider)

    # Stack the arrays directly; converting each one to nested Python lists first
    # only costs time and memory.
    tiles = [np.asarray(values).reshape(-1, n_dates) for values in tqdm(data_dict_band.values())]
    X = np.vstack(tiles) if tiles else np.empty((0, n_dates))

    colnames          = [band+'_'+z for z in date_order_to_consider]
    data              = pd.DataFrame(X,columns=colnames)

    data['field_id']  = field_ids

    # Group once and reuse the grouping for all four statistics.
    per_field         = data.groupby('field_id')
    mean_df           = per_field.mean().reset_index()
    low_df            = per_field.quantile(0.25).reset_index()
    up_df             = per_field.quantile(0.75).reset_index()
    med_df            = per_field.median().reset_index()

    return mean_df,low_df,up_df,med_df

In [None]:
# Number of distinct tile ids found in the asset index.
len(tile_ids_train)

1137

In [None]:
import xarray as xr
from rasterio.warp import transform

def convert_lat_lon(filename,crs):
    '''
        Compute the longitude/latitude of every pixel of a raster.

        Parameters
        ----------
        filename : str or path-like
            Raster file opened with xr.open_rasterio; its 'x' and 'y'
            coordinates are the positions that get converted.
        crs : rasterio CRS or dict
            CRS those x/y coordinates are expressed in (source CRS of the
            reprojection to EPSG:4326).

        Returns
        -------
        lon, lat : np.ndarray
            Arrays of shape (ny, nx); lon[i, j] and lat[i, j] belong to the pixel
            at row i, column j, matching the layout of dataset.read(1).

        Notes
        -----
        Earlier versions built the grid as meshgrid(y, x) and reshaped it to
        (ny, nx), so lon[i, j] actually described the pixel at row j, column i.
        Coordinates cached from that version are transposed within each tile
        and should be regenerated.
    '''
    # Only the coordinate vectors are needed, so release the file right away.
    with xr.open_rasterio(filename) as da:
        xs = np.asarray(da['x'].values)
        ys = np.asarray(da['y'].values)

    ny, nx = len(ys), len(xs)

    # meshgrid(xs, ys) yields (ny, nx) grids with x varying along columns and
    # y along rows, i.e. the same orientation as the raster itself.
    x_grid, y_grid = np.meshgrid(xs, ys)

    lon,lat = transform(crs, {'init': 'EPSG:4326'},
                     x_grid.ravel(), y_grid.ravel())
    lon = np.asarray(lon).reshape((ny, nx))
    lat = np.asarray(lat).reshape((ny, nx))

    return lon,lat

In [None]:
# Collect one (field_id, long, lat) row per field and tile: the median longitude
# and latitude of the pixels that carry the field's id.
lb = 0
ub = 200

bands_available = ['B01']

# Tiles are processed in slices of (ub - lb); the number of batches follows from
# the number of tiles so the progress message always shows the real total.
batch_size = ub - lb
n_batches  = -(-len(tile_ids_train) // batch_size)   # ceiling division

per_tile_frames = []   # one small DataFrame per tile, concatenated once at the end
n_fields        = 0

# NOTE(review): nothing below depends on `band`; with more than one entry in
# bands_available every tile would simply be collected again.
for band in bands_available:

    for batch in tqdm(range(n_batches)):
        print(f'Performing operations for batch {batch+1}/{n_batches} for band {band}')

        for tile_id in tile_ids_train[lb+(batch*batch_size):ub+(batch*batch_size)]:
            # avoid using this specific tile for the Hackathon as it might have a missing file
            # NOTE(review): this only matches when tile ids are strings; if the
            # CSV column was parsed as integers the tile is not skipped -- confirm.
            if tile_id == '1951':
                continue

            tile_df       = competition_train_df[competition_train_df['tile_id']==tile_id]
            field_id_path = tile_df[tile_df['asset']=='field_ids']['file_path'].values[0]

            # Read the field-id raster and its CRS, then close the file.
            with rasterio.open(field_id_path) as field_id_src:
                field_id_array = field_id_src.read(1)
                tile_crs       = field_id_src.crs

            lon,lat = convert_lat_lon(field_id_path,tile_crs)

            # Coordinates are kept for pixels with a positive field id and set to
            # 0 elsewhere (vectorised form of the former per-pixel loop).
            in_field = field_id_array > 0
            pixel_df = pd.DataFrame({
                'field_id': field_id_array.ravel(),
                'long':     np.where(in_field, lon, 0.0).ravel(),
                'lat':      np.where(in_field, lat, 0.0).ravel(),
            })
            pixel_df = pixel_df[pixel_df['field_id']!=0]

            gdf = pixel_df.groupby(['field_id']).agg({'long':'median','lat':'median'}).reset_index()

            per_tile_frames.append(gdf)
            n_fields += len(gdf)

        print(f'Rows collected so far: {n_fields}')
        gc.collect()

# Concatenate once instead of appending inside the loop (quadratic copying).
# A field present in several tiles still yields one row per tile, as before.
if per_tile_frames:
    bigdf = pd.concat(per_tile_frames, ignore_index=True)
else:
    bigdf = pd.DataFrame(columns=['field_id','long','lat'])

print(bigdf.shape)

  0%|          | 0/6 [00:00<?, ?it/s]

Performing operations for batch 1/14 for band B01
(20, 3)
(72, 3)
(76, 3)
(77, 3)
(90, 3)
(94, 3)
(116, 3)
(131, 3)
(189, 3)
(196, 3)
(198, 3)
(229, 3)
(241, 3)
(264, 3)
(580, 3)
(583, 3)
(621, 3)
(636, 3)
(677, 3)
(680, 3)
(684, 3)
(695, 3)
(702, 3)
(735, 3)
(775, 3)
(790, 3)
(794, 3)
(834, 3)
(835, 3)
(837, 3)
(856, 3)
(1057, 3)
(1072, 3)
(1096, 3)
(1131, 3)
(1166, 3)
(1211, 3)
(1213, 3)
(1214, 3)
(1256, 3)
(1260, 3)
(1262, 3)
(1264, 3)
(1274, 3)
(1297, 3)
(1404, 3)
(1440, 3)
(1489, 3)
(1500, 3)
(1502, 3)
(1518, 3)
(1521, 3)
(1572, 3)
(1589, 3)
(1611, 3)
(1621, 3)
(1665, 3)
(1690, 3)
(1705, 3)
(1726, 3)
(1761, 3)
(1762, 3)
(1766, 3)
(1881, 3)
(1903, 3)
(1935, 3)
(1936, 3)
(1962, 3)
(2028, 3)
(2043, 3)
(2073, 3)
(2076, 3)
(2079, 3)
(2085, 3)
(2118, 3)
(2159, 3)
(2190, 3)
(2203, 3)
(2211, 3)
(2235, 3)
(2253, 3)
(2257, 3)
(2291, 3)
(2311, 3)
(2353, 3)
(2358, 3)
(2384, 3)
(2464, 3)
(2478, 3)
(2588, 3)
(2622, 3)
(2648, 3)
(2654, 3)
(2679, 3)
(2681, 3)
(2706, 3)
(2834, 3)
(2855, 3)
(2858, 

 17%|█▋        | 1/6 [00:43<03:39, 43.89s/it]

(6204, 3)
Performing operations for batch 2/14 for band B01
(6449, 3)
(6478, 3)
(6483, 3)
(6540, 3)
(6543, 3)
(6549, 3)
(6573, 3)
(6574, 3)
(6604, 3)
(6637, 3)
(6649, 3)
(6653, 3)
(6655, 3)
(6676, 3)
(6858, 3)
(6859, 3)
(6869, 3)
(6902, 3)
(6919, 3)
(6924, 3)
(6930, 3)
(6955, 3)
(6986, 3)
(7010, 3)
(7018, 3)
(7100, 3)
(7104, 3)
(7133, 3)
(7144, 3)
(7170, 3)
(7213, 3)
(7252, 3)
(7260, 3)
(7264, 3)
(7279, 3)
(7329, 3)
(7342, 3)
(7581, 3)
(7583, 3)
(7620, 3)
(7653, 3)
(7680, 3)
(7686, 3)
(7694, 3)
(7696, 3)
(7710, 3)
(7748, 3)
(7772, 3)
(7778, 3)
(7817, 3)
(7831, 3)
(7874, 3)
(7913, 3)
(7983, 3)
(7988, 3)
(8095, 3)
(8124, 3)
(8158, 3)
(8173, 3)
(8174, 3)
(8177, 3)
(8305, 3)
(8307, 3)
(8311, 3)
(8317, 3)
(8330, 3)
(8339, 3)
(8446, 3)
(8513, 3)
(8547, 3)
(8558, 3)
(8561, 3)
(8576, 3)
(8637, 3)
(8638, 3)
(8653, 3)
(8667, 3)
(8763, 3)
(8796, 3)
(8798, 3)
(8802, 3)
(8806, 3)
(8825, 3)
(8826, 3)
(8843, 3)
(8858, 3)
(8909, 3)
(8942, 3)
(8974, 3)
(9001, 3)
(9023, 3)
(9027, 3)
(9067, 3)
(9081, 3)


 33%|███▎      | 2/6 [01:28<02:56, 44.12s/it]

(12357, 3)
Performing operations for batch 3/14 for band B01
(12370, 3)
(12402, 3)
(12403, 3)
(12414, 3)
(12447, 3)
(12456, 3)
(12458, 3)
(12468, 3)
(12505, 3)
(12534, 3)
(12561, 3)
(12614, 3)
(12627, 3)
(12640, 3)
(12645, 3)
(12751, 3)
(12784, 3)
(12834, 3)
(12863, 3)
(12915, 3)
(12918, 3)
(12959, 3)
(12997, 3)
(13015, 3)
(13172, 3)
(13224, 3)
(13243, 3)
(13394, 3)
(13405, 3)
(13449, 3)
(13453, 3)
(13513, 3)
(13690, 3)
(13692, 3)
(13701, 3)
(13740, 3)
(13759, 3)
(13783, 3)
(13785, 3)
(13819, 3)
(13839, 3)
(13845, 3)
(13865, 3)
(13902, 3)
(13944, 3)
(13984, 3)
(13989, 3)
(13998, 3)
(14043, 3)
(14046, 3)
(14067, 3)
(14071, 3)
(14101, 3)
(14114, 3)
(14136, 3)
(14190, 3)
(14205, 3)
(14206, 3)
(14222, 3)
(14239, 3)
(14264, 3)
(14276, 3)
(14281, 3)
(14301, 3)
(14302, 3)
(14393, 3)
(14401, 3)
(14419, 3)
(14438, 3)
(14485, 3)
(14509, 3)
(14511, 3)
(14560, 3)
(14591, 3)
(14602, 3)
(14615, 3)
(14620, 3)
(14639, 3)
(14676, 3)
(14692, 3)
(14707, 3)
(14742, 3)
(14820, 3)
(14822, 3)
(14866, 3)
(149

 50%|█████     | 3/6 [02:12<02:12, 44.26s/it]

(19069, 3)
Performing operations for batch 4/14 for band B01
(19073, 3)
(19081, 3)
(19110, 3)
(19111, 3)
(19119, 3)
(19161, 3)
(19258, 3)
(19259, 3)
(19279, 3)
(19426, 3)
(19446, 3)
(19465, 3)
(19467, 3)
(19499, 3)
(19532, 3)
(19565, 3)
(19582, 3)
(19583, 3)
(19603, 3)
(19614, 3)
(19658, 3)
(19673, 3)
(19693, 3)
(19718, 3)
(19736, 3)
(19760, 3)
(19795, 3)
(19840, 3)
(19881, 3)
(20165, 3)
(20167, 3)
(20239, 3)
(20262, 3)
(20263, 3)
(20282, 3)
(20310, 3)
(20347, 3)
(20356, 3)
(20388, 3)
(20399, 3)
(20403, 3)
(20436, 3)
(20451, 3)
(20455, 3)
(20485, 3)
(20591, 3)
(20593, 3)
(20594, 3)
(20607, 3)
(20646, 3)
(20688, 3)
(20745, 3)
(20771, 3)
(20790, 3)
(20796, 3)
(20867, 3)
(20880, 3)
(20923, 3)
(20978, 3)
(20984, 3)
(21022, 3)
(21060, 3)
(21318, 3)
(21321, 3)
(21326, 3)
(21334, 3)
(21348, 3)
(21396, 3)
(21440, 3)
(21755, 3)
(21885, 3)
(21911, 3)
(21913, 3)
(21957, 3)
(21987, 3)
(22023, 3)
(22045, 3)
(22073, 3)
(22120, 3)
(22146, 3)
(22179, 3)
(22282, 3)
(22315, 3)
(22351, 3)
(22405, 3)
(224

 67%|██████▋   | 4/6 [02:56<01:28, 44.00s/it]

Performing operations for batch 5/14 for band B01
(25261, 3)
(25385, 3)
(25473, 3)
(25481, 3)
(25593, 3)
(25696, 3)
(25743, 3)
(25809, 3)
(25831, 3)
(25860, 3)
(25873, 3)
(25911, 3)
(25930, 3)
(25948, 3)
(25964, 3)
(26007, 3)
(26060, 3)
(26062, 3)
(26078, 3)
(26086, 3)
(26090, 3)
(26092, 3)
(26094, 3)
(26145, 3)
(26173, 3)
(26206, 3)
(26219, 3)
(26305, 3)
(26307, 3)
(26326, 3)
(26376, 3)
(26414, 3)
(26427, 3)
(26463, 3)
(26473, 3)
(26479, 3)
(26566, 3)
(26603, 3)
(26607, 3)
(26612, 3)
(26642, 3)
(26723, 3)
(26752, 3)
(26763, 3)
(26794, 3)
(26982, 3)
(27016, 3)
(27038, 3)
(27059, 3)
(27062, 3)
(27104, 3)
(27105, 3)
(27159, 3)
(27165, 3)
(27193, 3)
(27215, 3)
(27223, 3)
(27233, 3)
(27267, 3)
(27278, 3)
(27285, 3)
(27308, 3)
(27313, 3)
(27321, 3)
(27345, 3)
(27364, 3)
(27365, 3)
(27391, 3)
(27424, 3)
(27426, 3)
(27510, 3)
(27560, 3)
(27589, 3)
(27643, 3)
(27656, 3)
(27657, 3)
(27714, 3)
(27729, 3)
(27749, 3)
(27827, 3)
(27851, 3)
(27855, 3)
(27907, 3)
(27929, 3)
(27953, 3)
(27957, 3)
(279

 83%|████████▎ | 5/6 [03:39<00:43, 43.90s/it]

(31256, 3)
Performing operations for batch 6/14 for band B01
(31267, 3)
(31321, 3)
(31324, 3)
(31341, 3)
(31393, 3)
(31406, 3)
(31445, 3)
(31512, 3)
(31518, 3)
(31543, 3)
(31567, 3)
(31589, 3)
(31590, 3)
(31691, 3)
(31693, 3)
(31711, 3)
(31719, 3)
(31732, 3)
(31763, 3)
(31856, 3)
(31858, 3)
(31897, 3)
(31951, 3)
(32001, 3)
(32005, 3)
(32015, 3)
(32016, 3)
(32037, 3)
(32061, 3)
(32239, 3)
(32283, 3)
(32307, 3)
(32341, 3)
(32345, 3)
(32366, 3)
(32397, 3)
(32422, 3)
(32457, 3)
(32480, 3)
(32622, 3)
(32624, 3)
(32627, 3)
(32645, 3)
(32656, 3)
(32658, 3)
(32712, 3)
(32720, 3)
(32743, 3)
(32761, 3)
(32766, 3)
(32781, 3)
(32834, 3)
(32846, 3)
(32853, 3)
(32879, 3)
(32893, 3)
(32894, 3)
(32909, 3)
(32949, 3)
(32950, 3)
(32965, 3)
(32977, 3)
(32991, 3)
(33001, 3)
(33006, 3)
(33012, 3)
(33029, 3)
(33033, 3)
(33221, 3)
(33248, 3)
(33265, 3)
(33295, 3)
(33479, 3)
(33483, 3)
(33484, 3)
(33492, 3)
(33499, 3)
(33507, 3)
(33643, 3)
(33706, 3)
(33719, 3)
(33773, 3)
(33847, 3)
(33885, 3)
(33905, 3)
(339

100%|██████████| 6/6 [04:10<00:00, 41.72s/it]

(35295, 3)





In [None]:
# Keep every row whose field_id is different from 0.
nonzero_field    = bigdf['field_id'].ne(0)
test_coordinates = bigdf.loc[nonzero_field]

In [None]:
# Write the per-field coordinates to disk; the DataFrame index is not written.
test_coordinates.to_csv('test_coordinates_lat_lon.csv',index=False)

In [None]:
# Display the resulting table (field_id, long, lat) for a visual check.
test_coordinates

Unnamed: 0,field_id,long,lat
0,1825,18.148975,-33.015222
1,3079,18.125699,-33.028011
2,3848,18.129977,-33.019151
3,28857,18.146981,-33.014430
4,33478,18.142351,-33.029189
...,...,...,...
31,95767,18.433037,-33.389965
32,101421,18.424278,-33.391237
33,105889,18.437014,-33.404925
34,115157,18.425845,-33.401630
