In [1]:
%%writefile requirements.txt

pandas==1.3.5
scikit-learn==1.0.2
tqdm==4.64.0
rasterio==1.2.0
numpy==1.21.6
radiant_mlhub==0.4.1
# Imported by the modelling cells. The versions used were not recorded in
# this notebook, so they are left unpinned -- pin them once known.
xgboost
lightgbm
imbalanced-learn

Writing requirements.txt


In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tqdm==4.64.0
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.6 MB/s 
[?25hCollecting rasterio==1.2.0
  Downloading rasterio-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 395 kB/s 
Collecting radiant_mlhub==0.4.1
  Downloading radiant_mlhub-0.4.1-py3-none-any.whl (36 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting affine
  Downloading affine-2.3.1-py2.py3-none-any.whl (16 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Collecting pystac~=1.1
  Downloading pystac-1.5.0-py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 57.8 MB/s 
[?25hCollecting requests~=2.25
  Downloading requests

In [3]:
import os
import json
import getpass
import rasterio
import numpy as np
import pandas as pd
from tqdm import tqdm
from radiant_mlhub import Dataset
import tarfile
import shutil
import numpy as np
import pandas as pd

In [4]:
# Local directory the dataset archives are downloaded to and extracted in.
data_dir = 'data'

In [5]:
# Dataset identifier and the three collection names derived from it
# (source imagery, training labels, test labels).
collection_name = 'ref_agrifieldnet_competition_v1'

source_collection, train_label_collection, test_label_collection = (
    f'{collection_name}_{suffix}'
    for suffix in ('source', 'labels_train', 'labels_test')
)

In [6]:
# Download and unpack the dataset only when the data directory is missing.
if not os.path.exists(data_dir):
  # Reuse a key that is already exported; only prompt when none is set.
  if not os.environ.get('MLHUB_API_KEY'):
    os.environ['MLHUB_API_KEY'] = getpass.getpass(prompt="MLHub API Key: ")

  dataset = Dataset.fetch(collection_name)
  dataset.download(output_dir=data_dir)

  # os.listdir returns a snapshot, so entries created by extraction are not
  # visited by this loop.
  for fn in os.listdir(data_dir):
    archive_path = os.path.join(data_dir, fn)
    # Skip anything that is not a tar archive (directories, stray files)
    # instead of letting tarfile.open raise part-way through the loop.
    if not (os.path.isfile(archive_path) and tarfile.is_tarfile(archive_path)):
      continue
    with tarfile.open(archive_path) as f:
      f.extractall(data_dir + '/')
    # The archive is no longer needed once its contents are unpacked.
    os.remove(archive_path)
else:
  print("Dataset already exists")

MLHub API Key: ··········


  0%|          | 0/0.5 [00:00<?, ?M/s]

  0%|          | 0/324.3 [00:00<?, ?M/s]



  0%|          | 0/1.3 [00:00<?, ?M/s]

## Prepare Training data


### Create training dataframe

In [7]:
# Entries of the training-label collection directory. Only names containing
# 'labels_train' yield a tile id (the text after the last underscore).
train_paths = os.listdir(os.path.join(data_dir, train_label_collection))
train_ids = [name.rsplit('_', 1)[-1] for name in train_paths if 'labels_train' in name]

# Per-tile file locations, in the same order as train_ids.
field_paths = [
    '/'.join((data_dir, train_label_collection, f'{train_label_collection}_{tile}', 'field_ids.tif'))
    for tile in train_ids
]
label_paths = [
    '/'.join((data_dir, train_label_collection, f'{train_label_collection}_{tile}', 'raster_labels.tif'))
    for tile in train_ids
]
# The trailing empty component keeps the closing '/' of the directory path.
source_paths = [
    '/'.join((data_dir, source_collection, f'{source_collection}_{tile}', ''))
    for tile in train_ids
]

In [8]:
# One row per training tile: its id plus the paths derived from it.
train_data = pd.DataFrame(
    np.column_stack((train_ids, field_paths, label_paths, source_paths)),
    columns=['folder_id', 'field_path', 'label_path', 'source_path'],
)
train_data.head()

Unnamed: 0,folder_id,field_path,label_path,source_path
0,67f7d,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
1,9d575,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
2,b2a94,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
3,65812,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
4,260a3,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...



## Extract field-crop data

In [9]:
def extract_field_crop_data(data):
  """Collect one (field_id, crop_type) row for every field in every tile.

  Args:
    data: DataFrame with 'field_path' and 'label_path' columns holding, per
      tile, the path of the field-id raster and of the crop-label raster.

  Returns:
    DataFrame with columns 'field_id' and 'crop_type'. A field that occurs in
    several tiles produces one row per tile.
  """
  field_ids = []
  crop_type = []

  for i in tqdm(range(len(data))):
      with rasterio.open(data['field_path'].iloc[i]) as src:
          field_data = src.read()[0]
      with rasterio.open(data['label_path'].iloc[i]) as src:
          crop_data = src.read()[0]

      tile_field_ids = np.unique(field_data)
      # Field id 0 marks pixels outside any field (feature_extractor drops the
      # same value). Filter it by value: slicing off the first unique entry
      # would discard a real field in a tile that has no background pixels.
      for field_id in tile_field_ids[tile_field_ids != 0]:
          field_ids.append(field_id)
          # Largest label value found inside the field.
          crop_type.append(crop_data[field_data == field_id].max())

  # Both lists go through one array, so the two columns share a dtype.
  df = pd.DataFrame(np.array([field_ids, crop_type]).transpose(),
                    columns=['field_id', 'crop_type'])
  return df

In [10]:
# Field-id / crop-type table built from the training label rasters.
df = extract_field_crop_data(train_data)
df.head()

100%|██████████| 1165/1165 [00:09<00:00, 118.86it/s]


Unnamed: 0,field_id,crop_type
0,762,2
1,764,1
2,471,6
3,472,4
4,473,2


In [11]:
# Show the training tile index (first and last rows).
train_data

Unnamed: 0,folder_id,field_path,label_path,source_path
0,67f7d,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
1,9d575,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
2,b2a94,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
3,65812,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
4,260a3,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
...,...,...,...,...
1160,1fc67,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
1161,a7019,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
1162,946e2,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...
1163,8c459,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_labels_tr...,data/ref_agrifieldnet_competition_v1_source/re...


In [12]:
# Band files read for every tile (one '<band>.tif' each); this is also the
# order of the feature columns.
selected_bands = [
    'B01', 'B02', 'B03', 'B04',
    'B05', 'B06', 'B07', 'B08',
    'B8A', 'B09', 'B11', 'B12',
]
img_sh = 256  # presumably the tile side length in pixels -- TODO confirm
n_obs = 1
n_selected_bands = len(selected_bands)

def feature_extractor(data_, path):
    '''
    Build a per-pixel table of band values for every tile listed in data_.

    data_: DataFrame with 'folder_id' and 'field_path' columns, one row per tile
    path: name of the source collection directory under data_dir; band files
          are read from '{data_dir}/{path}/{path}_{folder_id}/{band}.tif'

    returns: DataFrame with one column per entry of selected_bands plus a
             'field_id' column (float), restricted to pixels whose field id is
             non-zero. The index keeps each pixel's position in the full,
             unfiltered table. An empty data_ yields an empty DataFrame with
             the same columns.
    '''
    tile_pixels = []
    tile_field_ids = []

    tiles = zip(data_['folder_id'], data_['field_path'].values)
    for tile_id, field_path in tqdm(tiles, total=len(data_)):
        # Context managers close every raster handle; 13 files are opened per
        # tile, so leaving them open exhausts file descriptors on large runs.
        with rasterio.open(field_path) as field_src:
            tile_field_ids.append(field_src.read(1).flatten())

        band_columns = []
        for band in selected_bands:
            with rasterio.open(f'{data_dir}/{path}/{path}_{tile_id}/{band}.tif') as band_src:
                band_columns.append(band_src.read(1).flatten())

        # One row per pixel, one column per band.
        tile_pixels.append(np.column_stack(band_columns))

    if not tile_pixels:
        # np.concatenate rejects an empty list; return an empty table instead.
        return pd.DataFrame(columns=list(selected_bands) + ['field_id'])

    # Concatenate once at the end rather than growing an array inside the loop.
    data = pd.DataFrame(np.concatenate(tile_pixels), columns=selected_bands)

    # Kept as float64 because callers already receive float field ids.
    data['field_id'] = np.concatenate(tile_field_ids).astype(np.float64)

    # .copy() returns an independent frame, so callers can add columns without
    # triggering SettingWithCopyWarning.
    return data[data['field_id'] != 0].copy()

In [13]:
# Per-pixel band values of the training tiles; pixels with field_id 0 are excluded.
train = feature_extractor(train_data, source_collection)
train.head()

1165it [02:31,  7.71it/s]


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id
61294,42,36,35,33,38,56,64,59,71,12,63,41,762.0
61549,43,39,39,39,43,58,66,63,74,12,74,52,762.0
61550,43,37,36,34,40,57,66,63,71,12,67,44,762.0
61551,43,39,38,37,40,57,66,63,71,12,67,44,762.0
61805,43,39,38,38,43,58,66,63,74,12,74,52,762.0


In [14]:
# Start the label column as a copy of the field id; it is turned into the
# crop type by the following cells.
train['labels']=train.field_id

In [15]:
# Replace each pixel's field id (currently stored in 'labels') with the mean
# crop type recorded for that field in `df`. A grouped lookup table is built
# once and applied with map; filtering `df` separately for every pixel row
# gives the same values but scales with len(train) * len(df).
crop_by_field = df.groupby('field_id')['crop_type'].mean()
train['labels'] = train['labels'].astype(int).map(crop_by_field)

In [16]:
# Cast the label values to Python ints (int() truncates toward zero).
# NOTE(review): a field with differing labels across tiles would have a
# fractional mean here and be truncated -- confirm this cannot happen.
train['labels'] = train['labels'].apply(int)

In [20]:
# Discard the leftover pixel positions and renumber the rows from 0.
train=train.reset_index(drop=True)

In [21]:
# Preview the labelled training pixels.
train.head()

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id,labels
0,42,36,35,33,38,56,64,59,71,12,63,41,762.0,2
1,43,39,39,39,43,58,66,63,74,12,74,52,762.0,2
2,43,37,36,34,40,57,66,63,71,12,67,44,762.0,2
3,43,39,38,37,40,57,66,63,71,12,67,44,762.0,2
4,43,39,38,38,43,58,66,63,74,12,74,52,762.0,2


**Prepare Test Data**

In [22]:
# Entries of the test-label collection directory. Only names containing
# 'labels_test' yield a tile id (the text after the last underscore).
test_paths = os.listdir(os.path.join(data_dir, test_label_collection))
test_ids = [name.rsplit('_', 1)[-1] for name in test_paths if 'labels_test' in name]

In [23]:
# Per-tile file locations for the test tiles, in the same order as test_ids.
# These rebind the names used earlier for the training tiles.
field_paths = [
    '/'.join((data_dir, test_label_collection, f'{test_label_collection}_{tile}', 'field_ids.tif'))
    for tile in test_ids
]
label_paths = [
    '/'.join((data_dir, test_label_collection, f'{test_label_collection}_{tile}', 'raster_labels.tif'))
    for tile in test_ids
]
# The trailing empty component keeps the closing '/' of the directory path.
source_paths = [
    '/'.join((data_dir, source_collection, f'{source_collection}_{tile}', ''))
    for tile in test_ids
]

In [24]:
# One row per test tile: its id plus the paths derived from it.
test_data = pd.DataFrame(
    np.column_stack((test_ids, field_paths, label_paths, source_paths)),
    columns=['folder_id', 'field_path', 'label_path', 'source_path'],
)
test_data.head()

Unnamed: 0,folder_id,field_path,label_path,source_path
0,1d6e9,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_source/re...
1,2f048,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_source/re...
2,324e6,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_source/re...
3,f8a2a,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_source/re...
4,9c09a,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_labels_te...,data/ref_agrifieldnet_competition_v1_source/re...


In [25]:
# Per-pixel band values of the test tiles; pixels with field_id 0 are excluded.
test = feature_extractor(test_data,  source_collection)
test.head()

707it [01:07, 10.53it/s]


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id
3115,44,41,40,43,43,57,66,63,73,12,73,50,469.0
3370,44,38,37,38,43,57,66,60,73,12,73,50,469.0
3371,44,38,37,37,43,57,66,62,73,12,73,50,469.0
3372,44,39,39,40,46,59,67,63,74,12,76,54,469.0
3373,44,41,41,43,46,59,67,64,74,12,76,54,469.0


In [26]:
# Renumber the rows from 0 so positional and label-based indexing agree.
test=test.reset_index(drop=True)
test.head()

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id
0,44,41,40,43,43,57,66,63,73,12,73,50,469.0
1,44,38,37,38,43,57,66,60,73,12,73,50,469.0
2,44,38,37,37,43,57,66,62,73,12,73,50,469.0
3,44,39,39,40,46,59,67,63,74,12,76,54,469.0
4,44,41,41,43,46,59,67,64,74,12,76,54,469.0


In [28]:
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE,RandomOverSampler

In [29]:
# Crop names used as column labels for predict_proba output.
# NOTE(review): the order must match the encoded class order of the label
# encoder -- confirm against its classes_.
crop = [
    'Wheat',
    'Mustard',
    'Lentil',
    'No Crop',
    'Green pea',
    'Sugarcane',
    'Garlic',
    'Maize',
    'Gram',
    'Coriander',
    'Potato',
    'Bersem',
    'Rice',
]

In [30]:
def model_evaluation(x_proba,y):
    """Mean negative log-probability assigned to the true class of each sample.

    Args:
        x_proba: array-like of shape (n_samples, n_classes); row i holds the
            predicted class probabilities of sample i.
        y: array-like of n_samples integer class indices (column positions in
            x_proba). Values are read positionally, so a pandas Series with
            any index is accepted.

    Returns:
        float: -(1/n) * sum_i log(x_proba[i, y[i]]). A probability of exactly
        0 contributes -32 in place of log(0) = -inf, keeping the score finite.

    Raises:
        ValueError: if there are no samples, or x_proba and y differ in length.
    """
    pred = np.asarray(x_proba)
    truth = np.asarray(y).ravel()

    n_samples = len(pred)
    if n_samples == 0:
        raise ValueError("model_evaluation needs at least one sample")
    if len(truth) != n_samples:
        raise ValueError(
            f"x_proba has {n_samples} rows but y has {len(truth)} labels")

    # Probability given to the true class of every sample, picked in one step.
    true_class_proba = pred[np.arange(n_samples), truth]

    # log(0) is expected and replaced below, so silence only that warning.
    with np.errstate(divide='ignore'):
        log_proba = np.log(true_class_proba)
    log_proba[np.isneginf(log_proba)] = -32

    return float(-log_proba.sum() / n_samples)


**MODEL 1**

In [31]:
# Model 1 works on individual pixels, so the field id is not a feature.
train1 = train.drop(columns='field_id')

In [32]:
# Row/column count before duplicate rows are removed.
train1.shape

(188228, 13)

In [33]:
# Remove rows that are identical in every column of train1.
train1=train1.drop_duplicates()

In [34]:
# Row/column count after duplicate rows were removed.
train1.shape

(173666, 13)

In [35]:
# Encoder that maps the crop-type codes to consecutive class indices.
le=LabelEncoder()

In [36]:
# Split the pixel table into band features and the crop-type target.
X_train = train1.drop(columns='labels')
y_train = train1['labels']

In [37]:
# Oversample the training set with SMOTE; the fixed random_state keeps the
# resampled data reproducible. X_train/y_train are rebound to the result.
smote=SMOTE(random_state=42)
X_train,y_train=smote.fit_resample(X_train,y_train)

In [38]:
# Encode the resampled crop-type codes as class indices for the classifiers.
y_train=le.fit_transform(y_train)

In [39]:
# Two gradient-boosting classifiers with library defaults and a fixed seed.
model1_xg=XGBClassifier(random_state=0)
model1_lg=LGBMClassifier(random_state=0)

In [42]:
# Fit both classifiers on the same resampled pixel data.
model1_xg.fit(X_train,y_train)
model1_lg.fit(X_train,y_train)

In [None]:
# XGBoost: per-pixel class probabilities on the test pixels, averaged per
# field. test's index is 0..n-1, so assign() lines field ids up with the
# prediction rows.
sv1 = (
    pd.DataFrame(model1_xg.predict_proba(test.drop(columns='field_id')), columns=crop)
    .assign(field_id=test.field_id)
    .groupby('field_id')
    .mean()
    .reset_index()
)

In [None]:
# Number of fields per most-probable crop.
sv1.drop('field_id',axis=1).idxmax(axis=1).value_counts() #xgb

In [None]:
# LightGBM: per-pixel class probabilities on the test pixels, averaged per
# field. test's index is 0..n-1, so assign() lines field ids up with the
# prediction rows.
sv2 = (
    pd.DataFrame(model1_lg.predict_proba(test.drop(columns='field_id')), columns=crop)
    .assign(field_id=test.field_id)
    .groupby('field_id')
    .mean()
    .reset_index()
)

In [None]:
# Number of fields per most-probable crop.
sv2.drop('field_id',axis=1).idxmax(axis=1).value_counts() #LgB

**MODEL 2**

In [None]:
# Model 2 augments the pixel rows with one summary row per (field, label)
# for each statistic: mean, median, min and max of every band.
field_label_groups = train.groupby(['field_id', 'labels'])
extra_mean = field_label_groups.mean().reset_index()
extra_median = field_label_groups.median().reset_index()
extra_min = field_label_groups.min().reset_index()
extra_max = field_label_groups.max().reset_index()

# Stack pixels and summaries into one frame with a fresh 0..n-1 index.
train2 = pd.concat(
    [train, extra_mean, extra_median, extra_min, extra_max],
    ignore_index=True,
)

In [None]:
# Band features and crop-type target for model 2 (field id is not a feature).
X_train = train2.drop(columns=['labels', 'field_id'])
y_train = train2['labels']

In [None]:
# Oversample the augmented training set with SMOTE (fixed seed).
smote2=SMOTE(random_state=42)
X_train,y_train=smote2.fit_resample(X_train,y_train)

In [None]:
# Separate encoder for model 2; maps crop-type codes to class indices.
le2=LabelEncoder()
y_train=le2.fit_transform(y_train)

In [None]:
# LightGBM classifier for the augmented data, library defaults and a fixed seed.
model2_lg=LGBMClassifier(random_state=0)

In [None]:
# Fit on the resampled pixel rows plus per-field summary rows.
model2_lg.fit(X_train,y_train)

In [None]:
# Same augmentation for the test pixels: one summary row per field for each
# statistic. These rebind the extra_* names used for the training data.
field_groups = test.groupby(['field_id'])
extra_mean = field_groups.mean().reset_index()
extra_median = field_groups.median().reset_index()
extra_min = field_groups.min().reset_index()
extra_max = field_groups.max().reset_index()

# Stack pixels and summaries into one frame with a fresh 0..n-1 index.
test2 = pd.concat(
    [test, extra_mean, extra_median, extra_min, extra_max],
    ignore_index=True,
)

In [None]:
# Model 2: class probabilities for every row of the augmented test frame,
# averaged per field.
sv3 = pd.DataFrame(model2_lg.predict_proba(test2.drop('field_id', axis=1)), columns=crop)
# The field ids must come from test2, the frame the predictions were made on.
# test is shorter, so taking ids from it leaves the summary rows with NaN ids
# and groupby silently drops their predictions.
sv3['field_id'] = test2['field_id'].to_numpy()
sv3 = sv3.groupby('field_id').mean().reset_index()

In [None]:
# Number of fields per most-probable crop (model 2).
sv3.drop('field_id',axis=1).idxmax(axis=1).value_counts()

In [None]:
# Keep only the probability columns of each model so the frames can be
# combined arithmetically.
df1=sv1.drop('field_id',axis=1)
df2=sv2.drop('field_id',axis=1)
df3=sv3.drop('field_id',axis=1)

In [None]:
# Weighted blend of the three models' per-field probabilities (weights sum to 1).
# NOTE: this rebinds `df`, which earlier held the field/crop table returned by
# extract_field_crop_data; cells that need that table must run before this one.
df=df1*0.3+df2*0.4+df3*0.3  # weighted ensemble to improve accuracy

In [None]:
# Put the field ids (as integers) back as the first column.
# NOTE: DataFrame.insert raises ValueError if this cell is run twice, because
# the 'field_id' column then already exists.
df.insert(0,'field_id',sv1.field_id.astype(int))

In [None]:
# Show the blended per-field class probabilities.
df

In [None]:
# df.to_csv('submit.csv',index=False)