In [None]:
# Pinned third-party dependencies for this notebook (one `pip install` per package).
# NOTE(review): `lightgbm` is imported further down (`from lightgbm import LGBMClassifier`)
# but has no install line in this cell — confirm the environment already provides it.
!pip install scikit-learn==1.0
!pip install xgboost==1.4.2
!pip install catboost==0.26.1
!pip install pandas==1.3.3
!pip install radiant-mlhub==0.3.0
!pip install rasterio==1.2.8
!pip install numpy==1.21.2
# NOTE(review): `pathlib` is part of the Python 3 standard library — confirm this PyPI pin is really needed.
!pip install pathlib==1.0.1
!pip install tqdm==4.62.3
!pip install joblib==1.0.1
!pip install matplotlib==3.4.3
!pip install Pillow==8.3.2
!pip install torch==1.9.1
!pip install plotly==5.3.1


In [None]:
import pandas as pd
import numpy as np
import random
import torch
def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's CPU
    RNG. When CUDA is available, the GPU generators are seeded as well and
    cuDNN is switched to deterministic mode with benchmarking disabled.

    Parameters
    ----------
    seed_value : int
        Seed passed unchanged to every generator.
    """
    # CPU-side generators: Python, NumPy, PyTorch (in that order).
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)      # current device
    torch.cuda.manual_seed_all(seed_value)  # every visible device
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_all(13)

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pandas as pd
import numpy as np
from sklearn.metrics import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from indices_creation import *
import re
from tqdm import tqdm
from sklearn.linear_model import LinearRegression

In [None]:
import os
# Display the kernel's current working directory as the cell output.
os.getcwd()

'/root/sentinel2_updated'

In [None]:
# Root of the prepared Sentinel-2 data; change it here when the data lives elsewhere.
DATA_ROOT = '/root/sentinel2_updated'
TRAIN_DIR = f'{DATA_ROOT}/Train_data_prep/merged_train_data'
TEST_DIR  = f'{DATA_ROOT}/Test_data_prep/merged_test_data'

# Mean statistics.
train_df_mean = pd.read_csv(f'{TRAIN_DIR}/train_mean.csv')
#### we need to drop 'label' and 'field_id' later in the code

test_df_mean  = pd.read_csv(f'{TEST_DIR}/test_mean.csv')
#### we need to drop 'field_id' later in the code

# Median statistics.
train_df_median = pd.read_csv(f'{TRAIN_DIR}/train_median.csv')
#### we need to drop 'field_id' later in the code

test_df_median  = pd.read_csv(f'{TEST_DIR}/test_median.csv')
#### we need to drop 'field_id' later in the code

# Field sizes; the key column is spelled 'Field_id' in these files, so rename
# it to match the 'field_id' key used for the merge below.
train_size = pd.read_csv(f'{TRAIN_DIR}/size_of_field_train.csv').rename({'Field_id': 'field_id'}, axis=1)
test_size  = pd.read_csv(f'{TEST_DIR}/size_of_field_test.csv').rename({'Field_id': 'field_id'}, axis=1)

# Attach the field size to the MEDIAN frames. The merge previously started
# from the mean frames, which silently discarded the median CSVs loaded above
# and made every "median" feature a copy of the mean statistics.
train_df_median = train_df_median.merge(train_size, on='field_id', how='left')
test_df_median  = test_df_median.merge(test_size, on='field_id', how='left')

gc.collect()

0

In [None]:
print(f'The shape of train data before outlier removal - {train_df_mean.shape}')

# Keep only rows whose label is one of the integer classes 1..9.
valid_labels  = list(range(1,10))
has_valid_label = train_df_mean['label'].isin(valid_labels)
train_df_mean = train_df_mean.loc[has_valid_label]

print(f'The shape of train data after outlier removal - {train_df_mean.shape}')

The shape of train data before outlier removal - (87113, 914)
The shape of train data after outlier removal - (87073, 914)


In [None]:
# Restrict the median frame to the fields that survived the label filter
# applied to the mean frame.
relevant_fids     = train_df_mean['field_id'].tolist()
is_relevant_field = train_df_median['field_id'].isin(relevant_fids)
train_df_median   = train_df_median.loc[is_relevant_field]

print(f'The shape of median train data - {train_df_median.shape} and mean train data {train_df_mean.shape}' )

# Column counts differ between the two frames: train_df_mean carries 'label',
# train_df_median carries 'size_of_field' from the field-size merge.

The shape of median train data - (87073, 915) and mean train data (87073, 914)


In [None]:
# Remember the field ids (in current row order) so they can be re-attached
# after 'field_id' is dropped by the column selection further down.
fids_train, fids_test = (
    frame['field_id'].tolist() for frame in (train_df_median, test_df_median)
)
len(fids_train),len(fids_test)

(87073, 35295)

In [None]:
# Column-name prefixes of the Sentinel-2 bands to keep.
cols = [
    'B01_', 'B02_', 'B03_', 'B04_', 'B05_', 'B06_',
    'B07_', 'B08_', 'B09_', 'B8A_', 'B11_', 'B12_',
]

columns_available = list(train_df_mean.columns)

# Every column containing one of the band prefixes, grouped band by band in
# the order of `cols`.
cols2consider = [
    name
    for prefix in cols
    for name in columns_available
    if prefix in name
]

# The B01 columns are used as the reference to recover the observation dates:
# stripping the band prefix leaves e.g. 'month_04_day_01'.
bands_with_dates = [name for name in columns_available if 'B01_' in name]
dates            = [name.replace('B01_','') for name in bands_with_dates]

print(f'The sample showing the commencement dates where observations were seen is {dates[:10]}')
print(f'The sample showing the ending dates where observations were seen is {dates[-10:]}')

The sample showing the commencement dates where observations were seen is ['month_04_day_01', 'month_04_day_04', 'month_04_day_11', 'month_04_day_14', 'month_04_day_21', 'month_04_day_24', 'month_05_day_01', 'month_05_day_04', 'month_05_day_11', 'month_05_day_14']
The sample showing the ending dates where observations were seen is ['month_11_day_05', 'month_11_day_07', 'month_11_day_10', 'month_11_day_12', 'month_11_day_15', 'month_11_day_17', 'month_11_day_20', 'month_11_day_22', 'month_11_day_27', 'month_11_day_30']


In [None]:
# Mean frames: band columns only (plus the target for train).
train_df_mean   = train_df_mean.loc[:, cols2consider + ['label']]
test_df_mean    = test_df_mean.loc[:, cols2consider]

# Median frames: band columns plus the field size. 'field_id' is dropped here.
median_columns  = cols2consider + ['size_of_field']
train_df_median = train_df_median.loc[:, median_columns]
test_df_median  = test_df_median.loc[:, median_columns]


In [None]:
# Index builders from `indices_creation`, applied in this exact order. Each
# takes (frame, dates) and returns the frame that is passed to the next one.
# `get_band_lci` is deliberately left out.
index_builders = (
    get_band_ndvi_red,
    get_band_afri,
    get_band_evi2,
    get_band_ndmi,
    get_band_ndvi,
    get_band_evi,
    get_band_bndvi,
    get_band_nli,
)

for build_index in index_builders:
    train_df_median = build_index(train_df_median, dates)

for build_index in index_builders:
    test_df_median = build_index(test_df_median, dates)

# The raw band columns (cols2consider) are kept alongside the derived indices.

In [None]:
# Give both frames a fresh 0..n-1 index (old index discarded); the slope
# computation below addresses rows by position through `.loc[i, ...]`.
train_df_median, test_df_median = (
    frame.reset_index(drop=True) for frame in (train_df_median, test_df_median)
)

In [None]:
# Preview the first three rows of the training frame after the index features were added.
train_df_median.head(3)

Unnamed: 0,B01_month_04_day_01,B01_month_04_day_04,B01_month_04_day_11,B01_month_04_day_14,B01_month_04_day_21,B01_month_04_day_24,B01_month_05_day_01,B01_month_05_day_04,B01_month_05_day_11,B01_month_05_day_14,...,month_11_day_05_NLI,month_11_day_07_NLI,month_11_day_10_NLI,month_11_day_12_NLI,month_11_day_15_NLI,month_11_day_17_NLI,month_11_day_20_NLI,month_11_day_22_NLI,month_11_day_27_NLI,month_11_day_30_NLI
0,58.26087,,8.826087,,23.73913,,98.565217,,120.173913,,...,,0.979151,,0.988446,,0.9786,,0.992188,0.978221,
1,22.491803,,24.770492,,13.885246,,12.344262,,26.04918,,...,,0.987253,,0.989632,,0.988729,,0.989941,0.990254,
2,10.785714,,10.142857,,8.885714,,6.385714,,47.542857,,...,,0.993347,,0.991496,,0.994595,,0.992143,0.994363,


In [None]:
# Band prefixes for which per-month slope features are computed below.
# Same twelve prefixes as `cols`, except that 'B8A_' is listed before 'B09_' here.
bands = ['B01_','B02_','B03_','B04_','B05_','B06_','B07_','B08_','B8A_','B09_','B11_','B12_']

In [None]:
# Per-band, per-month trend features for the training frame.
#
# For every band and every month 4..11, the slope of an ordinary least-squares
# line is fitted through the observations of that month. The x-axis is the
# ordinal position (1, 2, ..., n) of each NON-missing observation within the
# month, not the calendar day. The result is stored in the column
# 'month_<month><band>_slope' (e.g. 'month_4B01__slope').
#
# The slope is computed in closed form for all rows at once,
#     slope = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2),
# instead of fitting one sklearn LinearRegression per row and month, which
# needed millions of single-row fits and cell-by-cell `.loc` writes.
#
# Edge cases:
#   * exactly one observation in the month -> slope 0.0 (what the per-row
#     LinearRegression fit produced for a single sample);
#   * no observation in the month          -> NaN (the per-row fit raised).
for band in bands:
    # Raw per-date columns of this band, e.g. 'B01_month_04_day_01'.
    subtrain = train_df_median[[name for name in train_df_median.columns if band in name]]
    print(subtrain.shape)

    ########## Calculating slopes for the index #############

    for month in range(4,12):

        # Zero-padded tag with trailing underscore: matches the date columns
        # ('..._month_04_day_01') but never a generated slope column
        # ('month_4B01__slope'), so re-running the cell gives the same result.
        month_tag  = f'month_{month:02d}_'
        cols_month = [name for name in subtrain.columns if month_tag in name]

        observed   = subtrain[cols_month].to_numpy(dtype=float)
        is_seen    = ~np.isnan(observed)
        n_seen     = is_seen.sum(axis=1)

        # x: running count of valid observations (1..n); 0 at missing slots.
        x = np.where(is_seen, np.cumsum(is_seen, axis=1), 0)
        y = np.where(is_seen, observed, 0.0)

        # 0/0 happens for rows with fewer than two observations; those rows
        # are handled explicitly right after, so silence the warnings here.
        with np.errstate(divide='ignore', invalid='ignore'):
            x_dev = np.where(is_seen, x - (x.sum(axis=1) / n_seen)[:, None], 0.0)
            y_dev = np.where(is_seen, y - (y.sum(axis=1) / n_seen)[:, None], 0.0)
            slope = (x_dev * y_dev).sum(axis=1) / (x_dev ** 2).sum(axis=1)

        slope[n_seen == 1] = 0.0

        colname    = 'month_'+str(month)+band+'_slope'

        train_df_median[colname] = slope

    

  0%|          | 3/87073 [00:00<1:04:18, 22.57it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:55<00:00, 91.08it/s] 
  0%|          | 8/87073 [00:00<19:02, 76.18it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:42<00:00, 92.36it/s] 
  0%|          | 12/87073 [00:00<12:21, 117.44it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:56<00:00, 91.08it/s] 
  0%|          | 10/87073 [00:00<15:39, 92.65it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:40<00:00, 92.54it/s] 
  0%|          | 8/87073 [00:00<20:39, 70.24it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:07<00:00, 95.98it/s] 
  0%|          | 11/87073 [00:00<13:39, 106.29it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:07<00:00, 95.92it/s] 
  0%|          | 10/87073 [00:00<15:51, 91.54it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:54<00:00, 91.24it/s] 
  0%|          | 13/87073 [00:00<11:57, 121.35it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:46<00:00, 92.03it/s] 
  0%|          | 12/87073 [00:00<13:01, 111.38it/s]

(87073, 76)


100%|██████████| 87073/87073 [15:25<00:00, 94.05it/s] 
  0%|          | 8/87073 [00:00<18:21, 79.01it/s]

(87073, 76)


100%|██████████| 87073/87073 [13:58<00:00, 103.90it/s]
  0%|          | 7/87073 [00:00<21:41, 66.88it/s]

(87073, 76)


100%|██████████| 87073/87073 [13:16<00:00, 109.36it/s]
  0%|          | 12/87073 [00:00<12:07, 119.69it/s]

(87073, 76)


100%|██████████| 87073/87073 [13:58<00:00, 103.90it/s]


In [None]:
# Re-attach the field ids captured in `fids_train` before 'field_id' was dropped by the
# column selection. The assignment is positional, so it relies on the row order being
# unchanged since then.
# NOTE(review): assumes the get_band_* helpers keep row count and order — confirm.
train_df_median['field_id'] = fids_train 

In [None]:
# Per-band, per-month trend features for the test frame.
#
# For every band and every month 4..11, the slope of an ordinary least-squares
# line is fitted through the observations of that month. The x-axis is the
# ordinal position (1, 2, ..., n) of each NON-missing observation within the
# month, not the calendar day. The result is stored in the column
# 'month_<month><band>_slope' (e.g. 'month_4B01__slope').
#
# The slope is computed in closed form for all rows at once,
#     slope = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2),
# instead of fitting one sklearn LinearRegression per row and month, which
# needed millions of single-row fits and cell-by-cell `.loc` writes.
#
# Edge cases:
#   * exactly one observation in the month -> slope 0.0 (what the per-row
#     LinearRegression fit produced for a single sample);
#   * no observation in the month          -> NaN (the per-row fit raised).
for band in bands:
    # Raw per-date columns of this band, e.g. 'B01_month_04_day_01'.
    subtest = test_df_median[[name for name in test_df_median.columns if band in name]]
    print(subtest.shape)

    ########## Calculating slopes for the index #############

    for month in range(4,12):

        # Zero-padded tag with trailing underscore: matches the date columns
        # ('..._month_04_day_01') but never a generated slope column
        # ('month_4B01__slope'), so re-running the cell gives the same result.
        month_tag  = f'month_{month:02d}_'
        cols_month = [name for name in subtest.columns if month_tag in name]

        observed   = subtest[cols_month].to_numpy(dtype=float)
        is_seen    = ~np.isnan(observed)
        n_seen     = is_seen.sum(axis=1)

        # x: running count of valid observations (1..n); 0 at missing slots.
        x = np.where(is_seen, np.cumsum(is_seen, axis=1), 0)
        y = np.where(is_seen, observed, 0.0)

        # 0/0 happens for rows with fewer than two observations; those rows
        # are handled explicitly right after, so silence the warnings here.
        with np.errstate(divide='ignore', invalid='ignore'):
            x_dev = np.where(is_seen, x - (x.sum(axis=1) / n_seen)[:, None], 0.0)
            y_dev = np.where(is_seen, y - (y.sum(axis=1) / n_seen)[:, None], 0.0)
            slope = (x_dev * y_dev).sum(axis=1) / (x_dev ** 2).sum(axis=1)

        slope[n_seen == 1] = 0.0

        colname    = 'month_'+str(month)+band+'_slope'

        test_df_median[colname] = slope

    

  0%|          | 13/35295 [00:00<04:41, 125.24it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:46<00:00, 101.94it/s]
  0%|          | 10/35295 [00:00<06:13, 94.57it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:47<00:00, 101.57it/s]
  0%|          | 12/35295 [00:00<05:11, 113.42it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:42<00:00, 103.14it/s]
  0%|          | 10/35295 [00:00<06:12, 94.80it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:49<00:00, 100.91it/s]
  0%|          | 8/35295 [00:00<07:21, 79.97it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:43<00:00, 102.77it/s]
  0%|          | 13/35295 [00:00<04:32, 129.38it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:46<00:00, 101.75it/s]
  0%|          | 9/35295 [00:00<06:37, 88.70it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:44<00:00, 102.49it/s]
  0%|          | 11/35295 [00:00<05:43, 102.86it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:40<00:00, 103.53it/s]
  0%|          | 12/35295 [00:00<04:55, 119.59it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:54<00:00, 99.57it/s] 
  0%|          | 9/35295 [00:00<06:49, 86.09it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:45<00:00, 102.03it/s]
  0%|          | 9/35295 [00:00<07:03, 83.32it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:54<00:00, 99.42it/s] 
  0%|          | 12/35295 [00:00<05:00, 117.38it/s]

(35295, 76)


100%|██████████| 35295/35295 [05:50<00:00, 100.64it/s]


In [None]:
# Re-attach the field ids captured in `fids_test` before 'field_id' was dropped by the
# column selection. The assignment is positional, so it relies on the row order being
# unchanged since then.
# NOTE(review): assumes the get_band_* helpers keep row count and order — confirm.
test_df_median['field_id'] = fids_test 

In [None]:
# Persist both feature frames (bands, indices, slopes, field_id) to the
# current working directory, without the positional index.
for frame, csv_name in (
    (train_df_median, 'train_with_slopes.csv'),
    (test_df_median, 'test_with_slopes.csv'),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Display the final training frame, including the slope columns and 'field_id'.
train_df_median

Unnamed: 0,B01_month_04_day_01,B01_month_04_day_04,B01_month_04_day_11,B01_month_04_day_14,B01_month_04_day_21,B01_month_04_day_24,B01_month_05_day_01,B01_month_05_day_04,B01_month_05_day_11,B01_month_05_day_14,...,month_11B11__slope,month_4B12__slope,month_5B12__slope,month_6B12__slope,month_7B12__slope,month_8B12__slope,month_9B12__slope,month_10B12__slope,month_11B12__slope,field_id
0,58.260870,,8.826087,,23.739130,,98.565217,,120.173913,,...,-0.997516,-2.347826,2.769565,-2.369565,3.535404,0.539130,-5.400000,-10.731677,-1.490683,2.0
1,22.491803,,24.770492,,13.885246,,12.344262,,26.049180,,...,6.793911,-7.868852,1.709836,-0.016393,2.288525,8.373770,8.201639,-5.334895,4.158782,29.0
2,10.785714,,10.142857,,8.885714,,6.385714,,47.542857,,...,-0.140000,0.814286,11.794286,21.921429,0.957959,3.990204,-6.908571,-5.638776,-1.568163,78.0
3,9.940594,,9.039604,,8.811881,,10.772277,,161.980198,,...,-10.843847,0.920792,2.770297,44.272277,2.159264,11.214710,4.733663,-0.563225,-10.149929,92.0
4,9.922936,,7.808257,,7.249541,,5.009174,,136.221101,,...,-0.300105,0.096330,3.175138,23.994037,5.240288,15.775072,-9.950550,-9.025767,-0.985688,104.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87068,33.291785,0.000000,23.810198,0.000000,22.005666,,95.643059,,40.201133,,...,0.836364,0.830595,-5.420397,15.143667,-2.427059,1.603201,7.999433,-6.082960,0.229977,122371.0
87069,13.775216,224.293948,11.123919,6.694524,9.873199,6.420749,7.112392,5.334294,186.092219,3.893372,...,3.716296,-0.734953,5.104673,1.983121,-0.087705,1.829649,-1.447887,0.256864,2.671077,122381.0
87070,15.895765,,16.146580,,16.876221,,8.044517,,138.827362,,...,0.770436,11.784473,21.609121,19.607492,5.825159,8.607445,0.428122,-1.382690,0.795409,122440.0
87071,20.438291,0.000000,19.069620,0.000000,16.447785,,118.457278,,109.474684,,...,-0.507192,1.308861,-9.946361,11.215371,-2.069659,1.547988,9.306962,-6.703481,-0.285256,122631.0


In [None]:
# Display the final test frame, including the slope columns and 'field_id'.
test_df_median

Unnamed: 0,B01_month_04_day_01,B01_month_04_day_04,B01_month_04_day_11,B01_month_04_day_14,B01_month_04_day_21,B01_month_04_day_24,B01_month_05_day_01,B01_month_05_day_04,B01_month_05_day_11,B01_month_05_day_14,...,month_11B11__slope,month_4B12__slope,month_5B12__slope,month_6B12__slope,month_7B12__slope,month_8B12__slope,month_9B12__slope,month_10B12__slope,month_11B12__slope,field_id
0,73.701354,100.326142,26.533841,19.632826,24.760152,20.890440,36.981810,19.036802,170.071489,18.530457,...,1.684410,9.286137,9.776499,3.954351,-1.987706,-2.634948,4.798470,1.438071,0.324177,30.0
1,74.124260,,18.011834,,16.532544,,19.656805,,242.443787,,...,2.924260,-2.547337,6.684024,-7.428994,2.200507,4.846323,7.144379,-1.900930,2.484869,56.0
2,34.531390,,21.071749,,21.556054,,15.105381,,76.939462,,...,6.409289,19.115471,13.808969,-2.968610,-0.609994,8.870980,6.200000,-1.275593,7.583152,60.0
3,20.033514,,15.532775,,15.319862,,20.195663,,255.000000,,...,4.670253,3.238788,7.789946,-17.890094,1.354348,-22.003422,-1.434697,-0.511075,0.487615,97.0
4,26.327756,255.000000,12.861220,14.796260,12.991142,12.741142,187.115157,11.292323,75.319882,12.980315,...,0.547754,0.267323,3.609533,-0.389651,-0.611234,0.311653,-2.571834,0.316042,-0.213985,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35290,83.645933,44.404306,31.665072,26.674641,32.744019,27.708134,176.978469,24.732057,166.526316,14.361244,...,-0.225098,3.653178,-2.937286,-0.367532,-0.086819,-0.334560,3.072408,3.069076,-1.545520,122646.0
35291,31.777107,254.787083,30.400893,24.085062,28.940929,27.208979,30.870832,26.772119,183.965083,24.696246,...,0.806382,4.474733,-0.985364,-1.264674,-0.945762,-0.894149,-2.073274,1.486315,-0.033719,122679.0
35292,88.998331,214.305509,24.110601,60.517112,25.696160,22.569699,14.520868,12.383139,55.834725,14.619783,...,0.400687,7.949630,8.204254,4.272728,-4.876373,0.571575,9.047851,1.806651,0.252284,122692.0
35293,9.074074,,9.728395,,9.358025,,7.814815,,108.691358,,...,-3.869489,3.876543,1.302469,25.716049,8.513580,0.407760,-1.861728,0.709347,-6.221869,122696.0
