# SETUP

# NOTE: Please run this on a Tesla P100-PCIE GPU (16280 MiB)

In [1]:
!nvidia-smi

In [None]:
!pip install -r requirements_kaggle.txt -q

# DATA

> To speed up the review process, I have provided the ***Drive IDs*** of the data I created with the notebooks in the Train creation folder.
---
> I have also added the Drive link for each dataset to the README PDF file attached to this solution.

In [None]:
!pip install gdown -q

In [None]:
!gdown --id 1hNRbtcqd9F6stMOK1xAZApDITwAjiSDJ
!gdown --id 1-QCmWsNGREXuWArifN0nD_Sp4hJxf0tu

In [None]:
!gdown --id 1-47L_1NKLeVgW1vWmqXXXCuWZ3gwZWsS
!gdown --id 1-aO4FEtv5CF-ZOcxDSO3jGEzPcIFdxgP

In [None]:
!gdown --id 1-8J_xFgI0WKT5UXFnfH4q1KUw_KgNY37
!gdown --id 1-a55a7N6a4SoqolPF_wI4C6Q70u_d7Hj

In [None]:
!gdown --id 1-BgXQwmXqBuk_P8VtvLfdLqy83dv56Kz
!gdown --id 1-hQGF2TNBbsy3jsGNtndmK55egbdFDjs

## LIBRARIES

In [None]:
# Import the required dependencies
import os
import numpy as np  
import pandas as pd

import random
from tqdm import tqdm 
import copy


import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import QuantileTransformer

import warnings
warnings.filterwarnings('ignore') 

# fix seed
np.random.seed(111)
random.seed(111)

## Train Creation

In [None]:
def create_train(path="S2TrainObs1.csv"):
  """Load the Obs1 training table and collapse it to one row per field.

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TrainObs1.csv" in the working directory,
    which is what the no-argument call has always used.

  Returns
  -------
  pandas.DataFrame
    One row per ``field_id`` (ascending), holding the per-field median of
    every other column, with ``label`` cast to an integer dtype.
  """
  train = pd.read_csv(path)

  # Median over all rows sharing a field_id; field_id comes back as a regular column.
  train = train.groupby('field_id').median().reset_index().sort_values('field_id')
  # The median is returned as float; the label is stored as an integer again.
  train['label'] = train['label'].astype('int')
  return train

In [None]:
def create_test(path="S2TestObs1.csv"):
  """Read the Obs1 test table as-is.

  Unlike the train loader, no per-field aggregation is applied here
  (presumably the file already has one row per field — TODO confirm).

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TestObs1.csv" in the working directory.

  Returns
  -------
  pandas.DataFrame
    The file contents, unmodified.
  """
  return pd.read_csv(path)

In [None]:
def createObs2_train(path="S2TrainObs2.csv"):
  """Load the Obs2 training table and collapse it to one row per field.

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TrainObs2.csv" in the working directory,
    which is what the no-argument call has always used.

  Returns
  -------
  pandas.DataFrame
    One row per ``field_id`` (ascending), holding the per-field median of
    every other column, with ``label`` cast to an integer dtype.
  """
  train = pd.read_csv(path)

  # Median over all rows sharing a field_id; field_id comes back as a regular column.
  train = train.groupby('field_id').median().reset_index().sort_values('field_id')
  # The median is returned as float; the label is stored as an integer again.
  train['label'] = train['label'].astype('int')
  return train

In [None]:
def createObs2_test(path="S2TestObs2.csv"):
  """Read the Obs2 test table as-is (no per-field aggregation is applied here).

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TestObs2.csv" in the working directory.

  Returns
  -------
  pandas.DataFrame
    The file contents, unmodified.
  """
  return pd.read_csv(path)

In [None]:
def createObs3_train(path="S2TrainObs3.csv"):
  """Load the Obs3 training table and collapse it to one row per field.

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TrainObs3.csv" in the working directory,
    which is what the no-argument call has always used.

  Returns
  -------
  pandas.DataFrame
    One row per ``field_id`` (ascending), holding the per-field median of
    every other column, with ``label`` cast to an integer dtype.
  """
  train = pd.read_csv(path)

  # Median over all rows sharing a field_id; field_id comes back as a regular column.
  train = train.groupby('field_id').median().reset_index().sort_values('field_id')
  # The median is returned as float; the label is stored as an integer again.
  train['label'] = train['label'].astype('int')
  return train

In [None]:
def createObs3_test(path="S2TestObs3.csv"):
  """Read the Obs3 test table as-is (no per-field aggregation is applied here).

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TestObs3.csv" in the working directory.

  Returns
  -------
  pandas.DataFrame
    The file contents, unmodified.
  """
  return pd.read_csv(path)

In [None]:
def createObs4_train(path="S2TrainObs4.csv"):
  """Load the Obs4 training table and collapse it to one row per field.

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TrainObs4.csv" in the working directory,
    which is what the no-argument call has always used.

  Returns
  -------
  pandas.DataFrame
    One row per ``field_id`` (ascending), holding the per-field median of
    every other column, with ``label`` cast to an integer dtype.
  """
  train = pd.read_csv(path)

  # Median over all rows sharing a field_id; field_id comes back as a regular column.
  train = train.groupby('field_id').median().reset_index().sort_values('field_id')
  # The median is returned as float; the label is stored as an integer again.
  train['label'] = train['label'].astype('int')
  return train

In [None]:
def createObs4_test(path="S2TestObs4.csv"):
  """Read the Obs4 test table as-is (no per-field aggregation is applied here).

  Parameters
  ----------
  path : str, optional
    CSV file to read. Defaults to "S2TestObs4.csv" in the working directory.

  Returns
  -------
  pandas.DataFrame
    The file contents, unmodified.
  """
  return pd.read_csv(path)

## Feature Engineering

In [None]:
def process(T) :
  """Engineer spectral features on `T` and return the same, mutated frame.

  The frame is modified in place: new columns are appended and every input
  column whose name contains 'B' or 'V' is replaced by its square root.
  Calling this twice on the same frame therefore re-applies the square root
  and recomputes the features from the already-transformed bands.

  Steps, in order:
    1. The B1..B12 column lists (names containing 'std' are skipped) are
       walked in parallel. For each position, vegetation indices and band
       relations are added, named ``<feature>_<suffix>`` where the suffix is
       the second '_'-separated token of the B8 column name.
    2. The input 'B' columns are square-rooted, then ``RGB_STD_*`` and
       ``RGB_MEAN_*`` are computed from the square-rooted B2/B3/B4 columns.
    3. The input 'V' columns are square-rooted.
    4. ``<prefix>_std`` is added across the Month4..Month11 columns, matched
       positionally between months.
    5. Row-wise max/min summaries of the NDVI, SAVI, WDRVI and NDRE7 columns
       are added.

  Parameters
  ----------
  T : pandas.DataFrame
    Table whose column names embed the band (e.g. ``B8_``) and the month
    (e.g. ``Month4``) as substrings.

  Returns
  -------
  pandas.DataFrame
    The same object as `T`.
  """

  def _matching(token, drop_std=False):
    # Column names containing `token` (substring match), optionally without the 'std' ones.
    names = T.filter(like=token).columns.tolist()
    if drop_std:
      names = [c for c in names if 'std' not in c]
    return names

  def _norm_diff(a, b):
    # Normalized difference (a - b) / (a + b) of two columns.
    return (T[a] - T[b]) / (T[a] + T[b])

  # Every list below is captured before any feature is added, so it only
  # refers to columns that were present in the input.
  Bcols = _matching('B')
  Vcols = _matching('V')
  monthly = [_matching(f'Month{m}') for m in range(4, 12)]   # Month4 .. Month11
  band_groups = [_matching(f'{band}_', drop_std=True)
                 for band in ('B1','B2','B3','B4','B5','B6','B7','B8','B8A','B9','B11','B12')]
  B2cols, B3cols, B4cols = band_groups[1], band_groups[2], band_groups[3]

  L = 0.725   # constant used in the SAVI formula below
  for b1, b2, b3, b4, b5, b6, b7, b8, b8a, b9, b11, b12 in zip(*band_groups) :
    tag = b8.split("_")[1]

    # vegetation indexes
    T[f'NDVI_{tag}']    = _norm_diff(b8, b4)
    T[f'SAVI_{tag}']    = ((T[b8] - T[b4]) /  (T[b8] + T[b4]+L) * (1.0 + L))
    T[f'GRNDVI_{tag}']  = ((T[b8] - (T[b3]+T[b4])) /  (T[b8] + (T[b3]+T[b4])))
    T[f'GNDVI_{tag}']   = _norm_diff(b8, b3)
    T[f'NDRE_{tag}']    = _norm_diff(b5, b4)
    # EVI is clipped to [-5, 5]
    T[f'EVI_{tag}']     = (2.5 * (T[b8]  - T[b4] ) / ((T[b8]  + 6.0 * T[b4]  - 7.5 * T[b2]) + 1.0)).values.clip(min=-5,max=5)
    T[f'WDRVI_{tag}']   = (((8 * T[b8]) - T[b4])/ ((8* T[b8]) + T[b4]))
    T[f'ExBlue_{tag}']  = ((2 * T[b2]) - (T[b3]+T[b4]))
    T[f'ExGreen_{tag}'] = ((2 * T[b3]) - (T[b2]+T[b4]) )
    T[f'NDRE7_{tag}']   = _norm_diff(b7, b4)
    # Numerator uses B8A, denominator uses B7: not a plain normalized difference.
    T[f'MTCI_{tag}']    = ((T[b8a] - T[b6])/ (T[b7] + T[b6]))
    T[f'VARI_{tag}']    = ((T[b3] - T[b4])/ (T[b3] + T[b4] - T[b2]))
    T[f'ARVI_{tag}']    = ( ((T[b8] - T[b4])-(T[b4] - T[b2])) /  ((T[b8] + T[b4])-(T[b4] - T[b2])) )

    # Bands Relations
    T[f'b7b5_{tag}']   = _norm_diff(b7, b5)
    T[f'b7b6_{tag}']   = _norm_diff(b7, b6)
    T[f'b8ab5_{tag}']  = _norm_diff(b8a, b5)
    T[f'b6b5_{tag}']   = _norm_diff(b6, b5)

    # ASSAZZIN bands relations
    T[f'b3b1_{tag}']   = _norm_diff(b3, b1)
    T[f'b11b8_{tag}']  = _norm_diff(b11, b8)
    T[f'b12b11_{tag}'] = _norm_diff(b12, b11)
    T[f'b3b4_{tag}']   = _norm_diff(b3, b4)
    T[f'b9b4_{tag}']   = _norm_diff(b9, b4)
    T[f'b5b3_{tag}']   = _norm_diff(b5, b3)
    T[f'b12b3_{tag}']  = _norm_diff(b12, b3)

    T[f'b2b1_{tag}']   = _norm_diff(b2, b1)
    T[f'b4b1_{tag}']   = _norm_diff(b4, b1)
    T[f'b11b3_{tag}']  = _norm_diff(b11, b3)

    T[f'b12b8_{tag}']  = _norm_diff(b12, b8)
    T[f'b3b2_{tag}']   = _norm_diff(b3, b2)
    T[f'b8ab3_{tag}']  = _norm_diff(b8a, b3)
    T[f'b8ab2_{tag}']  = _norm_diff(b8a, b2)

    T[f'b8b1_{tag}']   = _norm_diff(b8, b1)
    T[f'ARVI2_{tag}']  = ( ((T[b3] - T[b4])-(T[b4] - T[b2])) /  ((T[b3] + T[b4])+(T[b4] + T[b2])) )
    T[f'ARVI3_{tag}']  = ( ((T[b5] - T[b3])-(T[b3] - T[b2])) /  ((T[b5] + T[b3])+(T[b3] + T[b2])) )
    T[f'b8b9_{tag}']   = _norm_diff(b8, b9)
    T[f'b3b9_{tag}']   = _norm_diff(b3, b9)
    T[f'b2b9_{tag}']   = _norm_diff(b2, b9)

    T[f'b12b9_{tag}']  = _norm_diff(b12, b9)

  # The indices above use the raw band values; only afterwards are the input
  # 'B' columns square-rooted, so the RGB statistics see the transformed bands.
  for col in Bcols :
    T[col] = np.sqrt(T[col])
  for b2 ,b3 ,b4 in zip(B2cols,B3cols,B4cols) :
    rgb_tag = b3.split("_")[1]
    T[f'RGB_STD_{rgb_tag}'] = T[[b2,b3,b4]].std(axis=1)
    T[f'RGB_MEAN_{rgb_tag}'] = T[[b2,b3,b4]].mean(axis=1)

  for col in Vcols :
    T[col] = np.sqrt(T[col])

  # Spread of each input column across the eight months; the i-th column of
  # every month list is assumed to be the same quantity — TODO confirm.
  for same_feature in zip(*monthly) :
    T[f'{same_feature[0].split("_")[0]}_std'] = T[list(same_feature)].std(axis=1)

  # process Vegetation indexes
  # NOTE(review): filter(like=...) is a substring match, so 'NDVI_' also picks
  # up the GRNDVI_* and GNDVI_* columns. Kept as is to preserve feature values.
  ObsN     = T.filter(like='NDVI_').columns.tolist()
  ObsSA    = T.filter(like='SAVI_').columns.tolist()
  ObsWDR   = T.filter(like='WDRVI_').columns.tolist()
  ObsNDRE7 = T.filter(like='NDRE7_').columns.tolist()

  T['NDVI_max']    = T[ObsN].max(axis=1)
  T['NDVI_min']    = T[ObsN].min(axis=1)

  T['SAVI_max']    = T[ObsSA].max(axis=1)
  # 'SAVI_mmin' (sic): the column name is kept unchanged.
  T['SAVI_mmin']   = T[ObsSA].min(axis=1)

  T['WDRVI_max']  = T[ObsWDR].max(axis=1)
  T['WDRVI_min']  = T[ObsWDR].min(axis=1)

  T['NDRE7_max']   = T[ObsNDRE7].max(axis=1)
  T['NDRE7_min']   = T[ObsNDRE7].min(axis=1)

  return T

In [None]:
Train = create_train()
Test = create_test()

In [None]:
Train2 = createObs2_train()
Test2 = createObs2_test()

In [None]:
Train3 = createObs3_train()
Test3 = createObs3_test()

In [None]:
Train4 = createObs4_train()
Test4 = createObs4_test()

In [None]:
Train.shape , Test.shape

In [None]:
Train2.shape , Test2.shape

In [None]:
Train3.shape , Test3.shape

In [None]:
Train4.shape , Test4.shape

In [None]:
Train = process(Train)
Test = process(Test)

In [None]:
Train2 = process(Train2)
Test2 = process(Test2)

In [None]:
Train3 = process(Train3)
Test3 = process(Test3)

In [None]:
Train4 = process(Train4)
Test4 = process(Test4)

In [None]:
Train.shape , Test.shape

In [None]:
Train2.shape , Test2.shape

In [None]:
Train3.shape , Test3.shape

In [None]:
Train4.shape , Test4.shape

In [None]:
# Put the four observation sets side by side, keeping field_id/label from the first frame only.
# concat(axis=1) aligns rows on the index, not on field_id, so this relies on all four
# Train frames listing the same fields in the same order — TODO confirm.
# Train is rebound here: re-running this cell appends the columns a second time.
Train = pd.concat([Train,Train2.drop(columns=['field_id','label']),Train3.drop(columns=['field_id','label']),Train4.drop(columns=['field_id','label'])],axis=1)
Train.shape

In [None]:
# Test, Test2 and Test3 are joined positionally (index-aligned concat), whereas Test4 is
# joined by key with a left merge on field_id.
# NOTE(review): the two join styles differ from the all-positional join used for Train;
# presumably Test4 has a different row order — confirm.
Test = pd.concat([Test,Test2.drop(columns=['field_id']),Test3.drop(columns=['field_id'])],axis=1)
Test = pd.merge(Test,Test4,on='field_id',how='left')
Test.shape

In [None]:
import gc ; gc.collect()

# MODELING

In [None]:
# Model inputs: identifier and target columns are dropped, and +inf is replaced by 50.
# NOTE(review): only +inf is replaced; -inf and NaN pass through unchanged — confirm intended.
X    = Train.replace(np.inf,50).drop(['field_id','label'], axis=1)
y    = Train.label
TEST = Test.replace(np.inf,50).drop(['field_id'], axis=1)

In [None]:
# Overwrite the test column names with the train names, position by position.
# This raises if the column counts differ, but silently mislabels features if the
# two frames hold the same number of columns in a different order.
TEST.columns = X.columns.tolist()

In [None]:
# Stack train rows on top of test rows and fit one quantile transform on both,
# mapping every feature to a normal output distribution.
# Note: the transform is fitted on test features as well as train features.
data = pd.concat([X,TEST])
qt=QuantileTransformer(output_distribution="normal",random_state=42)
data= pd.DataFrame(qt.fit_transform(data),columns=X.columns)

In [None]:
# Split the transformed rows back: the first X.shape[0] rows are train, the rest are test.
# X is rebound to a NumPy array on the first line; its row count is unchanged, so the
# slice on the second line still starts at the first test row.
X = data[:X.shape[0]].values
TEST = data[X.shape[0]:].values

In [None]:
X.shape , TEST.shape

In [None]:
##############################################################################################################################################################################

### Cross Validation

In [None]:
# `seed` drives the fold shuffling here and is also read by DefineModel for the XGBoost model.
seed = 47
# 10-fold stratified splitter; Run5fold reads it as a module-level global.
sk = StratifiedKFold(n_splits= 10,random_state=seed,shuffle=True)

def DefineModel(name='lgbm') :
  """Return a new, unfitted classifier selected by `name`.

  Parameters
  ----------
  name : str, optional
    'lgbm' builds a LightGBM classifier, 'catboost' a CatBoost classifier,
    and any other value an XGBoost classifier.

  Returns
  -------
  The configured estimator.

  Notes
  -----
  The XGBoost branch reads the module-level `y` (for `base_score`) and
  `seed` at call time.
  NOTE(review): `lgb` and `CatBoostClassifier` are not imported in the
  visible part of this notebook, so the 'lgbm' (default) and 'catboost'
  branches would fail with a NameError unless they are imported elsewhere.
  """
  if name == 'lgbm':
    lgbm_params = dict(
        learning_rate=0.1, n_estimators=3000,
        objective='multiclass', random_state=111,
        num_leaves=80, max_depth=6,
        metric='multi_logloss',
        colsample_bytree=0.5,
        bagging_freq=5, bagging_fraction=0.75,
        lambda_l2=100,
    )
    return lgb.LGBMClassifier(**lgbm_params)

  if name == 'catboost' :
    return CatBoostClassifier(
        loss_function="MultiClass", eval_metric="MultiClass", learning_rate=0.1,
        random_seed=42, l2_leaf_reg=3, bagging_temperature=1,
        depth=6, od_type="Iter", od_wait=50, thread_count=16, iterations=50000,
        use_best_model=True, task_type="GPU", devices='0:1',
    )

  # Any other name falls through to XGBoost.
  xgb_params = {
      'objective': 'multi:softmax',
      'base_score': np.mean(y),
      'eval_metric': "mlogloss",
      'n_estimators': 2000,
      'subsample': 0.75,
      'seed': seed,
      'random_state': seed,
      'num_class': 9,
      'colsample_bytree': 0.5,
      'reg_lambda': 100,
      'reg_alpha': 0.5,
      'tree_method': 'gpu_hist',
      'grow_policy': 'lossguide',
      'gpu_id': 0,
  }
  return xgb.XGBClassifier(**xgb_params)

def Run5fold(name,X,y,TEST) :
  """Train one model per CV fold and collect out-of-fold and test probabilities.

  Despite the name, the number of folds is whatever the module-level
  splitter `sk` is configured with (`sk.n_splits`).

  Parameters
  ----------
  name : str
    Passed to `DefineModel` to pick the classifier.
  X : numpy.ndarray
    Training features, indexed positionally by row.
  y : array-like
    Training labels, one per row of `X`.
  TEST : numpy.ndarray
    Test features with the same columns as `X`.

  Returns
  -------
  (oof_preds, final_predictions)
    `oof_preds` has shape ``(X.shape[0], 9)`` and holds, for every training
    row, the probabilities from the fold model that did not train on it.
    `final_predictions` has shape ``(TEST.shape[0], 9)`` and is the mean of
    the per-fold test probabilities.
  """
  print(f'TRAINING {name}')
  n_classes = 9   # must match the number of probability columns the model returns
  # Positional array, so y[train_idx] cannot be misread as label-based lookup.
  y = np.asarray(y)

  cv_score_ = 0
  # Buffers are sized from the arguments rather than from the global Train/Test frames.
  oof_preds = np.zeros((X.shape[0], n_classes))
  final_predictions = np.zeros((TEST.shape[0], n_classes))

  for fold, (train_idx, test_idx) in enumerate(sk.split(X,y)):
    print()
    print(f'######### FOLD {fold+1} / {sk.n_splits} ')

    X_train,y_train = X[train_idx,:],y[train_idx]
    X_test,y_test   = X[test_idx,:] ,y[test_idx]

    # The held-out fold doubles as the early-stopping validation set.
    model = DefineModel(name=name)
    model.fit(X_train,y_train,
        eval_set = [(X_test,y_test)],
        early_stopping_rounds  = 100,
        verbose = 100
    )
    oof_prediction = model.predict_proba(X_test)
    fold_loss = log_loss(y_test,oof_prediction)
    cv_score_ += fold_loss / sk.n_splits
    print(f'Log Loss Fold {fold+1} : {fold_loss }')
    oof_preds[test_idx] = oof_prediction

    # Each fold contributes an equal share to the averaged test prediction.
    test_prediction = model.predict_proba(TEST)
    final_predictions += test_prediction / sk.n_splits

  print(f'Mean CV Log Loss : {cv_score_}')
  return oof_preds , final_predictions

In [None]:
oof_xgb , predictions_xgb = Run5fold(name='xgb',X=X,y=y,TEST=TEST) #9144 seconde

In [None]:
print('XGBOOST LOG LOSS :',log_loss(y,oof_xgb)) 

In [None]:
# Shape the averaged fold probabilities like the sample submission file:
# field_id first, followed by Crop_ID_1 .. Crop_ID_9 in that order.
pred_df = (
    pd.DataFrame(predictions_xgb)
    .rename(columns={idx: f'Crop_ID_{idx + 1}' for idx in range(9)})
    .assign(field_id=Test['field_id'].astype('int').values)
)
pred_df = pred_df[['field_id'] + [f'Crop_ID_{idx}' for idx in range(1, 10)]]
pred_df.head()

In [None]:
# Write the predicted probabilites to a csv for submission
pred_df.to_csv('S2_Xgboost.csv', index=False)

In [None]:
np.save('S2_oof_xgb.npy',oof_xgb)