# Workflow

## Setup Project Environment

Install required python dependencies

In [7]:
!python -m pip install --upgrade pip -q
!python -m pip install -r ../requirements.txt -q

In [30]:
from pathlib import Path
import pandas as pd
import altair as alt

## Input Datasets

In [11]:
input_file = Path("../data/formated_dataset.csv")
df = pd.read_csv(input_file)

In [12]:
# To display the top 5 rows
df.head(5)

Unnamed: 0,target,relief_elev_focalrange1000m_3s,LONGITUDE_GRID1_clip,LOC_distance_to_coast,ceno_euc_aust1,clim_EPA_albers,be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata,relief_roughness,be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata,be-30y-85m-avg-ND-SWIR1-SWIR2.filled.lzw.nodata,...,tpi_300,clim_WDA_albers,be-l8-all-85m-avg-BLUE.filled.lzw.nodata,s2-dpca-85m_1,s2-dpca-85m_2,s2-dpca-85m_3,water-85m_1,water-85m_2,water-85m_3,saga_wetSM_85_reprojected
0,0.012695,21.217691,133.67166,2.006358,0.065748,1833.6868,0.266981,3.810077,0.67142,0.614047,...,-0.264486,-1029.791,0.0606,192.0,88.0,63.0,0.026451,-0.039139,-0.062414,10.004804
1,0.004673,16.920124,133.65765,2.025787,0.058309,1826.2247,0.249358,4.741497,0.679385,0.614248,...,0.076412,-1023.5242,0.0549,210.0,74.0,47.0,0.017286,-0.018131,-0.033444,9.931655
2,0.005242,16.05458,133.65843,2.024727,0.059387,1828.1902,0.223515,2.878581,0.675992,0.617949,...,0.634743,-1024.9868,0.052,210.0,74.0,46.0,0.012877,-0.002545,-0.01403,10.118466
3,0.004727,14.514371,133.66623,2.014133,0.070222,1831.6632,0.25964,1.472411,0.673164,0.615496,...,0.401459,-1027.2472,0.0543,232.0,105.0,67.0,0.021657,-0.044284,-0.063202,10.619378
4,0.01698,8.90559,136.11418,0.288717,0.026916,1880.6023,0.27031,0.298546,0.646681,0.607906,...,0.033232,-1059.9712,0.0548,70.0,136.0,79.0,0.078101,-0.029294,-0.108972,11.260143


In [13]:
# Checking the data type
df.dtypes

target                                             float64
relief_elev_focalrange1000m_3s                     float64
LONGITUDE_GRID1_clip                               float64
LOC_distance_to_coast                              float64
ceno_euc_aust1                                     float64
clim_EPA_albers                                    float64
be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata          float64
relief_roughness                                   float64
be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata       float64
be-30y-85m-avg-ND-SWIR1-SWIR2.filled.lzw.nodata    float64
dem_fill                                           float64
Dose_2016                                          float64
be-30y-85m-avg-GREEN.filled.lzw.nodata             float64
be-30y-85m-avg_BLUE+SWIR2                          float64
be-30y-85m-avg-SWIR1.filled.lzw.nodata             float64
LATITUDE_GRID1_clip                                float64
Potassium_2016                                     float

In [18]:
df = df.rename(columns={'relief_elev_focalrange1000m_3s':'relief_elev_1000m',
'LONGITUDE_GRID1_clip':'longitude',
'LOC_distance_to_coast':'distance_to_coast',
'ceno_euc_aust1':'ceno',
'clim_EPA_albers':'clim',
'be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata':'clay_pc2',
'relief_roughness':'relief_roughness',
'be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata':'red_blue',
'be-30y-85m-avg-ND-SWIR1-SWIR2.filled.lzw.nodata':'swir1_swir2',
'dem_fill':'dem_fill',
'Dose_2016':'dose',
'be-30y-85m-avg-GREEN.filled.lzw.nodata':'green',
'be-30y-85m-avg_BLUE+SWIR2':'blue_swir2',
'be-30y-85m-avg-SWIR1.filled.lzw.nodata':'swir1',
'LATITUDE_GRID1_clip':'latitude',
'Potassium_2016':'potassium',
'Clim_Prescott_LindaGregory':'prescott',
'relief_mrvbf_3s_mosaic':'mrvbf_mosaic',
'Rad2016U_Th':'rad2016u_th',
'mrvbf_9':'mrvbf',
'be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata':'swir1_nir',
'slope_fill2':'slope_fill',
'clim_RSM_albers':'clim',
'3dem_mag2':'dem_mag',
'relief_elev_focalrange300m_3s':'relief_elev_300m',
'3dem_mag1_fin':'3dem_mag1_fin',
'be-30y-85m-avg-ND-NIR-GREEN.filled.lzw.nodata':'nir_green',
'clim_PTA_albers':'albers',
'be-30y-85m-avg-RED.filled.lzw.nodata':'red',
'be-30y-85m-avg-SWIR2.filled.lzw.nodata':'swir2',
'3dem_mag0.fin':'dem_mag',
'si_geol1':'si_geol',
'national_Wii_RF_multirandomforest_prediction':'wii',
'Thorium_2016':'thorium',
'Grav_lane_clip':'grav_lane_clip',
'be-l8-all-85m-avg-NIR.filled.lzw.nodata':'nir',
'Rad2016K_Th':'rad2016k_th',
'tpi_300':'tpi',
'clim_WDA_albers':'clim_wda_albers',
'be-l8-all-85m-avg-BLUE.filled.lzw.nodata':'blue',
's2-dpca-85m_1':'s2-dpca-85m_1',
's2-dpca-85m_2':'s2-dpca-85m_2',
's2-dpca-85m_3':'s2-dpca-85m_3',
'water-85m_1':'water-85m_1',
'water-85m_2':'water-85m_2',
'water-85m_3':'water-85m_3',
'saga_wetSM_85_reprojected':'water-85m_1'})
df.head(5)

Unnamed: 0,target,relief_elev_1000m,longitude,distance_to_coast,ceno,clim,clay_pc2,relief_roughness,red_blue,swir1_swir2,...,tpi,clim_wda_albers,blue,s2-dpca-85m_1,s2-dpca-85m_2,s2-dpca-85m_3,water-85m_1,water-85m_2,water-85m_3,water-85m_1.1
0,0.012695,21.217691,133.67166,2.006358,0.065748,1833.6868,0.266981,3.810077,0.67142,0.614047,...,-0.264486,-1029.791,0.0606,192.0,88.0,63.0,0.026451,-0.039139,-0.062414,10.004804
1,0.004673,16.920124,133.65765,2.025787,0.058309,1826.2247,0.249358,4.741497,0.679385,0.614248,...,0.076412,-1023.5242,0.0549,210.0,74.0,47.0,0.017286,-0.018131,-0.033444,9.931655
2,0.005242,16.05458,133.65843,2.024727,0.059387,1828.1902,0.223515,2.878581,0.675992,0.617949,...,0.634743,-1024.9868,0.052,210.0,74.0,46.0,0.012877,-0.002545,-0.01403,10.118466
3,0.004727,14.514371,133.66623,2.014133,0.070222,1831.6632,0.25964,1.472411,0.673164,0.615496,...,0.401459,-1027.2472,0.0543,232.0,105.0,67.0,0.021657,-0.044284,-0.063202,10.619378
4,0.01698,8.90559,136.11418,0.288717,0.026916,1880.6023,0.27031,0.298546,0.646681,0.607906,...,0.033232,-1059.9712,0.0548,70.0,136.0,79.0,0.078101,-0.029294,-0.108972,11.260143


In [25]:
# Total number of rows and columns
rows = df.shape[0]
print("number of total rows: ", rows)
duplicate_rows_df = df.drop_duplicates()
print("number of duplicate rows: ", rows - duplicate_rows_df.shape[0])

number of total rows:  489317
number of duplicate rows:  0


In [24]:
# Used to count the number of rows before removing the data
df.count() 

target               489317
relief_elev_1000m    489317
longitude            489317
distance_to_coast    489317
ceno                 489317
clim                 489317
clay_pc2             489317
relief_roughness     489317
red_blue             489317
swir1_swir2          489317
dem_fill             489317
dose                 489317
green                489317
blue_swir2           489317
swir1                489317
latitude             489317
potassium            489317
prescott             489317
mrvbf_mosaic         489317
rad2016u_th          489317
mrvbf                489317
swir1_nir            489317
slope_fill           489317
clim                 489317
dem_mag              489317
relief_elev_300m     489317
3dem_mag1_fin        489317
nir_green            489317
albers               489317
red                  489317
swir2                489317
dem_mag              489317
si_geol              489317
wii                  489317
thorium              489317
grav_lane_clip      

In [29]:
# Finding the null values.
print(df.isnull().sum())

target               0
relief_elev_1000m    0
longitude            0
distance_to_coast    0
ceno                 0
clim                 0
clay_pc2             0
relief_roughness     0
red_blue             0
swir1_swir2          0
dem_fill             0
dose                 0
green                0
blue_swir2           0
swir1                0
latitude             0
potassium            0
prescott             0
mrvbf_mosaic         0
rad2016u_th          0
mrvbf                0
swir1_nir            0
slope_fill           0
clim                 0
dem_mag              0
relief_elev_300m     0
3dem_mag1_fin        0
nir_green            0
albers               0
red                  0
swir2                0
dem_mag              0
si_geol              0
wii                  0
thorium              0
grav_lane_clip       0
nir                  0
rad2016k_th          0
tpi                  0
clim_wda_albers      0
blue                 0
s2-dpca-85m_1        0
s2-dpca-85m_2        0
s2-dpca-85m

## Detecting Outliers

In [None]:
sns.boxplot()

In [32]:

alt.Chart(df).mark_boxplot().encode(
    x=df['blue']
)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

## Exploratory Data Analysis (EDA) 
Distribution
- Histograms
- Boxplot

Correlation



# Preprocessing
- Outlier Removal
- Feature Engineering
- Sample Balancing (SMOTE)
- Replace NAN (Impute/Interpolation)
- PCA
- Scaling



        Remove features that have too many NAN or fill NAN with another value
        Remove features that will introduce data leakage
        Encode categorical features into integers
        Extract new useful features (between and within current features)



## Test-Train Split
- K-fold


    
      randomisation test train split

## Choose Model
- Random Forest
- Xgboost
- LightGBM
- CatBoost

## Model Fit
- Bayesian Optimization

## Model Evaluation
- Accuracy
- R2 Score
- Confustion Matrix (range buckets)

## Final Model
- OOS Testing

## Prediction
- Get New Data
- Get the Selected Model
- Make Predictions

In [3]:
# Do Bysien Optimaztion to find best model parameters

In [None]:
# Do Grid Search to find best model parameters