In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import norm, lognorm, anderson, kstest
from sklearn import metrics
from rf_functions import data_setup, run_rf_reg, run_rf_cla
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
pd.options.mode.chained_assignment = None

# Regression - single date

In [None]:
# Input tables and output locations for the single-date regression runs
s1_fp = 'input_data/s1_zonal_data.csv'
stats_fp = 'output/stats/scale_shape_both_dates.csv'
moran_fp = 'input_data/moran_max.csv'
output_dir_predict = 'output/rf_predictions/single_date/'
output_dir_metrics = 'output/rf_metrics/single_date/'
out_file_prefix = 'func_test_dB'

# Regression setup: no date filter is passed to data_setup
targets, predictors = data_setup(s1_fp, stats_fp, moran_fp, s1_units='dB', date=None)

# Partition the target columns by the date tag embedded in each column name;
# anything carrying neither tag is treated as shared between both dates.
# NOTE(review): the variables are named 0218/0302 while the tags matched are
# '0219'/'0304' -- confirm this offset is intentional.
cols_0218, cols_0302, cols_misc = [], [], []
for name in targets.columns:
    has_first_tag = '0219' in name
    has_second_tag = '0304' in name
    if has_first_tag:
        cols_0218.append(name)
    if has_second_tag:
        cols_0302.append(name)
    if not (has_first_tag or has_second_tag):
        cols_misc.append(name)

# These two shared columns are not used as regression targets
for shared_name in ('Point_ID', 'moran_p'):
    cols_misc.remove(shared_name)

# Date-specific columns first, shared columns after, keeping only the '10m' ones
cols_0218 = [name for name in cols_0218 + cols_misc if '10m' in name]
cols_0302 = [name for name in cols_0302 + cols_misc if '10m' in name]

# Rows with a missing value in any selected column are dropped per date
targets_0218 = targets[cols_0218].dropna()
targets_0302 = targets[cols_0302].dropna()

# Single-date runs (currently disabled):
# run_rf_reg(targets_0218, predictors.loc[targets_0218.index], n_runs=100,
#        rf_type='single_target', output_dir_predict=output_dir_predict,
#        output_dir_metrics=output_dir_metrics, out_file_prefix=out_file_prefix)
# run_rf_reg(targets_0302, predictors.loc[targets_0302.index], n_runs=100,
#        rf_type='single_target', output_dir_predict=output_dir_predict,
#        output_dir_metrics=output_dir_metrics, out_file_prefix=out_file_prefix)



# Regression - multi date

Note: run the previous cell first. It defines `cols_0218`, `cols_0302`, `targets_0218` and `targets_0302`, which this cell uses.

In [None]:
# Output locations and file prefix for the multi-date regression run.
# These rebind the same names the single-date cell set.
output_dir_predict = 'output/rf_predictions/multi_date/'
output_dir_metrics = 'output/rf_metrics/multi_date/'
out_file_prefix = 'func_test_dB'

def remove_date(col):
    """Strip the date tag from a column name so both dates share one name.

    The first of '0219' / '0304' found in `col` (checked in that order) is cut
    out together with the single character that follows it. If the result
    contains '_1_', its first occurrence is collapsed to '_'. Names carrying
    neither tag are returned unchanged.
    """
    tag = next((t for t in ('0219', '0304') if t in col), None)
    if tag is None:
        return col

    cut = col.index(tag)
    # Skip the 4-character tag plus the one character after it
    trimmed = col[:cut] + col[cut + 5:]

    # partition() splits on the first '_1_' only; an empty separator means no match
    head, sep, tail = trimmed.partition('_1_')
    if sep:
        trimmed = head + '_' + tail
    return trimmed

# Date-free column names, so the two per-date frames line up when stacked
cols_0218_new = list(map(remove_date, cols_0218))
cols_0302_new = list(map(remove_date, cols_0302))

# The per-date frames from the previous cell are relabelled in place
for frame, new_names in ((targets_0218, cols_0218_new), (targets_0302, cols_0302_new)):
    frame.columns = new_names

# Stack both dates row-wise into a single target table
targets_all = pd.concat([targets_0218, targets_0302])

# Multi-date run (currently disabled):
# run_rf_reg(targets_all, predictors, n_runs=100, rf_type='single_target',
#        output_dir_predict=output_dir_predict, output_dir_metrics=output_dir_metrics,
#        out_file_prefix=out_file_prefix)



# Classification - single date

In [2]:
# Balanced 'rough' vs 'smooth' random-forest classification, repeated for each
# class-assignment threshold. For every threshold the classifier is run on each
# date separately (single_date outputs) and then on both dates pooled
# (multi_date outputs).

BALANCE_SEED = 5033                         # shared by the forest and the class balancing
BALANCED_CLASSES = ('rough', 'smooth')      # classes kept and balanced
DROPPED_CLASSES = ('NC', 'mixed', 'water')  # classes excluded for now

# Loop-invariant settings, defined once instead of once per threshold
rf_params = {'n_estimators': 2000,
             'max_features': 'sqrt',
             'max_depth': None,
             'random_state': BALANCE_SEED}
output_dir_cm = 'output/figures/confusion/'


def _rows_for_date(frame, date_label):
    """Return the rows whose first index level equals `date_label`.

    A boolean mask is used so the full (multi)index is kept on the result,
    which is needed later to look up the matching predictor rows.
    """
    return frame.loc[frame.index.get_level_values(0) == date_label]


def _balanced_sample(frame, n_per_class, seed=BALANCE_SEED):
    """Draw `n_per_class` rows of each class in BALANCED_CLASSES, seeded for repeatability."""
    return pd.concat([
        frame[frame['class'] == label].sample(n_per_class, random_state=seed)
        for label in BALANCED_CLASSES
    ])


for thresh in [50, 60, 70, 80, 90, 100]:
    s1_fp = f'input_data/s1_classification_data_rev1_t{thresh}.csv' # classification
    out_file_prefix = f'rev1_t{thresh}'

    targets, predictors = data_setup(s1_fp, s1_units='dB', date=None,
                                     drop_vv_glcm=False, drop_ad=False)
    targets = targets.rename(columns={'class_assignment': 'class'})
    print(targets)

    # For now, drop the classes that are not part of the two-class problem
    targets = targets.loc[~targets['class'].isin(DROPPED_CLASSES)]

    targets_0218 = _rows_for_date(targets, '0218')
    targets_0302 = _rows_for_date(targets, '0302')
    print(len(targets_0218), len(targets_0302))

    # Balance on the smallest class count across BOTH dates. Using only the
    # 0218 counts would make .sample() raise whenever 0302 has fewer rows in a
    # class; when 0302 has at least as many, the value is unchanged.
    n_samples = min(
        int((frame['class'] == label).sum())
        for frame in (targets_0218, targets_0302)
        for label in BALANCED_CLASSES
    )
    print(f'threshold: {thresh} -- n_samples: {n_samples}')
    if n_samples == 0:
        # Nothing to train on for at least one class/date; say so and move on
        print(f'threshold: {thresh} -- skipped: a class has no samples on at least one date')
        continue

    # 'mixed' rows were dropped above, so only rough/smooth are resampled
    targets_0218 = _balanced_sample(targets_0218, n_samples)
    targets_0302 = _balanced_sample(targets_0302, n_samples)
    targets = pd.concat([targets_0218, targets_0302])

    # Single-date runs
    # NOTE(review): both dates share the same output directory and file prefix;
    # confirm run_rf_cla keeps their outputs apart.
    output_dir_predict = 'output/rf_classification/visually_based/class_results/single_date/'
    for date_targets in (targets_0218, targets_0302):
        run_rf_cla(date_targets[['class']], predictors.loc[date_targets.index], n_runs=100,
                   class_split_method='custom', rf_params=rf_params,
                   output_dir_predict=output_dir_predict, out_file_prefix=out_file_prefix,
                   output_dir_cm=output_dir_cm)

    # Multi-date run on the pooled, balanced samples
    output_dir_predict = 'output/rf_classification/visually_based/class_results/multi_date/'
    run_rf_cla(targets[['class']], predictors.loc[targets.index], n_runs=100,
               class_split_method='custom', rf_params=rf_params,
               output_dir_predict=output_dir_predict, out_file_prefix=out_file_prefix,
               output_dir_cm=output_dir_cm)
        

                        AREA  PERCENTAGE   class
date S1_pixel_ID                                
0218 1             96.846579   96.846579  smooth
     6            100.000000  100.000000   mixed
     9             49.661367   49.661367      NC
     11            63.555294   63.555294  smooth
     13           100.000000  100.000000   mixed
...                      ...         ...     ...
0302 2521          28.133500   28.133500      NC
     2524         100.000000  100.000000   mixed
     2529           8.622913    8.622913      NC
     2531          72.366325   72.366325   mixed
     2533          73.664000   73.664000  smooth

[1923 rows x 3 columns]
158 538
threshold: 50 -- n_samples: 78
2022-10-25 17:04:37 -- Starting RF classification (1 targets total).
2022-10-25 17:04:37 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 17:10:12 -- Starting RF classification (1 targets total).
2022-10-25 17:10:12 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 17:15:54 -- Starting RF classification (1 targets total).
2022-10-25 17:15:54 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

                        AREA  PERCENTAGE   class
date S1_pixel_ID                                
0218 1             96.846579   96.846579  smooth
     6            100.000000  100.000000   mixed
     9             49.661367   49.661367      NC
     11            63.555294   63.555294  smooth
     13           100.000000  100.000000   mixed
...                      ...         ...     ...
0302 2521          28.133500   28.133500      NC
     2524         100.000000  100.000000   mixed
     2529           8.622913    8.622913      NC
     2531          72.366325   72.366325   mixed
     2533          73.664000   73.664000  smooth

[1923 rows x 3 columns]
145 481
threshold: 60 -- n_samples: 70
2022-10-25 17:22:29 -- Starting RF classification (1 targets total).
2022-10-25 17:22:29 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 17:28:14 -- Starting RF classification (1 targets total).
2022-10-25 17:28:14 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 17:33:44 -- Starting RF classification (1 targets total).
2022-10-25 17:33:44 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

                        AREA  PERCENTAGE   class
date S1_pixel_ID                                
0218 1             96.846579   96.846579  smooth
     6            100.000000  100.000000   mixed
     9             49.661367   49.661367      NC
     11            63.555294   63.555294      NC
     13           100.000000  100.000000   mixed
...                      ...         ...     ...
0302 2521          28.133500   28.133500      NC
     2524         100.000000  100.000000   mixed
     2529           8.622913    8.622913      NC
     2531          72.366325   72.366325   mixed
     2533          73.664000   73.664000  smooth

[1923 rows x 3 columns]
123 436
threshold: 70 -- n_samples: 57
2022-10-25 17:39:43 -- Starting RF classification (1 targets total).
2022-10-25 17:39:43 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 17:44:59 -- Starting RF classification (1 targets total).
2022-10-25 17:44:59 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 17:50:21 -- Starting RF classification (1 targets total).
2022-10-25 17:50:21 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

                        AREA  PERCENTAGE   class
date S1_pixel_ID                                
0218 1             96.846579   96.846579  smooth
     6            100.000000  100.000000   mixed
     9             49.661367   49.661367      NC
     11            63.555294   63.555294      NC
     13           100.000000  100.000000   mixed
...                      ...         ...     ...
0302 2521          28.133500   28.133500      NC
     2524         100.000000  100.000000   mixed
     2529           8.622913    8.622913      NC
     2531          72.366325   72.366325      NC
     2533          73.664000   73.664000      NC

[1923 rows x 3 columns]
113 404
threshold: 80 -- n_samples: 50
2022-10-25 17:56:05 -- Starting RF classification (1 targets total).
2022-10-25 17:56:05 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 18:01:19 -- Starting RF classification (1 targets total).
2022-10-25 18:01:19 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 18:06:40 -- Starting RF classification (1 targets total).
2022-10-25 18:06:40 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

                        AREA  PERCENTAGE   class
date S1_pixel_ID                                
0218 1             96.846579   96.846579  smooth
     6            100.000000  100.000000   mixed
     9             49.661367   49.661367      NC
     11            63.555294   63.555294      NC
     13           100.000000  100.000000   mixed
...                      ...         ...     ...
0302 2521          28.133500   28.133500      NC
     2524         100.000000  100.000000   mixed
     2529           8.622913    8.622913      NC
     2531          72.366325   72.366325      NC
     2533          73.664000   73.664000      NC

[1923 rows x 3 columns]
98 363
threshold: 90 -- n_samples: 42
2022-10-25 18:12:15 -- Starting RF classification (1 targets total).
2022-10-25 18:12:15 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 18:17:27 -- Starting RF classification (1 targets total).
2022-10-25 18:17:27 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 18:22:41 -- Starting RF classification (1 targets total).
2022-10-25 18:22:41 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

                        AREA  PERCENTAGE  class
date S1_pixel_ID                               
0218 1             96.846579   96.846579     NC
     6            100.000000  100.000000  mixed
     9             49.661367   49.661367     NC
     11            63.555294   63.555294     NC
     13           100.000000  100.000000  mixed
...                      ...         ...    ...
0302 2521          28.133500   28.133500     NC
     2524         100.000000  100.000000  mixed
     2529           8.622913    8.622913     NC
     2531          72.366325   72.366325     NC
     2533          73.664000   73.664000     NC

[1923 rows x 3 columns]
60 284
threshold: 100 -- n_samples: 24
2022-10-25 18:28:12 -- Starting RF classification (1 targets total).
2022-10-25 18:28:12 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 18:33:21 -- Starting RF classification (1 targets total).
2022-10-25 18:33:21 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-10-25 18:38:35 -- Starting RF classification (1 targets total).
2022-10-25 18:38:35 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

In [41]:
# Scratch check: size of the smaller of the two classes in the current 0218 subset
min(sum(targets_0218['class'] == label) for label in ('rough', 'smooth'))

55

# Classification - multi date

Note: this cell uses `targets_all` (built in the "Regression - multi date" cell) and `predictors`, so the cells that define them must be run first.

In [None]:
# Output locations and file prefix for the multi-date classification run
out_file_prefix = 'func_test_classification'
output_dir_cm = 'output/figures/confusion/'
output_dir_predict = 'output/rf_classification/class_results/multi_date/'

# NOTE(review): `targets_all` is the table assembled in the multi-date
# regression cell, and `predictors` is whatever the most recently executed
# data_setup call returned -- confirm these are the intended inputs here.
run_rf_cla(
    targets_all,
    predictors,
    n_runs=100,
    n_classes=5,
    output_dir_predict=output_dir_predict,
    out_file_prefix=out_file_prefix,
    output_dir_cm=output_dir_cm,
)

In [12]:
# Scratch cell: current local time as a YYYYMMDD_HHMMSS string
format(datetime.now(), '%Y%m%d_%H%M%S')

'20220723_160242'