In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import norm, lognorm, anderson, kstest
from sklearn import metrics
from rf_functions import data_setup, run_rf_reg, run_rf_cla
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence RuntimeWarning messages for the rest of the notebook session.
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

# Regression - single date

In [None]:
# Input files and output locations for the single-date regression runs.
s1_fp = 'input_data/s1_zonal_data.csv'
stats_fp = 'output/stats/scale_shape_both_dates.csv'
moran_fp = 'input_data/moran_max.csv'
output_dir_predict = 'output/rf_predictions/single_date/'
output_dir_metrics = 'output/rf_metrics/single_date/'
out_file_prefix = 'func_test_dB'

targets, predictors = data_setup(s1_fp, stats_fp, moran_fp, s1_units='dB', date=None) # regression

# Sort the target columns into per-date lists plus a list of undated columns.
# NOTE(review): the variables are named 0218/0302 but the tags matched in the
# column names are '0219'/'0304' -- confirm this naming is intentional.
cols_0218, cols_0302, cols_misc = [], [], []
for col_name in targets.columns:
    in_first_date = '0219' in col_name
    in_second_date = '0304' in col_name
    if in_first_date:
        cols_0218.append(col_name)
    if in_second_date:
        cols_0302.append(col_name)
    if not (in_first_date or in_second_date):
        cols_misc.append(col_name)

# These two undated columns are excluded from the targets; list.remove raises
# ValueError if either one is missing.
for excluded in ('Point_ID', 'moran_p'):
    cols_misc.remove(excluded)

# Each date gets its own columns plus the undated ones, restricted to names
# containing '10m'.
cols_0218 = [c for c in cols_0218 + cols_misc if '10m' in c]
cols_0302 = [c for c in cols_0302 + cols_misc if '10m' in c]

# Keep only rows that are complete for the selected columns.
targets_0218 = targets.loc[:, cols_0218].dropna()
targets_0302 = targets.loc[:, cols_0302].dropna()

# Runs are currently disabled; uncomment to execute them.
# run_rf_reg(targets_0218, predictors.loc[targets_0218.index], n_runs=100, 
#        rf_type='single_target', output_dir_predict=output_dir_predict,
#        output_dir_metrics=output_dir_metrics, out_file_prefix=out_file_prefix)
# run_rf_reg(targets_0302, predictors.loc[targets_0302.index], n_runs=100, 
#        rf_type='single_target', output_dir_predict=output_dir_predict,
#        output_dir_metrics=output_dir_metrics, out_file_prefix=out_file_prefix)



# Regression - multi date

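Note: run the previous cell ("Regression - single date") first. This cell reuses the `cols_0218`, `cols_0302`, `targets_0218` and `targets_0302` variables that it creates.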

In [None]:
# Output locations and file prefix for the multi-date regression run. Within
# this cell they are only referenced by the commented-out run_rf_reg call.
output_dir_predict = 'output/rf_predictions/multi_date/'
output_dir_metrics = 'output/rf_metrics/multi_date/'
out_file_prefix = 'func_test_dB'

def remove_date(col, dates=('0219', '0304')):
    """Strip a date tag from a column name so both dates share the same names.

    The first entry of ``dates`` that occurs in ``col`` is cut out together
    with the single character that follows it (the separator), e.g.
    ``'vv_0219_10m' -> 'vv_10m'``. If the shortened name contains ``'_1_'``,
    its first occurrence is then collapsed to ``'_'``. Names that contain none
    of the tags are returned untouched (any ``'_1_'`` in them is kept).

    Parameters
    ----------
    col : str
        Column name to clean.
    dates : iterable of str, optional
        Date tags to look for, tried in the given order; the first one found
        wins. Defaults to the two tags used in this notebook.

    Returns
    -------
    str
        The column name without the date tag.
    """
    for tag in dates:
        start = col.find(tag)
        if start != -1:
            break
    else:
        # No date tag present: leave the name exactly as it is.
        return col

    # Remove the tag plus the one character right after it (the separator).
    stripped = col[:start] + col[start + len(tag) + 1:]
    # Collapse the first '_1_' to '_' (no-op when it is absent).
    return stripped.replace('_1_', '_', 1)

# Date-free versions of the per-date column names.
cols_0218_new = list(map(remove_date, cols_0218))
cols_0302_new = list(map(remove_date, cols_0302))

# Relabel both per-date frames in place so their columns can line up.
# NOTE(review): this mutates the frames built in the single-date cell, so
# re-running that cell's runs afterwards would see the date-free names.
targets_0218.columns, targets_0302.columns = cols_0218_new, cols_0302_new

# Stack the two dates row-wise. Both frames are row subsets of the same
# `targets` frame, so index labels can repeat in the result.
both_dates = [targets_0218, targets_0302]
targets_all = pd.concat(both_dates)

# Run is currently disabled; uncomment to execute it.
# NOTE(review): `predictors` is passed without being aligned to
# targets_all.index here -- confirm run_rf_reg copes with that.
# run_rf_reg(targets_all, predictors, n_runs=100, rf_type='single_target', 
#        output_dir_predict=output_dir_predict, output_dir_metrics=output_dir_metrics, 
#        out_file_prefix=out_file_prefix)



# Classification - single date

In [10]:
s1_fp = 'input_data/s1_classification_data.csv' # classification
output_dir_predict = 'output/rf_classification/visually_based/class_results/single_date/'
output_dir_cm = 'output/figures/confusion/'
# NOTE(review): the prefix says "mixed", but the 'mixed' class is filtered out
# below -- confirm the intended prefix before relying on the file names.
out_file_prefix = 'smooth_rough_mixed'

n_samples = 80       # rows drawn per class and per date
sample_seed = 5033   # fixes the resampling so reruns select the same rows


def _balanced_class_sample(frame, n, required=('rough', 'smooth'),
                           optional=('mixed',), seed=None):
    """Draw ``n`` rows per class from ``frame`` (which has a 'class' column).

    Classes in ``required`` must each have at least ``n`` rows; otherwise
    ``DataFrame.sample`` raises ValueError. Classes in ``optional`` are
    included only when they have at least ``n`` rows and are skipped
    otherwise. Returns the sampled rows concatenated in class order.
    """
    parts = [frame[frame['class'] == label].sample(n, random_state=seed)
             for label in required]
    for label in optional:
        candidates = frame[frame['class'] == label]
        # Explicit size check instead of catching every exception from sample().
        if len(candidates) >= n:
            parts.append(candidates.sample(n, random_state=seed))
    return pd.concat(parts)


targets, predictors = data_setup(s1_fp, s1_units='dB', date=None, 
                                 drop_vv_glcm=False, drop_ad=False)
targets = targets.rename(columns={'class_assignment': 'class'})

# For now, drop the 'NC', 'mixed' and 'water' classes.
targets = targets.loc[~targets['class'].isin(['NC', 'mixed', 'water'])]

# Split by the first index level with a boolean mask, which keeps the full
# MultiIndex on the result (needed to index `predictors` below).
index_dates = targets.index.get_level_values(0)
targets_0218 = targets.loc[index_dates == '0218']
targets_0302 = targets.loc[index_dates == '0302']

# Resample each date so every class contributes the same number of rows.
targets_0218 = _balanced_class_sample(targets_0218, n_samples, seed=sample_seed)
targets_0302 = _balanced_class_sample(targets_0302, n_samples, seed=sample_seed)

targets = pd.concat([targets_0218, targets_0302])

rf_params = {'n_estimators':2000,
             'max_features':'sqrt',
             'max_depth':None, 
             'random_state':5033}

# Single-date runs.
# NOTE(review): both runs share output_dir_predict and out_file_prefix --
# confirm run_rf_cla keeps their outputs apart.
for single_date_targets in (targets_0218, targets_0302):
    run_rf_cla(single_date_targets, predictors.loc[single_date_targets.index], n_runs=100,
        class_split_method='custom', rf_params=rf_params,
        output_dir_predict=output_dir_predict, out_file_prefix=out_file_prefix,
        output_dir_cm=output_dir_cm)

# Multi-date run on the combined balanced sample.
output_dir_predict = 'output/rf_classification/visually_based/class_results/multi_date/'
output_dir_cm = 'output/figures/confusion/'
out_file_prefix = 'smooth_rough_only'

run_rf_cla(targets, predictors.loc[targets.index], n_runs=100, class_split_method='custom',
    output_dir_predict=output_dir_predict, out_file_prefix=out_file_prefix, rf_params=rf_params,
    output_dir_cm=output_dir_cm)
    

2022-08-01 07:15:52 -- Starting RF classification (1 targets total).
2022-08-01 07:15:52 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-08-01 07:19:38 -- Starting RF classification (1 targets total).
2022-08-01 07:19:38 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

2022-08-01 07:23:30 -- Starting RF classification (1 targets total).
2022-08-01 07:23:30 -- Starting target class (1/1 targets)


  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
# Per-class counts can be checked with the line below (the column is renamed
# from 'class_assignment' to 'class' in the classification cell above):
# targets_0302.groupby('class')['class'].count()
# NOTE(review): the stored output of this cell lists 'mixed' rows, which the
# current filter in the classification cell removes -- re-run to refresh it.
targets

Unnamed: 0_level_0,Unnamed: 1_level_0,class
date,S1_pixel_ID,Unnamed: 2_level_1
0218,88,rough
0218,265,rough
0218,724,rough
0218,100,rough
0218,911,rough
...,...,...
0302,1882,mixed
0302,1247,mixed
0302,510,mixed
0302,1843,mixed


# Classification - multi date

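Note: this cell depends on variables created in earlier cells, so run those first. `targets_all` is defined in the "Regression - multi date" cell, and `predictors` holds whatever the most recent `data_setup` call returned.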

In [None]:
# Output locations and file prefix for this multi-date classification run.
output_dir_predict = 'output/rf_classification/class_results/multi_date/'
output_dir_cm = 'output/figures/confusion/'
out_file_prefix = 'func_test_classification'

# NOTE(review): `targets_all` is built in the "Regression - multi date" cell
# from regression targets, while `predictors` may since have been replaced by
# the classification data_setup call -- confirm these are the intended inputs.
multi_date_cla_options = dict(
    n_runs=100,
    n_classes=5,
    output_dir_predict=output_dir_predict,
    out_file_prefix=out_file_prefix,
    output_dir_cm=output_dir_cm,
)
run_rf_cla(targets_all, predictors, **multi_date_cla_options)

In [13]:
# Write the current `targets` frame to CSV.
# NOTE(review): `targets` is reassigned in several cells, so the file contents
# depend on which cell ran last -- presumably this is meant to run right after
# the classification cell that builds the balanced sample; confirm.
targets.to_csv('output/stats/classification_samples.csv')