In [47]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import image
import seaborn as sns
import glob
from scipy.stats.stats import pearsonr
import pandas as pd
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import normalized_mutual_info_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score, balanced_accuracy_score, mean_squared_error, r2_score

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Swap the hook on the warnings module itself, so code that looks up
# ``warnings.warn`` at call time after this point emits nothing.
warnings.warn = warn

# Seed NumPy's global RNG; the pixel sub-sampling further down draws from it.
np.random.seed(42)


In [48]:
# Run directories to post-process. Sorted because glob returns matches in
# arbitrary, filesystem-dependent order, and the processing loop below draws
# from the seeded global NumPy RNG: without a fixed visiting order the random
# pixel sub-samples (and therefore the reported metrics) differ between machines.
fpaths = sorted(glob.glob("dqn+ddqn_all/*/"))

# When False, only the reward metrics are computed and the baseline models are skipped.
do_supervised_modeling = True

# Fraction of stacked pixels drawn when sub-sampling a dataset before the
# reward metrics are computed.
# NOTE(review): only the plastic-flakes and salient-objects fractions are read
# by the loop visible in this notebook; the others appear to be placeholders.
plastic_flakes_sample = 0.1
indian_pines_sample = 1
salient_objects_sample = 0.01
foods_sample = 1
soil_moisture_sample = 1

In [49]:
# Collect one "<dataset>_<agent>_<reward>" label per run directory.
indices = []
for fpath in fpaths:

    ## Post process RL agent for reward metrics and model results if desired

    files = glob.glob(fpath+'/*')

    # The second listed file is read as the run's JSON config.
    with open(files[1], 'r') as f:
        config = json.load(f)

    # Runs whose critic uses double Q-learning are labelled DDQN; every other
    # run is labelled with the agent class stored in its config.
    indices.append(
        f"{config['data']['dataset_type']}"
        f"_{'DDQN' if config['critic']['double_q'] else config['agent']['agent_class']}"
        f"_{config['agent']['reward_type']}"
    )


In [50]:
# Sort the collected run labels alphabetically (in place), then show them as
# the cell output.
indices.sort()
indices

['Foods_DDQN_correlation',
 'Foods_DDQN_mutual_info',
 'Foods_DQN_correlation',
 'Foods_DQN_mutual_info',
 'IndianPines_DDQN_correlation',
 'IndianPines_DDQN_mutual_info',
 'IndianPines_DQN_correlation',
 'IndianPines_DQN_mutual_info',
 'PlasticFlakes_DDQN_correlation',
 'PlasticFlakes_DDQN_mutual_info',
 'PlasticFlakes_DQN_correlation',
 'PlasticFlakes_DQN_mutual_info',
 'SalientObjects_DDQN_correlation',
 'SalientObjects_DDQN_mutual_info',
 'SalientObjects_DQN_correlation',
 'SalientObjects_DQN_mutual_info',
 'SoilMoisture_DDQN_correlation',
 'SoilMoisture_DDQN_mutual_info',
 'SoilMoisture_DQN_correlation',
 'SoilMoisture_DQN_mutual_info']

In [51]:
# Results table: one row per "<dataset>_<method>" label, one column per metric.
# The spreadsheet's first column ('Unnamed: 0') holds the row labels.
master_df = pd.read_excel(
    io='/Users/danielfurman/Desktop/Berkeley/Semester-3/Deep RL/final_project/modeling_test_results_no_rl.xlsx',
    index_col='Unnamed: 0',
)
master_df

Unnamed: 0,acc / r^2,bac / mae,f1 / mse,corr,mi
PlasticFlakes_DQN_correlation,,,,,
PlasticFlakes_DDQN_correlation,,,,,
PlasticFlakes_AC_correlation,,,,,
PlasticFlakes_SAC_correlation,,,,,
PlasticFlakes_DQN_mutual_info,,,,,
PlasticFlakes_DDQN_mutual_info,,,,,
PlasticFlakes_AC_mutual_info,,,,,
PlasticFlakes_SAC_mutual_info,,,,,
PlasticFlakes_variance_rank,,,,,
PlasticFlakes_random_combination,0.958792,0.931641,0.931507,0.922004,0.596827


In [52]:
for fpath in fpaths:

    ## Post process RL agent for reward metrics and model results if desired

    # NOTE(review): glob returns entries in arbitrary order, yet the files are
    # picked by position below (0: data metadata JSON, 1: run config JSON,
    # 2: selected-band array) -- confirm every run folder lists this way.
    files = glob.glob(fpath+'/*')

    with open(files[0], 'r') as f:
        data_metadata = json.load(f)
    with open(files[1], 'r') as f:
        config = json.load(f)

    selected_bands = np.load(files[2])

    # Row label for this run in master_df: <dataset>_<agent>_<reward>.
    # Double-Q critics are labelled DDQN, everything else by its agent class.
    if config['critic']['double_q']:
        index = f"{config['data']['dataset_type']}_DDQN_{config['agent']['reward_type']}"
    else:
        index = f"{config['data']['dataset_type']}_{config['agent']['agent_class']}_{config['agent']['reward_type']}"
    print(index)

    dataset_type = config['data']['dataset_type']
    band_selection_num = config['data']['band_selection_num']

    # which datasets to read and write
    #
    # Every flag is recomputed on every iteration. Previously the flags were
    # only assigned inside per-dataset `if` blocks, so an unrecognised
    # dataset_type either raised NameError (first run) or silently reused the
    # previous run's flags and wrote that dataset's metrics under this label.
    do_indian_pines = dataset_type == 'IndianPines'
    do_salient_objects = dataset_type == 'SalientObjects'
    do_plastic_flakes = dataset_type == 'PlasticFlakes'
    do_soil_moisture = dataset_type == 'SoilMoisture'
    do_foods = dataset_type == 'Foods'

    if not (do_indian_pines or do_salient_objects or do_plastic_flakes
            or do_soil_moisture or do_foods):
        print(f'Unrecognised dataset_type {dataset_type!r}; no dataset section will run for {index}')

    # change to be controlled by config
    num_b_kept = band_selection_num
    ## Define functions
    # reward functions
    
    def calculate_correlations(data, selected_bands):
        """Score how linearly correlated the selected bands are with each other.

        Adds up |Pearson r| over every ordered pair of distinct band indices in
        ``selected_bands`` (columns of ``data``) and divides the total by
        ``len(selected_bands) ** 2``. The divisor counts the same-band pairs
        too, even though they contribute nothing to the sum.
        """
        total = 0
        for band_a in selected_bands:
            column_a = data[:, band_a]
            for band_b in selected_bands:
                if band_a == band_b:
                    continue
                r_value, _ = pearsonr(column_a, data[:, band_b])
                total += np.abs(r_value)

        return total/(len(selected_bands)**2)
    
    
    def calculate_mutual_infos(data, selected_bands):
        """Score how much information the selected bands share with each other.

        Adds up the normalized mutual information score over every ordered pair
        of distinct band indices in ``selected_bands`` (columns of ``data``)
        and divides the total by ``len(selected_bands) ** 2``. The divisor
        counts the same-band pairs too, even though they are skipped.
        """
        nmi_total = 0
        for band_a in selected_bands:
            for band_b in selected_bands:
                if band_a == band_b:
                    continue
                nmi_total += normalized_mutual_info_score(data[:, band_a],
                                                          data[:, band_b])

        return nmi_total/(len(selected_bands)**2)
    
    ## dataset loading
    
    def load_datasets(Dataset):
        """Load the hyperspectral imagery and ground-truth labels of one dataset.

        Parameters
        ----------
        Dataset : str
            Dataset code: 'SM', 'IN', 'SO', 'PF' or 'Foods'.

        Returns
        -------
        tuple
            For 'SM' and 'IN': ``(hyper, gt)``, the first ``.npy`` array (in
            sorted filename order) from the imagery and label folders.
            For 'SO', 'PF' and 'Foods': ``(hypers, gt_labels)``, two lists with
            one array per imagery file.

        Raises
        ------
        ValueError
            If ``Dataset`` is not one of the known codes (this used to return
            ``None`` and fail later at the caller's tuple unpacking).
        FileNotFoundError
            If a single-file dataset has no imagery or label file, or a
            multi-file dataset has fewer label files than imagery files.
        """
        # code -> (folder under ../data, load every file instead of just the first)
        dataset_folders = {
            'SM': ('soil_moisture', False),
            'IN': ('indian_pines', False),
            'SO': ('salient_objects', True),
            'PF': ('plastic_flakes', True),
            'Foods': ('foods', True),
        }
        if Dataset not in dataset_folders:
            raise ValueError(f'Unknown dataset code {Dataset!r}; expected one of {sorted(dataset_folders)}')
        folder, load_every_file = dataset_folders[Dataset]

        # List each folder once and sort it. glob order is arbitrary, and images
        # are matched to labels purely by position, so two unsorted listings
        # could pair an image with another image's labels.
        # NOTE(review): assumes label files sort in the same order as their
        # images (e.g. identical file stems in both folders) -- confirm.
        hyper_files = sorted(glob.glob(f'../data/{folder}/hyperspectral_imagery/*npy'))
        gt_files = sorted(glob.glob(f'../data/{folder}/gt_labels/*npy'))

        if not load_every_file:
            if not hyper_files or not gt_files:
                raise FileNotFoundError(f'No .npy imagery/label file found under ../data/{folder}/')
            return np.load(hyper_files[0]), np.load(gt_files[0])

        if len(gt_files) < len(hyper_files):
            raise FileNotFoundError(
                f'../data/{folder}/ has {len(hyper_files)} imagery files but only {len(gt_files)} label files')

        hypers = []
        gt_labels = []
        # zip stops at the last imagery file, so surplus label files are ignored.
        for hyper_file, gt_file in zip(hyper_files, gt_files):
            hypers.append(np.load(hyper_file))
            gt_labels.append(np.load(gt_file))
        return hypers, gt_labels
    
    
    def normalize(data, mean, std):
        """Standardise ``data``: subtract ``mean``, then divide by ``std``."""
        centered = data - mean
        return centered / std
    
    
    def baseline_supervised_pass(data_fpath, labels_fpath, Dataset, selected_bands):
        """Fit a baseline model on the selected bands and score it on a held-out split.

        Loads the data array from ``data_fpath`` and keeps only the columns in
        ``selected_bands``, loads the labels from ``labels_fpath``, and holds
        out 30% of the rows for validation (``random_state=42``). Both splits
        are standardised with the scalar mean and standard deviation of the
        training split.

        Returns
        -------
        tuple
            ``(mse, mae, r2)`` from a linear regression when ``Dataset == 'SM'``;
            otherwise ``(acc, bac, f1)`` from a multinomial logistic regression
            fitted on a label-stratified split, with macro-averaged F1.
        """
        is_regression = Dataset == 'SM'

        features = np.load(data_fpath)[:, selected_bands]
        targets = np.load(labels_fpath)

        # train val split; only the classification datasets are stratified
        # (stratify=None is train_test_split's default, i.e. no stratification).
        x_train, x_val, y_train, y_val = train_test_split(
            features,
            targets,
            test_size=0.3,
            random_state=42,
            stratify=None if is_regression else targets,
        )

        if is_regression:
            model = LinearRegression()
        else:
            model = LogisticRegression(multi_class='multinomial')

        # Statistics come from the training split only and are scalars taken
        # over the whole array, not per band.
        mu = np.mean(x_train)
        sigma = np.std(x_train)
        x_train = normalize(x_train, mu, sigma)
        x_val = normalize(x_val, mu, sigma)

        model.fit(x_train, y_train)
        y_pred = model.predict(x_val)

        if is_regression:
            return (mean_squared_error(y_val, y_pred),
                    mean_absolute_error(y_val, y_pred),
                    r2_score(y_val, y_pred))

        return (accuracy_score(y_val, y_pred),
                balanced_accuracy_score(y_val, y_pred),
                f1_score(y_val, y_pred, average='macro'))
        
    ## Plastic flakes dataset
    # Stacks all images vertically, scores the selected bands on a random pixel
    # sample, then fits one baseline model per image and averages the metrics.

    if do_plastic_flakes:

        # load data
        hyper, gt = load_datasets(
            'PF')

        hyper, gt = np.array(hyper), np.array(gt)

        print('\nDataset info...')
        print('The shape of the original imagery:', hyper.shape)
        print('The shape of the original labels:', gt.shape)

        # (image, pixel, band) -> (image*pixel, band), same row order as copying
        # image by image. Reshaping avoids allocating a second full-size
        # float64 copy of the whole stack just to sample from it.
        hyper_stacked = hyper.reshape(hyper.shape[0]*hyper.shape[1], hyper.shape[-1])
        gt_multiple = gt.reshape(gt.shape[0]*gt.shape[1]).astype(np.float64)

        print('The shape of the vertically stacked images:', hyper_stacked.shape)
        print('The shape of the vertically stacked labels:', gt_multiple.shape)

        # Random pixel rows, drawn with replacement (randint). Named
        # sample_indices so the run-label list `indices` built earlier in the
        # notebook is not overwritten. Only the imagery is sub-sampled.
        sample_indices = np.random.randint(0, hyper_stacked.shape[0], int(hyper_stacked.shape[0]*plastic_flakes_sample))
        hyper_multiple = hyper_stacked[sample_indices, :].astype(np.float64, copy=False)
        print('The shape of the sub-sampled vertically stacked images:', hyper_multiple.shape)

        # rewards
        correlation = calculate_correlations(hyper_multiple, selected_bands)
        print('\nCorrelation:', correlation)
        mi = calculate_mutual_infos(hyper_multiple, selected_bands)
        print('Normalized mutual information:', mi)

        # Single .loc write: chained `master_df[col][index] = ...` is not
        # guaranteed to reach the frame and never does under Copy-on-Write.
        master_df.loc[index, 'corr'] = correlation
        master_df.loc[index, 'mi'] = mi

    if do_plastic_flakes and do_supervised_modeling:

        # baseline models

        data_path = '../data/plastic_flakes/'
        hsi_paths = glob.glob(data_path + 'hyperspectral_imagery/*.npy')

        # Each image's labels live under gt_labels/ with the same file stem.
        # Take the stem from the last path component instead of counting dots
        # in the whole path, which broke as soon as the path shape changed.
        gt_paths = []
        for hsi_path in hsi_paths:
            num = hsi_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
            gt_paths.append(data_path + f'gt_labels/{num}.npy')

        accs, bacs, f1s = [], [], []
        for hsi_path, gt_path in zip(hsi_paths, gt_paths):
            acc, bac, f1 = baseline_supervised_pass(hsi_path, gt_path, 'PF', selected_bands)
            accs.append(acc)
            bacs.append(bac)
            f1s.append(f1)

        print(f'\nModel performance with RL agent selected bands...')
        print(f'Validation Accuracy: {np.mean(accs)}')
        print(f'Validation Balanced Accuracy: {np.mean(bacs)}')
        print(f'Validation Macro Averaged F1 Score: {np.mean(f1s)}')
        master_df.loc[index, 'acc / r^2'] = np.mean(accs)
        master_df.loc[index, 'bac / mae'] = np.mean(bacs)
        master_df.loc[index, 'f1 / mse'] = np.mean(f1s)
    ## Salient objects dataset
    # Stacks all images vertically, scores the selected bands on a random pixel
    # sample, then fits one baseline model per image and averages the metrics.

    if do_salient_objects:

        # load data
        hyper, gt = load_datasets(
            'SO')

        hyper, gt = np.array(hyper), np.array(gt)

        print('\nDataset info...')
        print('The shape of the original imagery:', hyper.shape)
        print('The shape of the original labels:', gt.shape)

        # (image, pixel, band) -> (image*pixel, band), same row order as copying
        # image by image. Reshaping avoids allocating a second full-size
        # float64 copy of the stack (47185920 x 81 in the recorded run) just to
        # sample from it.
        hyper_stacked = hyper.reshape(hyper.shape[0]*hyper.shape[1], hyper.shape[-1])
        gt_multiple = gt.reshape(gt.shape[0]*gt.shape[1]).astype(np.float64)

        print('\nDataset info...')
        print('The shape of the vertically stacked images:', hyper_stacked.shape)
        print('The shape of the vertically stacked labels:', gt_multiple.shape)

        # randomly sample the stacked pixels (drawn with replacement by randint).
        # Named sample_indices so the run-label list `indices` built earlier in
        # the notebook is not overwritten. Only the imagery is sub-sampled.
        sample_indices = np.random.randint(0, hyper_stacked.shape[0], int(hyper_stacked.shape[0]*salient_objects_sample))
        hyper_multiple = hyper_stacked[sample_indices, :].astype(np.float64, copy=False)
        print('The shape of the sub-sampled vertically stacked images:', hyper_multiple.shape)

        # rewards
        correlation = calculate_correlations(hyper_multiple, selected_bands)
        print('\nCorrelation:', correlation)
        mi = calculate_mutual_infos(hyper_multiple, selected_bands)
        print('Normalized mutual information:', mi)

        # Single .loc write: chained `master_df[col][index] = ...` is not
        # guaranteed to reach the frame and never does under Copy-on-Write.
        master_df.loc[index, 'corr'] = correlation
        master_df.loc[index, 'mi'] = mi

    if do_salient_objects and do_supervised_modeling:
        # baseline models

        data_path = '../data/salient_objects/'
        hsi_paths = glob.glob(data_path + 'hyperspectral_imagery/*.npy')

        # Each image's labels live under gt_labels/ with the same file stem.
        # Take the stem from the last path component instead of counting dots
        # in the whole path, which broke as soon as the path shape changed.
        gt_paths = []
        for hsi_path in hsi_paths:
            num = hsi_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
            gt_paths.append(data_path + f'gt_labels/{num}.npy')

        accs, bacs, f1s = [], [], []
        for hsi_path, gt_path in zip(hsi_paths, gt_paths):
            acc, bac, f1 = baseline_supervised_pass(hsi_path, gt_path, 'SO', selected_bands)
            accs.append(acc)
            bacs.append(bac)
            f1s.append(f1)

        print(f'\nModel performance with RL agent selected bands...')
        print(f'Validation Accuracy: {np.mean(accs)}')
        print(f'Validation Balanced Accuracy: {np.mean(bacs)}')
        print(f'Validation Macro Averaged F1 Score: {np.mean(f1s)}')
        master_df.loc[index, 'acc / r^2'] = np.mean(accs)
        master_df.loc[index, 'bac / mae'] = np.mean(bacs)
        master_df.loc[index, 'f1 / mse'] = np.mean(f1s)
    ## Indian Pines dataset
    # Single image: score the selected bands on the full array, then fit one
    # baseline classifier.

    if do_indian_pines:

        # load data
        hyper, gt = load_datasets(
            'IN')

        print('\nDataset info...')
        print('The shape of the original imagery:', hyper.shape)
        print('The shape of the original labels:', gt.shape)

        # rewards
        correlation = calculate_correlations(hyper, selected_bands)
        print('\nCorrelation:', correlation)
        mi = calculate_mutual_infos(hyper, selected_bands)
        print('Normalized mutual information:', mi)

        # Single .loc write: chained `master_df[col][index] = ...` is not
        # guaranteed to reach the frame and never does under Copy-on-Write.
        master_df.loc[index, 'corr'] = correlation
        master_df.loc[index, 'mi'] = mi

    if do_indian_pines and do_supervised_modeling:

        # baseline models

        data_fpath = '../data/indian_pines/hyperspectral_imagery/indian_pines_corrected.npy'
        labels_fpath = '../data/indian_pines/gt_labels/indian_pines_gt.npy'

        # model restricted to the bands selected by the RL agent
        print(f'\nModel performance with RL agent selected bands...')
        acc, bac, f1 = baseline_supervised_pass(data_fpath, labels_fpath, 'IN', selected_bands)
        print(f'Validation Accuracy: {acc}')
        print(f'Validation Balanced Accuracy: {bac}')
        print(f'Validation Macro Averaged F1 Score: {f1}')
        master_df.loc[index, 'acc / r^2'] = acc
        master_df.loc[index, 'bac / mae'] = bac
        master_df.loc[index, 'f1 / mse'] = f1

    ## Soil moisture dataset
    # load data

    if do_soil_moisture:

        # Single array: score the selected bands on the full data.
        hyper, gt = load_datasets(
            'SM')

        print('\nDataset info...')
        print('The shape of the original imagery:', hyper.shape)
        print('The shape of the original labels:', gt.shape)

        # rewards
        correlation = calculate_correlations(hyper, selected_bands)
        print('\nCorrelation:', correlation)
        mi = calculate_mutual_infos(hyper, selected_bands)
        print('Normalized mutual information:', mi)

        # Single .loc write: chained `master_df[col][index] = ...` is not
        # guaranteed to reach the frame and never does under Copy-on-Write.
        master_df.loc[index, 'corr'] = correlation
        master_df.loc[index, 'mi'] = mi

    if do_soil_moisture and do_supervised_modeling:

        # baseline models

        data_fpath = '../data/soil_moisture/hyperspectral_imagery/soil_moisture_hyper.npy'
        labels_fpath = '../data/soil_moisture/gt_labels/soil_moisture_gt.npy'

        print(f'\nModel performance with RL agent selected bands...')

        # Regression task: the shared metric columns hold r^2 / MAE / MSE here
        # instead of accuracy / balanced accuracy / F1.
        mse, mae, r2 = baseline_supervised_pass(data_fpath, labels_fpath, 'SM', selected_bands)
        print(f'Validation MSE: {mse}')
        print(f'Validation MAE: {mae}')
        print(f'Validation r2: {r2}')
        master_df.loc[index, 'acc / r^2'] = r2
        master_df.loc[index, 'bac / mae'] = mae
        master_df.loc[index, 'f1 / mse'] = mse
    ## Foods dataset
    # Score the selected bands on the first Foods image, then fit one baseline
    # classifier.

    if do_foods:

        # load data
        hyper, gt = load_datasets(
            'Foods')

        # load_datasets returns lists for this dataset; only the first image
        # and its labels are used.
        hyper, gt = hyper[0], gt[0]

        print('\nDataset info...')
        print('The shape of the original imagery:', hyper.shape)
        print('The shape of the original labels:', gt.shape)

        # rewards
        correlation = calculate_correlations(hyper, selected_bands)
        print('\nCorrelation:', correlation)
        mi = calculate_mutual_infos(hyper, selected_bands)
        print('Normalized mutual information:', mi)

        # Single .loc write: chained `master_df[col][index] = ...` is not
        # guaranteed to reach the frame and never does under Copy-on-Write.
        master_df.loc[index, 'corr'] = correlation
        master_df.loc[index, 'mi'] = mi

    if do_foods and do_supervised_modeling:

        # baseline models

        data_fpath = '../data/foods/hyperspectral_imagery/foods_hyper.npy'
        labels_fpath = '../data/foods/gt_labels/foods_gt.npy'

        # model restricted to the bands selected by the RL agent. The old
        # "30 band random subset" log line was removed: it described a
        # different experiment from the one this call runs.
        print(f'\nModel performance with RL agent selected bands...')
        acc, bac, f1 = baseline_supervised_pass(data_fpath, labels_fpath, 'Foods', selected_bands)
        print(f'Validation Accuracy: {acc}')
        print(f'Validation Balanced Accuracy: {bac}')
        print(f'Validation Macro Averaged F1 Score: {f1}')
        print('')

        master_df.loc[index, 'acc / r^2'] = acc
        master_df.loc[index, 'bac / mae'] = bac
        master_df.loc[index, 'f1 / mse'] = f1

    print('..done\n')

SalientObjects_DDQN_correlation

Dataset info...
The shape of the original imagery: (60, 786432, 81)
The shape of the original labels: (60, 786432)

Dataset info...
The shape of the vertically stacked images: (47185920, 81)
The shape of the vertically stacked images: (47185920,)
The shape of the sub-sampled vertically stacked images: (471859, 81)

Correlation: 0.07764325678197317
Normalized mutual information: 0.08009992363726799

Model performance with RL agent selected bands...
Validation Accuracy: 0.9168932169146217
Validation Balanced Accuracy: 0.5849926331033124
Validation Macro Averaged F1 Score: 0.5984953416405829
..done

PlasticFlakes_DQN_correlation

Dataset info...
The shape of the original imagery: (11, 112128, 224)
The shape of the original labels: (11, 112128)
The shape of the vertically stacked images: (1233408, 224)
The shape of the vertically stacked images: (1233408,)
The shape of the sub-sampled vertically stacked images: (123340, 224)

Correlation: 0.9253206505719632

In [53]:
# Display the results table now that the processing loop has written the
# metrics for each RL run into its row.
master_df

Unnamed: 0,acc / r^2,bac / mae,f1 / mse,corr,mi
PlasticFlakes_DQN_correlation,0.957525,0.929751,0.929887,0.925321,0.588455
PlasticFlakes_DDQN_correlation,0.95849,0.931121,0.930977,0.92261,0.585026
PlasticFlakes_AC_correlation,,,,,
PlasticFlakes_SAC_correlation,,,,,
PlasticFlakes_DQN_mutual_info,0.959019,0.932308,0.932099,0.92375,0.603999
PlasticFlakes_DDQN_mutual_info,0.960322,0.932655,0.933784,0.91768,0.573639
PlasticFlakes_AC_mutual_info,,,,,
PlasticFlakes_SAC_mutual_info,,,,,
PlasticFlakes_variance_rank,,,,,
PlasticFlakes_random_combination,0.958792,0.931641,0.931507,0.922004,0.596827


In [54]:
# Persist the filled-in results table to a new spreadsheet, leaving the
# "no_rl" input file untouched.
master_df.to_excel(
    excel_writer='/Users/danielfurman/Desktop/Berkeley/Semester-3/Deep RL/final_project/modeling_test_results_with_rl.xlsx',
)