# Analysis on Delta

In [None]:
from pathlib import Path
import pandas as pd

# apply threshold
def apply_threshold(input, thd_low, thd_high):
    """Partition the rows of ``input`` by their ``permanent_area`` value.

    Args:
        input: DataFrame with a ``permanent_area`` column.
        thd_low: Lower bound of the stable band (inclusive).
        thd_high: Upper bound of the stable band (inclusive).

    Returns:
        A ``(neg, stb, pos)`` tuple of DataFrames: rows strictly below
        ``thd_low``, rows inside ``[thd_low, thd_high]``, and rows strictly
        above ``thd_high``. Rows whose ``permanent_area`` is NaN fail every
        comparison and therefore appear in none of the three groups.
    """
    area = input['permanent_area']
    is_below = area < thd_low
    is_above = area > thd_high
    is_within = (area >= thd_low) & (area <= thd_high)
    return input[is_below], input[is_within], input[is_above]


# Input/output locations, all relative to the current working directory.
data_dir = Path('data')
ttest_dir = Path("outputs_ttest")
delta_dir = Path("outputs_delta")

# Sub-folder of the output directories to analyse and the basin level whose
# CSV files are loaded below.
folder = "Reservoirs" # Reservoirs
basin_level = 4


''' t-TEST and u-TEST '''
# ttest = pd.read_csv(ttest_dir / folder / f"basins_level_{basin_level}_ttest.csv")

''' Delta (2017)  '''
# Delta table for the selected basin level; only rows with start_year == 2017
# are kept, and their row count is reported as the number of basins.
delta_all = pd.read_csv(delta_dir / folder / f"basins_level_{basin_level}_ts_delta.csv")
delta_2017 = delta_all[delta_all['start_year']==2017]
total_num_basins = delta_2017.shape[0]
print(f"basin_level: {basin_level}, total number of basins: {total_num_basins}")

# Mean and std of permanent_area, looked up for the selected basin level.
# Use an explicit label-based .loc lookup: the previous chained
# ``df[col][basin_level]`` form relies on Series.__getitem__, which treats an
# integer key as a position when the index is not integer-typed.
df_muStd = pd.read_csv(delta_dir / folder / "basin_level_mean_std.csv").set_index('basin_level')
mean = df_muStd.loc[basin_level, 'mean_permanent_area']
std = df_muStd.loc[basin_level, 'std_permanent_area']
print(f"(all years), mean: {mean}, std: {std}")

# Same statistics, but computed here from the 2017 rows of the
# "outliers removed" delta file via DataFrame.describe().
delta_all_outliers_removed = pd.read_csv(delta_dir / folder / f"basins_level_{basin_level}_ts_delta_outliers_removed.csv")
delta_2017_rm = delta_all_outliers_removed[delta_all_outliers_removed['start_year']==2017]
des_2017 = delta_2017_rm.describe()
mean_2017 = des_2017.loc['mean', 'permanent_area']
std_2017 = des_2017.loc['std', 'permanent_area']
print(f"(2017) mean: {mean_2017}, std: {std_2017}")


res_arr = []
# for start_year in [2000, 2005, 2010, 2015, 2017]:
start_year = 2017
print()
print(f'+++++++++++++++++++++++++++++++++ {start_year} ++++++++++++++++++++++++++++++++++')

# For each alpha, classify the 2017 deltas against two bands of the form
# mean +/- alpha * std: one from the all-years statistics and one from the
# 2017 (outliers removed) statistics. Counts and percentages are printed.
for alpha in (0.5, 1, 1.5, 2):
    print()

    print(f"--------- apply all years thresholds (alpha = {alpha}) -----------")
    thd_low, thd_high = mean - alpha * std, mean + alpha * std
    print(f"(all years, alpha = {alpha}), [{thd_low}, {thd_high}]")
    neg, stb, pos = apply_threshold(delta_2017, thd_low, thd_high)
    n_neg, n_stb, n_pos = len(neg), len(stb), len(pos)
    print(f"neg: {n_neg}, stable: {n_stb}, pos: {n_pos}")
    print(f"neg: {n_neg / total_num_basins * 100}, stable: {n_stb / total_num_basins * 100}, pos: {n_pos / total_num_basins * 100}")

    print(f"---------- apply 2017 thresholds (alpha = {alpha}) --------------")
    thr_2017_low, thr_2017_high = mean_2017 - alpha * std_2017, mean_2017 + alpha * std_2017
    print(f"(2017, alpha = {alpha}), [{thr_2017_low}, {thr_2017_high}]")
    # The 2017 band is applied to the same delta_2017 rows as above.
    neg_th17, stb_th17, pos_th17 = apply_threshold(delta_2017, thr_2017_low, thr_2017_high)
    n_neg17, n_stb17, n_pos17 = len(neg_th17), len(stb_th17), len(pos_th17)
    print(f"neg: {n_neg17}, stable: {n_stb17}, pos: {n_pos17}")
    print(f"neg: {n_neg17 / total_num_basins * 100}, stable: {n_stb17 / total_num_basins * 100}, pos: {n_pos17 / total_num_basins * 100}")

    # Intended row layout for res_arr (currently never appended to):
    # [basin_level, num_basins, mean, std, alpha, thd_low, thd_high, neg.shape[0], stb.shape[0], pos.shape[0]]


# Analysis on U-Test

In [None]:
''' T-Test '''
from pathlib import Path
import pandas as pd

# NOTE: despite the ``ttest_*`` names, this cell reads the U-test outputs
# (``*_utest.csv`` files with ``p_u`` / ``u_score`` columns).
ttest_dir = Path("outputs_utest")

for folder in ("Pemanent_water", "Reservoirs"):  # Reservoirs, Pemanent_water
    print(folder)

    res = []
    # p-value thresholds, largest first.
    p_thd_list = list(reversed([0.001, 0.002, 0.005, 0.01, 0.02, 0.05]))
    p_thd_cols = [f'p_thd_{p_thd}' for p_thd in p_thd_list]
    for basin_level in (0, 3, 4, 5, 6, 7, 8):
        url = ttest_dir / folder / f"basins_level_{basin_level}_utest.csv"
        print(f"reading: {url}")
        ttest = pd.read_csv(url)
        # Row count of the file; identical for every threshold below.
        num_basins = len(ttest)

        num_change_list = []
        for p_thd in p_thd_list:
            # Rows with p_u <= p_thd are counted as changed and split by the
            # sign of u_score; every remaining row is counted as stable.
            df_neg_pos = ttest.loc[ttest['p_u'] <= p_thd]
            u_scores = df_neg_pos['u_score']
            neg = int((u_scores < 0).sum())
            pos = int((u_scores > 0).sum())
            stable = num_basins - neg - pos

            res.append([basin_level, num_basins, 2017, p_thd, neg, stable, pos])

    # One summary CSV per folder, indexed by basin level.
    summary_columns = ['basin_level', 'num_basins', 'start_year', 'p_u_thd', 'neg', 'stable', 'pos']
    ttest_res = pd.DataFrame(res, columns=summary_columns)
    ttest_res = ttest_res.set_index('basin_level')
    ttest_res.to_csv(ttest_dir / f'{folder}_utest_2017.csv')