In [None]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import xesmf as xe
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.manifold import LocallyLinearEmbedding

In [None]:
def open_data(n_months=732, start_year=1950):
    """Load the factual and counterfactual fields plus the yearly ACE series.

    The factual files (``factual/*.nc``) are collapsed over their ``expver``
    dimension with ``np.nansum``, renamed to ``lat``/``lon`` and bilinearly
    regridded with xESMF onto the counterfactual grid. The counterfactual
    dataset (``eth_cfl/*.nc``) gets its ``lon`` coordinate overwritten with
    ``np.arange(-180, 180, 2.5)`` and is cropped to ``lat`` in [-60, 60] and
    ``lon`` in [-80, 20] before it is used as the regridding target.

    Parameters
    ----------
    n_months : int, optional
        Number of leading time steps of the regridded factual data to keep.
        Defaults to 732, the value that used to be hard-coded here.
    start_year : int, optional
        First ``Year`` of ``yearly_activity.csv`` to keep. Defaults to 1950,
        the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(factual, cfl, ace)``: the regridded factual dataset, the cropped
        counterfactual dataset and a NumPy array with the
        ``Accumulated Cyclone Energy`` column of the filtered CSV.
    """
    # NOTE(review): nothing here checks that the number of CSV rows kept by
    # `start_year` matches the number of samples implied by `n_months`;
    # train_test_split will fail later if they differ -- confirm.

    # Open the datasets
    factual = xr.open_mfdataset("factual/*.nc")
    cfl = xr.open_mfdataset("eth_cfl/*.nc", join='inner', compat='override')

    factual = factual.reduce(np.nansum, dim='expver', keep_attrs=True)
    cfl["lon"] = np.arange(-180, 180, 2.5)
    factual = factual.rename({"latitude": "lat", "longitude": "lon"})
    cfl = cfl.sel(lat=slice(-60, 60), lon=slice(-80, 20))

    # Regrid the factual dataset to be the counterfactual's granularity
    ds_out = xr.Dataset(
        {
            "lat": np.array(cfl["lat"]),
            "lon": np.array(cfl["lon"]),
        }
    )
    regridder = xe.Regridder(factual, ds_out, "bilinear")
    factual = regridder(factual)
    factual = factual.isel(time=slice(0, n_months))

    pred_df = pd.read_csv("yearly_activity.csv")
    pred_df = pred_df.loc[pred_df['Year'] >= start_year]
    ace = np.array(pred_df['Accumulated Cyclone Energy'])

    return factual, cfl, ace

In [None]:
# Load the regridded factual fields, the counterfactual dataset and the yearly
# ACE targets, then show the factual dataset as the cell output.
factual, cfl, ace = open_data()
factual

In [None]:
# Experiment 1 features: one value per variable and year.
# Yearly mean of every field, then an unweighted mean over all (lat, lon)
# grid points, converted to a plain NumPy matrix.
# 'YS' (year start) replaces the 'AS' alias, which pandas 2.2 deprecated.
factual = (
    factual.to_array()
    .transpose("time","variable","lat","lon")
    .resample(time='YS').mean()
    .stack(location=['lat','lon'])
    .mean(dim=["location"])
    .to_numpy()
)

In [None]:
# Hold out 30% of the samples for testing; the split is shuffled with a fixed
# seed (random_state=66) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(factual,ace,test_size=0.3,shuffle=True, random_state=66)

In [None]:
# FLAML AutoML configuration: regression task scored by RMSE, FLAML's default
# estimator list, a time budget of 300 and a search log written to
# ./automl_factual.log.
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log",
    }

In [None]:
# Run the AutoML search on the training split only.
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples with the model selected by the search.
y_pred = automl.predict(X_test)

In [None]:
def eval_results(y_test,y_pred):
    """Print regression metrics and show three diagnostic scatter plots.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.

    Prints R^2, RMSE and MAE, then shows (1) observed and predicted values
    against the sample index, (2) predicted against observed and
    (3) the residual ``y_test - y_pred`` against observed with a zero line.
    Returns ``None``.
    """
    print('R^2 Score: ' + str(r2_score(y_test, y_pred)))
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated and later removed from scikit-learn,
    # while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # NOTE(review): the 'LGBM' prefix is printed regardless of which estimator
    # produced y_pred.
    print('LGBM - RMSE: ' + str(rmse))
    print('LGBM - MAE: ' + str(mean_absolute_error(y_test,y_pred)))

    # Observed and predicted values plotted against the test-sample index.
    sample_idx = np.arange(len(y_test))
    plt.scatter(sample_idx, y_test,label='Observed')
    plt.scatter(sample_idx, y_pred,label='Predicted')
    plt.xlabel('Predicted Season')
    plt.ylabel('Predicted ACE')
    plt.title('Testing Results')
    plt.legend()
    plt.show()

    # Predicted against observed.
    plt.scatter(y_test,y_pred)
    plt.xlabel('Observed')
    plt.ylabel('Predicted')
    plt.title('Testing Results - Predicted v Observed')
    plt.show()

    # Residuals against observed, with a reference line at zero.
    plt.scatter(y_test, y_test-y_pred)
    plt.plot(y_test,np.zeros(len(y_test)))
    plt.xlabel('Observed')
    plt.ylabel('Residual from Predicted')
    plt.title('Testing Results - Residual')
    plt.show()

In [None]:
# Report metrics and diagnostic plots for the held-out predictions.
eval_results(y_test,y_pred)

In [None]:
# Reload everything from disk: `factual` was overwritten with a NumPy matrix
# by the previous experiment.
factual, cfl, ace = open_data()

In [None]:
# Experiment 2 features: yearly means with every (lat, lon, variable)
# combination kept as its own column.
# 'YS' (year start) replaces the 'AS' alias, which pandas 2.2 deprecated.
factual = (
    factual.to_array()
    .transpose("time","variable","lat","lon")
    .resample(time='YS').mean()
    .stack(information=['lat','lon','variable'])
    .to_numpy()
)

In [None]:
# 70/30 shuffled split with the same fixed seed as the previous experiment.
X_train, X_test, y_train, y_test = train_test_split(factual,ace,test_size=0.3,shuffle=True, random_state=66)

In [None]:
# Same FLAML configuration as before; note that it writes to the same log
# file name, ./automl_factual.log.
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }

In [None]:
# Run the AutoML search on the training split.
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples.
y_pred = automl.predict(X_test)

In [None]:
# Metrics and diagnostic plots for this experiment.
eval_results(y_test,y_pred)

In [None]:
# Reload the raw datasets for the next experiment.
factual, cfl, ace = open_data()

In [None]:
# Yearly means flattened to one column per (lat, lon, variable); input to the
# PCA experiment below.
# 'YS' (year start) replaces the 'AS' alias, which pandas 2.2 deprecated.
factual = (
    factual.to_array()
    .transpose("time","variable","lat","lon")
    .resample(time='YS').mean()
    .stack(information=['lat','lon','variable'])
    .to_numpy()
)

In [None]:
# Standardise every feature column, then transpose so that rows are features
# and columns are samples, and print the resulting shape.
# NOTE(review): the scaler is fitted on all samples before any train/test
# split, so the rows that later end up in the test set influence the scaling
# -- confirm this is acceptable.
factual = StandardScaler().fit_transform(factual)
factual = factual.T
print(np.shape(factual))

In [None]:
# PCA is fitted on the transposed (feature x sample) matrix; only the fitted
# `components_` are used afterwards, so `fit` is enough and the transformed
# matrix is no longer computed and discarded.
# scikit-learn rejects n_components larger than min(n_samples, n_features),
# so the requested 63 is capped by the actual matrix shape.
n_components = min(63, *factual.shape)
pca = PCA(n_components=n_components)
pca.fit(factual)

In [None]:
# `components_` has one row per principal component and one column per column
# of the matrix the PCA was fitted on; transposing it gives one row per
# column of that matrix and one column per component.
factual = pca.components_.T

In [None]:
# 70/30 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(factual,ace,test_size=0.3,shuffle=True, random_state=66)

In [None]:
# Configure and run the FLAML search in one cell (same settings and log file
# name as the earlier experiments).
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples.
y_pred = automl.predict(X_test)

In [None]:
# Metrics and diagnostic plots for the PCA experiment.
eval_results(y_test,y_pred)

In [None]:
# Reload the data and rebuild the yearly (lat, lon, variable) feature matrix
# for the locally-linear-embedding experiment.
# 'YS' (year start) replaces the 'AS' alias, which pandas 2.2 deprecated.
factual, cfl, ace = open_data()
factual = (
    factual.to_array()
    .transpose("time","variable","lat","lon")
    .resample(time='YS').mean()
    .stack(information=['lat','lon','variable'])
    .to_numpy()
)

In [None]:
# Standardise every feature column (fitted on all samples, before the split)
# and print the matrix shape.
factual = StandardScaler().fit_transform(factual)
print(np.shape(factual))

In [None]:
# Fit a locally linear embedding with 62 output dimensions.
# NOTE(review): 732 monthly steps resampled to yearly means would give about
# 61 samples; confirm that n_components=62 is valid for the actual number of
# rows, since the embedding dimension cannot exceed what the sample count
# supports.
lle = LocallyLinearEmbedding(n_components=62)
lle.fit(factual)

In [None]:
# Replace the features with the embedding learned during fit.
factual = lle.embedding_
print(np.shape(factual))

In [None]:
# 70/30 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(factual,ace,test_size=0.3,shuffle=True, random_state=66)

In [None]:
# Configure and run the FLAML search (same settings and log file name as the
# earlier experiments).
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples.
y_pred = automl.predict(X_test)

In [None]:
# Metrics and diagnostic plots for the LLE experiment.
eval_results(y_test,y_pred)

In [None]:
def water_open_data(n_months=732, start_year=1959):
    """Load factual/counterfactual fields, yearly ACE and a regridded mask.

    Same processing as the plain loader (collapse ``expver`` with
    ``np.nansum``, rename to ``lat``/``lon``, bilinear xESMF regridding onto
    the cropped counterfactual grid) with one addition: the dataset in
    ``land_mask_gen.nc`` is regridded onto the full counterfactual grid and
    then cropped to the same ``lat`` [-60, 60] / ``lon`` [-80, 20] window.

    Parameters
    ----------
    n_months : int, optional
        Number of leading time steps of the regridded factual data to keep.
        Defaults to 732, the value that used to be hard-coded here.
    start_year : int, optional
        First ``Year`` of ``yearly_activity.csv`` to keep. Defaults to 1959,
        the value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(factual, cfl, ace, land_mask)``.
    """
    # Open the datasets
    factual = xr.open_mfdataset("factual/*.nc")
    cfl = xr.open_mfdataset("eth_cfl/*.nc", join='inner', compat='override')
    land_mask = xr.open_dataset("land_mask_gen.nc")

    factual = factual.reduce(np.nansum, dim='expver',keep_attrs=True)
    cfl["lon"] = np.arange(-180,180,2.5)
    factual = factual.rename({"latitude":"lat","longitude":"lon"})
    land_mask = land_mask.rename({"latitude":"lat","longitude":"lon"})

    # The mask is regridded onto the *uncropped* counterfactual grid and
    # cropped afterwards, exactly as before.
    # NOTE(review): reuse_weights=True is passed without a weights file name;
    # whether xESMF accepts that depends on the installed version -- confirm.
    ds_out_land = xr.Dataset(
        {
            "lat": np.array(cfl["lat"]),
            "lon": np.array(cfl["lon"]),
        }
    )
    regridder_mask = xe.Regridder(land_mask, ds_out_land, "bilinear",reuse_weights=True)
    land_mask = regridder_mask(land_mask)

    cfl = cfl.sel(lat=slice(-60,60),lon=slice(-80,20))
    land_mask = land_mask.sel(lat=slice(-60,60),lon=slice(-80,20))

    # Regrid the factual dataset to be the counterfactual's granularity.
    # (A second, never-used `ds_out_land` used to be built here; removed.)
    ds_out = xr.Dataset(
        {
            "lat": np.array(cfl["lat"]),
            "lon": np.array(cfl["lon"]),
        }
    )
    regridder = xe.Regridder(factual, ds_out, "bilinear",reuse_weights=True)
    factual = regridder(factual)
    factual = factual.isel(time=slice(0,n_months))

    pred_df = pd.read_csv("yearly_activity.csv")
    pred_df = pred_df.loc[pred_df['Year'] >= start_year]
    ace = np.array(pred_df['Accumulated Cyclone Energy'])

    return factual,cfl,ace, land_mask

In [None]:
# Load the data together with the regridded mask.
factual,cfl,ace, land_mask = water_open_data()

In [None]:
# Convert to a single DataArray ordered (time, lat, lon, variable); the yearly
# resampling is postponed until after the mask has been applied.
factual = factual.to_array().transpose("time","lat","lon","variable")
#factual = factual.resample(time='AS').mean()

In [None]:
# Turn the mask dataset into a DataArray (adds a leading "variable" axis).
land_mask = land_mask.to_array()

In [None]:
# Expand the (lat, lon) mask to the full (time, lat, lon, variable) shape of
# `factual`.
# The previous dstack-then-reshape approach tiled the values along the last
# axis and then reinterpreted the flat buffer, which shuffles mask values
# between grid points instead of repeating the mask per time step/variable;
# it also hard-coded 756 time steps although `factual` holds 732.
# Broadcasting keeps every mask value attached to its own grid point and
# takes the sizes from `factual` itself.
# Assumes the mask holds exactly one field on the same (lat, lon) grid as
# `factual`, with lat before lon (the reshape fails loudly otherwise).
mask_2d = np.asarray(land_mask).reshape(factual.shape[1:3])
land_mask = np.broadcast_to(mask_2d[np.newaxis, :, :, np.newaxis], factual.shape)

In [None]:
# Grid points whose mask value is below 0.5 become NaN, the rest are kept
# (multiplied by 1). `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0.
# NOTE(review): if the mask is 1 over land, this keeps land points rather
# than water points -- confirm which is intended.
land_mask = np.where(land_mask < 0.5, np.nan, 1)
factual = np.multiply(factual,land_mask)

In [None]:
# Yearly means of the masked fields, flattened to one column per
# (lat, lon, variable).
# 'YS' (year start) replaces the 'AS' alias, which pandas 2.2 deprecated.
factual = (
    factual.resample(time='YS').mean()
    .stack(information=['lat','lon','variable'])
    .to_numpy()
)
# Drop every column that contains a NaN in any year (the masked-out points).
factual = factual[:, ~np.isnan(factual).any(axis=0)]

In [None]:
# 70/30 shuffled split with a fixed seed; `ace` here is the series returned by
# water_open_data().
X_train, X_test, y_train, y_test = train_test_split(factual,ace,test_size=0.3,shuffle=True, random_state=66)

In [None]:
# Configure and run the FLAML search (same settings and log file name as the
# earlier experiments).
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples and report metrics/plots.
y_pred = automl.predict(X_test)
eval_results(y_test,y_pred)

In [None]:
def open_monthly_data():
    """Read the monthly ACE and storm-count tables as flat NumPy arrays.

    Both ``monthly_activity_ace.csv`` and ``monthly_activity_num_storms.csv``
    are read as whitespace-separated text. The first 9 lines of each file are
    skipped; on every following line the first field is ignored and fields
    1..12 are parsed as floats and appended in file order.

    Lines are paired by position, driven by the ACE file. A line that is
    blank in both files (for example a trailing empty line) is skipped.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mace, mns)``: the ACE values and the storm counts, each 1-D with
        12 entries per data line.

    Raises
    ------
    IndexError
        If a data line has fewer than 13 fields, or the storm-count file is
        missing a line that the ACE file has.
    """
    header_rows = 9  # number of leading lines skipped in both files

    # Context managers close the files even if reading fails.
    with open("monthly_activity_ace.csv","r") as mace_file:
        mace_contents = mace_file.readlines()
    with open("monthly_activity_num_storms.csv","r") as mns_file:
        mns_contents = mns_file.readlines()

    mace_values = []
    mns_values = []
    for i in range(header_rows, len(mace_contents)):
        # Split each line once instead of once per field.
        mace_fields = mace_contents[i].split()
        mns_fields = mns_contents[i].split() if i < len(mns_contents) else []
        if not mace_fields and not mns_fields:
            continue  # blank in both files
        mace_values.extend(float(mace_fields[j]) for j in range(1, 13))
        mns_values.extend(float(mns_fields[j]) for j in range(1, 13))

    return np.array(mace_values), np.array(mns_values)

In [None]:
# Load the monthly targets and reload the monthly factual fields.
mace,mns = open_monthly_data()
factual,cfl,ace = open_data()

In [None]:
# No yearly resampling here: every time step becomes one sample with one
# column per (lat, lon, variable).
factual = factual.to_array().transpose("time","variable","lat","lon")
factual = factual.stack(information=['lat','lon','variable'])
factual = factual.to_numpy()

In [None]:
# 75/25 shuffled split against the monthly ACE target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(factual,mace,test_size=0.25,shuffle=True, random_state=66)

In [None]:
# Configure and run the FLAML search (same settings and log file name as the
# earlier experiments).
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Keep only the test samples whose observed target is non-zero.
# A boolean mask replaces the per-element `i in list` lookup (quadratic in the
# number of test samples) and keeps X_test_nz two-dimensional even when every
# target is zero.
_nonzero_target = np.asarray(y_test) != 0
zero_idxs_y = np.flatnonzero(~_nonzero_target).tolist()  # positions of the zero targets
y_test_nz = np.asarray(y_test)[_nonzero_target]
X_test_nz = np.asarray(X_test)[_nonzero_target]

In [None]:
# Evaluate on the test samples with a non-zero observed target only.
y_pred_nz = automl.predict(X_test_nz)
eval_results(y_test_nz,y_pred_nz)

In [None]:
# Evaluate on the complete test split.
y_pred = automl.predict(X_test)
eval_results(y_test,y_pred)

In [None]:
# Histogram of the per-feature importances reported by the fitted estimator.
plt.hist(automl.model.estimator.feature_importances_)

In [None]:
# Share of the total feature importance carried by each variable.
# The importances are read once: the attribute used to be fetched again for
# every element inside the loop, and some estimators recompute it on each
# access.
# The features were stacked as (lat, lon, variable), so the variable index is
# the fastest-changing one and every 5th column belongs to the same variable.
# NOTE(review): the offset-to-variable mapping below (0=ua, 1=va, 2=pressure,
# 3=ice, 4=sst) assumes that order of the data variables -- confirm against
# the dataset.
_importances = np.asarray(automl.model.estimator.feature_importances_)
if len(_importances) % 5 != 0:
    raise ValueError("expected a multiple of 5 features, got %d" % len(_importances))
ua_l = list(_importances[0::5])
va_l = list(_importances[1::5])
p_l = list(_importances[2::5])
ice_l = list(_importances[3::5])
sst_l = list(_importances[4::5])
total = sum(sst_l) + sum(ice_l) + sum(p_l) + sum(ua_l) + sum(va_l)
print('SST FI: ' + str(sum(sst_l)/total*100) + "%")
print('Ice FI: ' + str(sum(ice_l)/total*100) + "%")
print('Pressure FI: ' + str(sum(p_l)/total*100) + "%")
print('Ua FI: ' + str(sum(ua_l)/total*100) + "%")
print('Va FI: ' + str(sum(va_l)/total*100) + "%")

In [None]:
# NOTE(review): leftover scratch snippet, disabled. `my_list` is never defined
# anywhere in this notebook, so the line raised NameError whenever the cell
# was executed, and `indices` is not used by any other cell.
# indices = [i for i, x in enumerate(my_list) if x == "whatever"]

In [None]:
# Variable-ablation experiment: three feature sets, each with some variables
# removed, all predicting the monthly ACE target.
mace,mns = open_monthly_data()

# Load and regrid once. open_data() used to be called three times to obtain
# three copies of the same dataset; dropping variables returns a new dataset
# and leaves the source untouched, so one load is enough.
_factual_all,cfl,ace = open_data()
factual_1 = _factual_all.drop("msl")
factual_2 = _factual_all.drop("u10").drop("v10")
factual_3 = _factual_all.drop("siconc").drop("sst")


def _to_feature_matrix(dataset):
    """Flatten a dataset to a (time, lat*lon*variable) NumPy matrix."""
    stacked = dataset.to_array().transpose("time","variable","lat","lon")
    return stacked.stack(information=['lat','lon','variable']).to_numpy()


factual_1 = _to_feature_matrix(factual_1)
factual_2 = _to_feature_matrix(factual_2)
factual_3 = _to_feature_matrix(factual_3)

# Identical 75/25 shuffled split (same seed) for all three feature sets.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(factual_1,mace,test_size=0.25,shuffle=True, random_state=66)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(factual_2,mace,test_size=0.25,shuffle=True, random_state=66)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(factual_3,mace,test_size=0.25,shuffle=True, random_state=66)

In [None]:
# Display the shape of the third feature matrix (samples x features).
np.shape(factual_3)

In [None]:
# One FLAML search per feature subset, all with the same configuration.
# NOTE(review): the three searches share one log file name, so each run
# writes to the same ./automl_factual.log.
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }

automl_1 = AutoML()
automl_1.fit(X_train=X_train_1, y_train=y_train_1, **automl_settings)

automl_2 = AutoML()
automl_2.fit(X_train=X_train_2, y_train=y_train_2, **automl_settings)

automl_3 = AutoML()
automl_3.fit(X_train=X_train_3, y_train=y_train_3, **automl_settings)

In [None]:
# Predict and evaluate each feature-subset model on its own held-out split,
# in the order 1, 2, 3.
_predictions = []
for _model, _X_eval, _y_eval in (
    (automl_1, X_test_1, y_test_1),
    (automl_2, X_test_2, y_test_2),
    (automl_3, X_test_3, y_test_3),
):
    _y_hat = _model.predict(_X_eval)
    eval_results(_y_eval, _y_hat)
    _predictions.append(_y_hat)
y_pred_1, y_pred_2, y_pred_3 = _predictions

In [None]:
# Monthly experiment on masked fields.
mace,mns = open_monthly_data()
factual,cfl,ace, land_mask = water_open_data()
factual = factual.to_array().transpose("time","lat","lon","variable")

# Build the NaN/1 mask on the 2-D (lat, lon) grid and broadcast it over time
# and variable. The earlier dstack-then-reshape expansion reinterpreted a
# tiled buffer, which shuffles mask values between grid points; broadcasting
# keeps each value on its own grid point. `np.nan` replaces the `np.NaN`
# alias removed in NumPy 2.0.
# Assumes the mask holds exactly one field on the same (lat, lon) grid as
# `factual`, with lat before lon (the reshape fails loudly otherwise).
_mask_2d = np.asarray(land_mask.to_array()).reshape(factual.shape[1:3])
_keep_2d = np.where(_mask_2d < 0.5, np.nan, 1)
land_mask = np.broadcast_to(_keep_2d[np.newaxis, :, :, np.newaxis], factual.shape)
factual = np.multiply(factual,land_mask)

# One column per (lat, lon, variable); drop columns containing any NaN.
factual = factual.stack(information=['lat','lon','variable'])
factual = factual.to_numpy()
factual = factual[:, ~np.isnan(factual).any(axis=0)]

# 70/30 shuffled split against the monthly ACE target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(factual,mace,test_size=0.3,shuffle=True, random_state=66)

In [None]:
# Configure and run the FLAML search (same settings and log file name as the
# earlier experiments).
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples and report metrics/plots.
y_pred = automl.predict(X_test)
eval_results(y_test,y_pred)

In [None]:
# Reload the unmasked monthly data and flatten it to one column per
# (lat, lon, variable) for the month-subset experiment.
mace,mns = open_monthly_data()
factual,cfl,ace = open_data()
factual = factual.to_array().transpose("time","variable","lat","lon")
factual = factual.stack(information=['lat','lon','variable'])
factual = factual.to_numpy()


In [None]:
# Keep only the samples whose position within each block of 12 is 4..10.
# NOTE(review): if the series starts in January these are May-November --
# confirm that this is the intended season window.
# The number of time steps comes from `factual` itself instead of a
# hard-coded 732, and the selection is a single boolean mask.
_n_steps = factual.shape[0]
_pos_in_year = np.arange(_n_steps) % 12
_in_season = (_pos_in_year >= 4) & (_pos_in_year <= 10)

new_mace = np.asarray(mace)[:_n_steps][_in_season]
new_factual = factual[_in_season]
print(np.shape(mace))
print(np.shape(factual))
print(np.shape(new_mace))
print(np.shape(new_factual))
# A trailing `print(mac)` was removed: `mac` is not defined anywhere, so the
# cell always ended with a NameError.

In [None]:
# Month-subset experiment: 70/30 shuffled split (fixed seed), FLAML search,
# then prediction and evaluation on the held-out samples, all in one cell.
X_train, X_test, y_train, y_test = train_test_split(new_factual,new_mace,test_size=0.3,shuffle=True, random_state=66)
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)
y_pred = automl.predict(X_test)
eval_results(y_test,y_pred)


In [None]:
# Repeats the prediction and evaluation already done at the end of the
# previous cell, without refitting.
y_pred = automl.predict(X_test)
eval_results(y_test,y_pred)

In [None]:
def unregrid_open_data():
    """Load the datasets like the regridding loader, minus the regridding.

    The factual files are collapsed over ``expver`` with ``np.nansum``,
    renamed to ``lat``/``lon`` and cut to their first 732 time steps while
    staying on their native grid. The counterfactual dataset gets its ``lon``
    coordinate replaced and is cropped to ``lat`` [-60, 60] / ``lon``
    [-80, 20]. Yearly ACE is read from ``yearly_activity.csv`` for
    ``Year >= 1950``.

    Returns ``(factual, cfl, ace)`` with ``ace`` as a NumPy array.
    """
    observed = xr.open_mfdataset("factual/*.nc")
    counterfactual = xr.open_mfdataset("eth_cfl/*.nc", join='inner', compat='override')

    # Factual side: collapse expver, harmonise coordinate names, keep the
    # first 732 time steps (monthly data).
    observed = (
        observed.reduce(np.nansum, dim='expver', keep_attrs=True)
        .rename({"latitude": "lat", "longitude": "lon"})
        .isel(time=slice(0, 732))
    )

    # Counterfactual side: overwrite longitudes, then crop the region.
    counterfactual["lon"] = np.arange(-180, 180, 2.5)
    counterfactual = counterfactual.sel(lat=slice(-60, 60), lon=slice(-80, 20))

    # Yearly ACE targets from 1950 onwards.
    activity = pd.read_csv("yearly_activity.csv")
    recent = activity.loc[activity['Year'] >= 1950]
    yearly_ace = np.array(recent['Accumulated Cyclone Energy'])

    return observed, counterfactual, yearly_ace

In [None]:
# Storm-count experiment: monthly features against the monthly number of
# storms.
# NOTE(review): this calls open_data() (the regridding loader), not the
# unregrid_open_data() defined just above -- confirm which one is intended.
mace,mns = open_monthly_data()
factual,cfl,ace = open_data()
factual = factual.to_array().transpose("time","variable","lat","lon")
factual = factual.stack(information=['lat','lon','variable'])
factual = factual.to_numpy()

In [None]:
# 70/30 shuffled split against `mns` (fixed seed), then the FLAML search with
# the same settings and log file name as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(factual, mns,test_size=0.3,shuffle=True, random_state=66)
automl = AutoML()
automl_settings = {
        "metric": 'rmse',
        "estimator_list": 'auto',
        "task": 'regression',
        "time_budget": 300,
        "log_file_name": "./automl_factual.log"
    }
automl.fit(X_train=X_train, y_train=y_train,**automl_settings)

In [None]:
# Predict the held-out samples and report metrics/plots.
y_pred = automl.predict(X_test)
eval_results(y_test,y_pred)

In [None]:
# Inspect the counterfactual prediction file: print its dimensions, flatten it
# to (time, information) like the training features, and display it with the
# NaN-containing columns removed.
cfl = xr.open_mfdataset("init_ctf_pred.nc")
print(cfl.dims)
cfl = cfl.to_array().transpose("time","variable","lat","lon")
cfl = cfl.stack(information=['lat','lon','variable'])
# After the stack, 'lon' is a level of the 'information' index and no longer
# a dimension, so dropna('lon') raised an error; NaNs are dropped along the
# stacked dimension instead. The result is only displayed, as before.
cfl.dropna('information')

In [None]:
# Display the shape of the current test feature matrix (samples x features).
np.shape(X_test)

In [None]:
def open_ctf_data():
    """Open the counterfactual files and crop them to the study region.

    Reads ``eth_cfl/*.nc`` (inner join, ``compat='override'``), replaces the
    ``lon`` coordinate with ``np.arange(-180, 180, 2.5)`` and returns the
    dataset restricted to ``lat`` in [-60, 60] and ``lon`` in [-80, 20].
    """
    counterfactual = xr.open_mfdataset("eth_cfl/*.nc", join='inner', compat='override')
    counterfactual["lon"] = np.arange(-180, 180, 2.5)
    return counterfactual.sel(lat=slice(-60, 60), lon=slice(-80, 20))

In [None]:
# Load the cropped counterfactual dataset and remove the time-bounds variable.
cfl = open_ctf_data()
cfl = cfl.drop("time_bnds")

In [None]:
# Keep a single pressure level (plev == 20000) of both wind components and
# list the variables that will be used as model inputs.
# NOTE(review): plev units are not visible here; presumably Pa -- confirm.
cfl['ua'] = cfl['ua'].sel(plev=20000)
cfl['va'] = cfl['va'].sel(plev=20000)
var_list = ['SST_cpl','ice_cov','psl', 'ua', 'va']

In [None]:
# Drop the pressure-level coordinate, skip the first 108 time steps and keep
# only the selected variables.
cfl = cfl.drop('plev')
ret = cfl.sel(time=cfl['time'][108:])[var_list]

In [None]:
# Flatten to (time, information) the same way the factual features were built.
# NOTE(review): the variable order follows `var_list`; the trained model
# expects the column order of the factual feature matrix -- confirm that the
# two variable orders (and units) actually match.
ret = ret.to_array().transpose("time","variable","lat","lon")
ret = ret.stack(information=['lat','lon','variable'])

In [None]:
# Display the stacked counterfactual features.
ret

In [None]:
# Predict on the counterfactual features with the most recently fitted model.
# NOTE(review): `ret` is still an xarray object here, whereas every training
# matrix was converted with .to_numpy() -- confirm that predict() handles it.
counterfactual_results = automl.predict(ret)

In [None]:
# Sum of all counterfactual predictions.
np.sum(counterfactual_results)

In [None]:
# Shape of the test feature matrix, for comparison with the inputs below.
np.shape(X_test)

In [None]:
# Uniform random matrix with 12 rows and 13120 columns; the generator is not
# seeded, so the values differ between runs.
ctf_2020 = np.random.rand(12,13120)

In [None]:
# Sum of the model's predictions for the random matrix.
np.sum(automl.predict(ctf_2020))

In [None]:
# Shape of the training feature matrix.
np.shape(X_train)

In [None]:
# Replace the random matrix with values loaded from a text file.
ctf_2020 = np.loadtxt('ctf_ensemble_0.txt')

In [None]:
# Shape of the loaded matrix.
np.shape(ctf_2020)

In [None]:
# Predictions of the current model for the loaded matrix.
automl.predict(ctf_2020)