In [1]:
import csv
import time  # Just to compare fit times
from pathlib import Path
from pprint import pprint

In [76]:
import numpy as np
import ray
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tune_sklearn import TuneSearchCV
from xgboost import XGBRegressor
import pandas as pd
from pathlib import Path
import time
import pickle

In [69]:
def regression_results(y_true, y_pred):
    """Summarise how well ``y_pred`` matches ``y_true`` for a regression task.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    dict
        Scores rounded to 4 decimal places, keyed ``'r2'``, ``'MAE'``,
        ``'MSE'``, ``'RMSE'`` and ``'explained_variance'``.
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # The median absolute error used to be computed here and then discarded;
    # it never reached the returned dict, so the wasted call has been dropped.
    return {
        'r2': round(r2, 4),
        'MAE': round(mean_absolute_error, 4),
        'MSE': round(mse, 4),
        'RMSE': round(np.sqrt(mse), 4),  # derived from the MSE rather than recomputed
        "explained_variance": round(explained_variance, 4),
    }

In [3]:
%load_ext memory_profiler

In [4]:
# !/home/dev/Desktop/Work/uncoverml/venv/bin/ray start --head --port=6379 --num-cpus=1

In [5]:
# !/home/dev/Desktop/Work/uncoverml/venv/bin/ray --version

In [8]:
# Display the installed Ray version so the run environment is recorded with the notebook.
ray.__version__

'1.1.0'

In [7]:
# !/home/dev/Desktop/Work/uncoverml/venv/bin/ray stop

In [16]:
# Start a Ray runtime, requesting 20 CPUs and 4 GPUs.
# NOTE(review): these resource counts are hard-coded for one particular machine —
# adjust them (or move them to a config cell) before running elsewhere.
ray.init(num_cpus=20, num_gpus=4)

2021-02-07 22:39:18,378	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.11.161',
 'raylet_ip_address': '192.168.11.161',
 'redis_address': '192.168.11.161:6379',
 'object_store_address': '/tmp/ray/session_2021-02-07_22-39-17_925678_54681/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-02-07_22-39-17_925678_54681/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-02-07_22-39-17_925678_54681',
 'metrics_export_port': 61559,
 'node_id': '2af2ee503a4ebfe84fdd8c6431d42412fb05b62b'}

In [15]:
# Shut down the Ray runtime.
# NOTE(review): read top to bottom, this cell runs right after ray.init() and
# immediately before the tuning cell, so the explicitly configured runtime is
# already gone when the search starts. The execution counts (In [15] here,
# In [16] on ray.init) suggest it was originally run *before* ray.init —
# confirm the intended order.
ray.shutdown()

In [None]:
# Hyperparameter search: tune one XGBoost regressor per input CSV and pickle
# each fitted search object so that later cells can reload it.

input_files = [
    Path("../data/formated_dataset.csv"),
    Path("../data/scaler_df.csv"),
    Path("../data/quantile_df.csv"),
]


# Columns kept for modelling; "target" is the response, the rest are covariates.
features_to_use = ["target","Grav_lane_clip","clim_PTA_albers","be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata","3dem_mag1_fin","ceno_euc_aust1","be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata","Thorium_2016","dem_fill","relief_elev_focalrange1000m_3s","LATITUDE_GRID1_clip","LOC_distance_to_coast","clim_EPA_albers","be-30y-85m-avg-ND-SWIR1-SWIR2.filled.lzw.nodata","LONGITUDE_GRID1_clip","si_geol1","3dem_mag2","clim_WDA_albers","Dose_2016","Clim_Prescott_LindaGregory","Potassium_2016","mrvbf_9","Rad2016K_Th","be-30y-85m-avg-ND-NIR-GREEN.filled.lzw.nodata","clim_RSM_albers","3dem_mag0.fin","s2-dpca-85m_1","water-85m_3","saga_wetSM_85_reprojected"]

# Search space handed to TuneSearchCV.
# NOTE(review): the best estimator recorded elsewhere in this notebook has
# in-between values (e.g. max_depth=11, n_estimators=660), which suggests the
# two-number lists act as (low, high) bounds rather than two discrete choices —
# verify against the tune-sklearn version in use.
parameters = {
    "objective": ["reg:squarederror"],
    "gamma":[0],
    # NOTE(review): "logloss" and "auc" are classification metrics while the
    # objective is squared-error regression, and fit() below receives no
    # eval_set — confirm this entry influences the search at all.
    "eval_metric": ["rmse", "mae","logloss","auc"],
    "n_estimators":[500, 700],
    "max_depth":[3, 15],
    "subsample":[0.8, 1],
    "min_child_weight":[10, 20],
    "learning_rate":[0.01, 0.2],
    "colsample_bytree":[0.5,1]
}

model=XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist',n_jobs=-1)

xgb_tune_search = TuneSearchCV(
    model,
    parameters,

    search_optimization="bayesian",
    n_jobs=-1,
    n_trials=300,
    early_stopping=False,

    verbose=1,
    return_train_score=True,
    loggers=["csv"],
    use_gpu=True
)

for input_file in input_files:

    print("Input file: ",input_file)
    df = pd.read_csv(input_file).astype('float32')
    # Drop every row that holds NaN, +/-inf or -9999.0 (presumably a nodata
    # marker — TODO confirm) in ANY column of the CSV. The filter runs before
    # the column subset is taken, so unused columns can also eliminate rows.
    # `axis` is passed by keyword: newer pandas rejects the positional form.
    df = df[~df.isin([np.nan, np.inf, -np.inf,-9999.0]).any(axis=1)]
    df = df[features_to_use]
    y_train = df['target']
    X_train = df.drop("target",axis=1)

    # The same search object is refitted for every dataset and pickled each time.
    start = time.time()
    xgb_tune_search.fit(X_train, y_train)
    end = time.time()
    print("Tune Fit Time:", end - start)

    # `with` guarantees the file is closed even if pickling raises.
    with open('xgb_tune_search'+input_file.stem+'.pkl', 'wb') as pkl_file:
        pickle.dump(xgb_tune_search,pkl_file)
    

In [83]:
# Score every tuned model on its own training data and on the matching
# out-of-sample (OOS) file.

# (training CSV, out-of-sample CSV) pairs.
input_files = [
    (Path("../data/formated_dataset.csv"),Path("../data/formated_oos_dataset.csv")),
    (Path("../data/scaler_df.csv"),Path("../data/scaler_oos_df.csv")),
    (Path("../data/quantile_df.csv"),Path("../data/quantile_oos_df.csv"))
]


# Columns kept for modelling; "target" is the response, the rest are covariates.
features_to_use = ["target","Grav_lane_clip","clim_PTA_albers","be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata","3dem_mag1_fin","ceno_euc_aust1","be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata","Thorium_2016","dem_fill","relief_elev_focalrange1000m_3s","LATITUDE_GRID1_clip","LOC_distance_to_coast","clim_EPA_albers","be-30y-85m-avg-ND-SWIR1-SWIR2.filled.lzw.nodata","LONGITUDE_GRID1_clip","si_geol1","3dem_mag2","clim_WDA_albers","Dose_2016","Clim_Prescott_LindaGregory","Potassium_2016","mrvbf_9","Rad2016K_Th","be-30y-85m-avg-ND-NIR-GREEN.filled.lzw.nodata","clim_RSM_albers","3dem_mag0.fin","s2-dpca-85m_1","water-85m_3","saga_wetSM_85_reprojected"]


def load_features_and_target(csv_path, columns):
    """Read ``csv_path`` and return ``(X, y)`` restricted to ``columns``.

    Rows holding NaN, +/-inf or -9999.0 in any column of the CSV are dropped
    before the column subset is taken. ``columns`` must contain ``"target"``,
    which becomes ``y``; the remaining columns become ``X``.
    """
    frame = pd.read_csv(csv_path).astype('float32')
    # `axis` is passed by keyword: newer pandas rejects the positional form.
    frame = frame[~frame.isin([np.nan, np.inf, -np.inf,-9999.0]).any(axis=1)]
    frame = frame[columns]
    return frame.drop("target",axis=1), frame['target']


for input_file,input_oos_file in input_files:

    print("Input file: ",input_file)
    X_train, y_train = load_features_and_target(input_file, features_to_use)

    print("Input oos file: ",input_oos_file)
    X_oos, y_oos = load_features_and_target(input_oos_file, features_to_use)

    # Reload the search object that the tuning cell pickled for this dataset.
    # NOTE(security): pickle.load can execute arbitrary code — only load
    # pickle files produced by this notebook.
    with open('xgb_tune_search'+input_file.stem+'.pkl', "rb") as f:
        xgb_tune_search = pickle.load(f)
    best_model = xgb_tune_search.best_estimator_

    y_train_pred = best_model.predict(X_train)
    results_train = regression_results(y_train, y_train_pred)
    print("Training results:",results_train)

    y_oos_pred = best_model.predict(X_oos)
    results_oos = regression_results(y_oos, y_oos_pred)
    print("OOS results:",results_oos)
    print()
    print()


Input file:  ../data/formated_dataset.csv
Input oos file:  ../data/formated_oos_dataset.csv
Training results: {'r2': 0.9679, 'MAE': 0.0408, 'MSE': 0.0056, 'RMSE': 0.075, 'explained_variance': 0.9679}
OOS results: {'r2': 0.6754, 'MAE': 0.6638, 'MSE': 1.5891, 'RMSE': 1.2606, 'explained_variance': 0.6888}


Input file:  ../data/scaler_df.csv
Input oos file:  ../data/scaler_oos_df.csv
Training results: {'r2': 1.0, 'MAE': 0.0015, 'MSE': 0.0, 'RMSE': 0.0021, 'explained_variance': 1.0}
OOS results: {'r2': 1.0, 'MAE': 0.0015, 'MSE': 0.0, 'RMSE': 0.0021, 'explained_variance': 1.0}


Input file:  ../data/quantile_df.csv
Input oos file:  ../data/quantile_oos_df.csv
Training results: {'r2': 0.9326, 'MAE': 0.0551, 'MSE': 0.0056, 'RMSE': 0.0751, 'explained_variance': 0.9326}
OOS results: {'r2': 0.8552, 'MAE': 0.106, 'MSE': 0.023, 'RMSE': 0.1515, 'explained_variance': 0.8703}




In [59]:
# Reload a previously pickled search object (the cells below read its
# best_estimator_). `pickle` comes from the import cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): the tuning cell writes 'xgb_tune_search<stem>.pkl' files; no
# visible cell writes plain "xgb_tune_search.pkl" — confirm where it comes from.
# NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
with open("xgb_tune_search.pkl", "rb") as f:
    xgb_tune_search = pickle.load(f)


In [60]:
# Display the estimator that the loaded search object reports as its best.
xgb_tune_search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8143159659128333,
             eval_metric='auc', gamma=0, gpu_id=0, importance_type='gain',
             interaction_constraints='', learning_rate=0.10635403959720964,
             max_delta_step=0, max_depth=11, min_child_weight=16, missing=nan,
             monotone_constraints='()', n_estimators=660, n_jobs=-1,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.85619873071933,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [73]:
# xgb_tune_search.cv_results_

In [63]:
## Predict with the selected model and time the prediction
# No fitting happens in this cell: the timer wraps predict() only, so the
# reported duration is prediction time, not tuning time.
# NOTE(review): X_test / y_test are not defined in any visible cell — they must
# come from kernel state; define them explicitly so Restart & Run All works.
start_time = time.time()
y_pred = xgb_tune_search.best_estimator_.predict(X_test)
duration = (time.time() - start_time)/60  # seconds -> minutes
print("results: ",regression_results(y_test, y_pred))
print("XGBOOST prediction time  %s minutes ---" % duration)

results:  {'r2': 0.9092, 'MAE': 0.0609, 'MSE': 0.0076, 'RMSE': 0.0874, 'explained_variance': 0.9092}
XGBOOST HyperParameter Tuning  0.003015621503194173 minutes ---


## OOS Testing

In [70]:
# Load ONE training / out-of-sample (OOS) pair — index 0 of each list — into
# X, y and X_oos, y_oos for the evaluation cells that follow.
input_files = [
    Path("../data/formated_dataset.csv"),
    Path("../data/scaler_df.csv"),
    Path("../data/quantile_df.csv")
]

# OOS counterparts, positionally matched with input_files.
input_oos_files = [
    Path("../data/formated_oos_dataset.csv"),
    Path("../data/scaler_oos_df.csv"),
    Path("../data/quantile_oos_df.csv")
]

# Columns kept for modelling; "target" is the response, the rest are covariates.
features_to_use = ["target","Grav_lane_clip","clim_PTA_albers","be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata","3dem_mag1_fin","ceno_euc_aust1","be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata","Thorium_2016","dem_fill","relief_elev_focalrange1000m_3s","LATITUDE_GRID1_clip","LOC_distance_to_coast","clim_EPA_albers","be-30y-85m-avg-ND-SWIR1-SWIR2.filled.lzw.nodata","LONGITUDE_GRID1_clip","si_geol1","3dem_mag2","clim_WDA_albers","Dose_2016","Clim_Prescott_LindaGregory","Potassium_2016","mrvbf_9","Rad2016K_Th","be-30y-85m-avg-ND-NIR-GREEN.filled.lzw.nodata","clim_RSM_albers","3dem_mag0.fin","s2-dpca-85m_1","water-85m_3","saga_wetSM_85_reprojected"]

input_file = input_files[0]
print("Input file: ",input_file)
df = pd.read_csv(input_file).astype('float32')
# Drop rows holding NaN, +/-inf or -9999.0 in any column of the CSV (the filter
# runs before the column subset is taken). `axis` is passed by keyword: newer
# pandas rejects the positional form.
df = df[~df.isin([np.nan, np.inf, -np.inf,-9999.0]).any(axis=1)]
df = df[features_to_use]
y = df['target']
X = df.drop("target",axis=1)

input_oos_file = input_oos_files[0]
print("Input oos file: ",input_oos_file)
# `df` is rebound to the OOS frame from here on; X and y above are unaffected.
df = pd.read_csv(input_oos_file).astype('float32')
df = df[~df.isin([np.nan, np.inf, -np.inf,-9999.0]).any(axis=1)]
df = df[features_to_use]
y_oos = df['target']
X_oos = df.drop("target",axis=1)




Input file:  ../data/formated_dataset.csv
Input oos file:  ../data/formated_oos_dataset.csv


In [71]:
# Out-of-sample (OOS) evaluation: score the best estimator of the currently
# loaded search object on X_oos / y_oos.
# NOTE(review): `xgb_tune_search` is whatever an earlier cell left in the
# kernel — confirm it is the model tuned for the dataset loaded above.
# TODO: iterate over all of the selected models, not just this one.
best_model = xgb_tune_search.best_estimator_
y_pred = best_model.predict(X_oos)
results_oss = regression_results(y_oos, y_pred)
print("OOS results:",results_oss)



OOS results:
{'r2': 0.7721, 'MAE': 0.5843, 'MSE': 1.1158, 'RMSE': 1.0563, 'explained_variance': 0.7766}


In [72]:
# Training-set evaluation: score the same best estimator on X / y (the
# training data), not on the out-of-sample data.
# `results_oss` is reused as the variable name even though it holds training scores here.
# TODO: iterate over all of the selected models, not just this one.
best_model = xgb_tune_search.best_estimator_
y_pred = best_model.predict(X)
results_oss = regression_results(y, y_pred)
print("Training results:")
print(results_oss)

Training results:
{'r2': 0.9831, 'MAE': 0.0259, 'MSE': 0.003, 'RMSE': 0.0544, 'explained_variance': 0.9831}
