In [2]:
pip install -U tensorboardx

Collecting tensorboardx
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardx
Successfully installed tensorboardx-2.6.2.2


In [3]:
pip install ray

Collecting ray
  Downloading ray-2.6.3-cp310-cp310-manylinux2014_x86_64.whl (56.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ray
Successfully installed ray-2.6.3


In [4]:
pip install xgboost



In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, cross_validate
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from random import randint, uniform

import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler

#import xgboost_ray as xgb_ray
from ray.tune.integration.xgboost import TuneReportCheckpointCallback
from ray.air import session

Define Wells That Will Be Used (Keep consistent with Data Import and Preprocessing Notebooks)

In [6]:
# Well names to load. Later cells look up "VW1" (used as training data) and
# "CCS1" (used as test data) by these exact strings, so both must stay listed.
wells = ["CCS1", "VW1"]

Retrieve Data and Labels

In [9]:
# Per-well containers, keyed by well name.
datalogs = {}   # log data read from norm_<well>.csv, with the FORM column removed
labels = {}     # label data read from label_<well>.csv
datasets = {}   # logs and labels joined on the DEPT column
datalogs_path = "/content/drive/MyDrive/REU Project/Norm CSV Files/norm_"
labels_path = "/content/drive/MyDrive/REU Project/FMI LOGS/labels/label_"
for well in wells:
  datalogs_df = pd.read_csv(datalogs_path + well + ".csv")
  labels_df = pd.read_csv(labels_path + well + ".csv")

  # FORM is removed before modelling; `columns=` alone is sufficient (no axis needed)
  datalogs_df = datalogs_df.drop(columns=["FORM"])

  datalogs[well] = datalogs_df
  labels[well] = labels_df

  # default inner join: only depths present in both files are kept
  dataset = pd.merge(datalogs_df, labels_df, on='DEPT')
  datasets[well] = dataset

# Shuffle row order of the training and test datasets. The shuffle is seeded so
# that every downstream fit, CV split and score is reproducible across re-runs.
VW1_dataset = datasets["VW1"].sample(frac=1, random_state=42)
CCS1_dataset = datasets["CCS1"].sample(frac=1, random_state=42)

# Separate into train and test data: VW1 is the training well, CCS1 the test well.
X_train = VW1_dataset.drop(columns=["HAS_FRAC"]).values
X_test = CCS1_dataset.drop(columns=["HAS_FRAC"]).values
y_train = VW1_dataset["HAS_FRAC"].values
y_test = CCS1_dataset["HAS_FRAC"].values

# Get rid of depth for the ML algorithms by dropping the first column.
# NOTE(review): assumes DEPT is column 0 of the merged frame - confirm against the CSV layout.
X_train_nod = X_train[:, 1:]
X_test_nod = X_test[:, 1:]

Run random forest with randomly chosen hyperparameters for comparison

In [10]:
# Baseline for comparison: a random forest with fixed, hand-picked
# hyperparameters, trained on the VW1 features (depth column removed) and
# scored on the CCS1 features.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features=30,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=0,
).fit(X_train_nod, y_train)  # fit() returns the fitted estimator itself

y_pred = rfc.predict(X_test_nod)

print("Testing Accuracy Score:", accuracy_score(y_test, y_pred))
print("Testing F1 Score:", f1_score(y_test, y_pred))

Testing Accuracy Score: 0.7434194925302348
Testing F1 Score: 0.10132890365448503


Make random forest and use random search to find best hyperparameters

In [13]:
# Candidate values sampled by the random search (10 combinations are tried).
param_grid = {
    "n_estimators": [20, 40, 60, 80, 100, 120, 140, 160, 180],
    "max_depth": [5, 10, 50, 100, 110],
    "max_features": [5, 10 , 15, 20, 25, 30],
    "min_samples_leaf": [1,2,3,4,5,6],
    "min_samples_split": [2,4,6,8,10,12,14]
}

# Both the forest and the search are seeded (same seed as the XGBoost search
# cell) so the sampled candidates, the fitted trees and the reported scores are
# reproducible on a fresh kernel.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score while F1 is what gets reported below - consider
# scoring="f1" if F1 is the metric that matters.
rand_search = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                                 param_grid,
                                 n_iter=10,
                                 cv=5,
                                 verbose=1,
                                 n_jobs=-1,
                                 random_state=42)

rand_search.fit(X_train_nod, y_train)
# predict() uses the best candidate refitted on the full training set
y_pred = rand_search.predict(X_test_nod)
print(rand_search.best_params_)
print("Testing Accuracy Score:", accuracy_score(y_test, y_pred))
print("Testing F1 Score:", f1_score(y_test, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 40, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 10, 'max_depth': 100}
Testing Accuracy Score: 0.7868152715200379
Testing F1 Score: 0.16372093023255815


Make XGBoost and implement random search hyperparameter tuning

In [17]:
# Wrap the training arrays in an XGBoost DMatrix.
dtrain = xgb.DMatrix(X_train_nod, label=y_train)

# Class counts on the training labels; their ratio is offered below as one of
# the two candidate values for scale_pos_weight.
# (presumably y_train holds 0/1 labels, so the sum is the positive count - confirm)
pos_class_count = np.sum(y_train)
neg_class_count = len(y_train) - pos_class_count

# Candidate values for the random search.
hyperparameter_grid = dict(
    max_depth=[5, 10, 50, 100, 110],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.15, 0.2],
    # fraction of training samples used for training each tree
    subsample=[0.2, 0.5, 0.8, 0.9, 1.0],
    # fraction of features used for training each tree
    colsample_bytree=[0.2, 0.5, 0.8, 0.9, 1.0],
    # either no re-weighting, or weight the positive class by the class ratio
    scale_pos_weight=[1, neg_class_count / pos_class_count],
)

# Base estimator whose hyperparameters are searched.
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Seeded random search: 10 sampled candidates, 5-fold CV each.
rand_search = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rand_search.fit(X_train_nod, y_train)

# Best refitted model and the hyperparameters that produced it.
best_params = rand_search.best_params_
best_classifier = rand_search.best_estimator_

# Score the winning model on the test well.
y_pred = best_classifier.predict(X_test_nod)

print("Accuracy score:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))
print("Best hyperparameters", best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy score: 0.8219113113587858
F1 score: 0.1417142857142857
Best hyperparameters {'subsample': 0.5, 'scale_pos_weight': 6.028119507908611, 'max_depth': 50, 'learning_rate': 0.15, 'colsample_bytree': 0.9}


Implement Ray Tune hyperparameter tuning on the XGBoost model (stops trials with low-performing hyperparameters early and focuses on the ones that are working well)

In [18]:
def train_xgb(config):
    """Ray Tune trainable: fit one XGBoost classifier and report its F1 score.

    Reads the notebook-level arrays X_train_nod / y_train (training) and
    X_test_nod / y_test (scoring).

    Args:
        config: dict sampled by Ray Tune with the keys 'max_depth',
            'learning_rate', 'subsample', 'colsample_bytree',
            'min_child_weight', 'reg_alpha', 'reg_lambda' and
            'scale_pos_weight'. 'scale_pos_weight' is either a number or the
            string "ratio", meaning "use negatives / positives computed from
            y_train".

    Reports:
        mean_f1: F1 score of the fitted model, reported once per trial via
            tune.report.

    NOTE(review): the score is computed on X_test_nod / y_test, so the tuner
    picks hyperparameters against the same data later used to judge them; the
    resulting "best f1" is optimistic as a generalisation estimate.
    """
    # Resolve the "ratio" placeholder into the actual class-imbalance ratio.
    if config['scale_pos_weight'] == "ratio":
        pos_class_count = np.sum(y_train)
        neg_class_count = len(y_train) - pos_class_count
        scale_pos_weight = neg_class_count / pos_class_count
    else:
        scale_pos_weight = config['scale_pos_weight']

    # The scikit-learn wrapper below is fitted on the raw arrays directly, so
    # no DMatrix is built here (the previous per-trial DMatrix was never used).
    hyperparameters = {
        'objective': 'binary:logistic',
        'seed': 42,
        'max_depth': config['max_depth'],
        'learning_rate': config['learning_rate'],
        'subsample': config['subsample'],  # fraction of training samples used for training each tree
        'colsample_bytree': config['colsample_bytree'],  # fraction of features used for training each tree
        'min_child_weight': config['min_child_weight'],
        'reg_alpha': config['reg_alpha'],
        'reg_lambda': config['reg_lambda'],
        'scale_pos_weight': scale_pos_weight,  # can assign higher weight to positive class
    }

    # Train the XGB classifier and use it to make predictions.
    xgb_classifier = XGBClassifier(**hyperparameters)
    xgb_classifier.fit(X_train_nod, y_train)

    y_pred = xgb_classifier.predict(X_test_nod)

    f1 = f1_score(y_test, y_pred)
    tune.report(mean_f1=f1)

#Configure Raytune
ray.shutdown() #in case it didn't shut down right last time
ray.init()

scheduler = ASHAScheduler( #works well to gradually elimiate bad performing configurations
    metric="mean_f1",
    mode="max",
    max_t = 100
)

#run analysis
analysis = tune.run(
    train_xgb,
    num_samples = 500, #number of times to sample from hyperparameter space
    config={
            'max_depth': tune.choice([5, 25, 50, 75, 100, 125]),
            'learning_rate': tune.loguniform(0.001, 0.1),
            'subsample' :  tune.uniform(0.2, 1.0), #fraction of training samples used for training each tree
            'colsample_bytree' : tune.uniform(0.2, 1.0), #fraction of features used for training each tree
            'min_child_weight' : tune.uniform(2, 10),
            'reg_alpha' : tune.uniform(0, 1),
            'reg_lambda' : tune.uniform(0, 1),
            'scale_pos_weight' : tune.choice([1, "ratio"]), #can assign higher weight to positive class
    },
    scheduler=scheduler,
    verbose=1
)

#get best hyperparameters and accuracy
best_trial = analysis.get_best_trial("mean_f1", "max")
best_hyperparameters = best_trial.config
best_f1 = best_trial.last_result["mean_f1"]
print("Best configuration:", best_hyperparameters)
print("best f1:", best_f1)

2023-08-22 02:44:48,987	INFO worker.py:1621 -- Started a local Ray instance.
2023-08-22 02:44:50,660	INFO tune.py:666 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
+------------------------------------------------+
| time_this_iter_s                       0.58135 |
| time_total_s                           0.58135 |
| training_iteration                           1 |
| mean_f1                                0.16561 |
+------------------------------------------------+

Trial train_xgb_dd946_00318 started with configuration:
+------------------------------------------------+
| Trial train_xgb_dd946_00318 config             |
+------------------------------------------------+
| colsample_bytree                       0.22837 |
| learning_rate                          0.01271 |
| max_depth                                  125 |
| min_child_weight                       3.18118 |
| reg_alpha                              0.49233 |
| reg_lambda                             0.57186 |
| scale_pos_weight                         ratio |
| subsample                              0.36757 |
+----------