### Import the required libraries 

In [None]:
import odbind as odb
from odbind.survey import Survey
from odbind.well import Well
import pandas as pd
import numpy as np

In [None]:
%load_ext autoreload
%autoreload 2

### Load the survey from OpendTect

In [None]:
sdata = Survey("UnderGrad_Proj")

### Well Data

In [None]:
wells = Well.names(sdata)

In [None]:
wells

#### Well log information

In [None]:
well1 = Well(sdata,wells[0])


well1.log_info_dataframe()

In [None]:
well1.track_dataframe()

#### Load all the well logs into a pandas DataFrame

In [None]:
# Build one log table per well. Each table gets a 'WELL' column holding the
# well's name so rows stay attributable after the tables are combined.
well_list = []
for well_name in wells:
    well_obj = Well(sdata, well_name)
    well_logs = well_obj.logs_dataframe()[0]
    well_logs['WELL'] = well_name
    well_list.append(well_logs)

In [None]:
well_df = pd.concat(well_list)

In [None]:
well_df.T

#### Names of all the available columns in the df

In [None]:
hd = list(well_df.columns)

#### Select the columns to be used in the example

In [None]:
# Column names retained for the example, in the order they will appear.
sel_hd = [
    'dah',
    'PEM2010_INPUT_DTC_ISO_1',
    'PEM2010_INPUT_NP_1',
    'PEM2010_INPUT_RHOB_1',
    'PEM2010_INPUT_RT_1',
    'PEM2010_INPUT_SWE_1',
    'PEM2010_INPUT_SWT_1',
    'PEM2010_INPUT_VCL_1',
    'WELL',
]

In [None]:
df = well_df[sel_hd]


In [None]:
df.head()

In [None]:
df.tail()

### Exploratory Data Analysis

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
import seaborn as sn

In [None]:
sn.pairplot(df.dropna().reset_index().drop(columns='dah'),hue='WELL' )

#### Automatic EDA using ydata-profiling

In [None]:
from ydata_profiling.profile_report import ProfileReport
from ydata_profiling.compare_reports import compare

In [None]:
ProfileReport(df)

#### Drop null values

In [None]:
df_nan = df.dropna()


In [None]:
pr1 = ProfileReport(df)

In [None]:
pr2 = ProfileReport(df_nan)

In [None]:
pr1.compare(pr2)

#### Remove outliers using PyOD

In [None]:
from pyod.models import lof

In [None]:
lf = lof.LOF(contamination=0.01)

In [None]:
lf.fit(df_nan.drop(columns="WELL"))

In [None]:
df_nan_an = df_nan.copy()

In [None]:
# Label the rows the detector was fitted on. After fit(), PyOD keeps the
# training labels in `labels_` (0 = inlier, 1 = outlier). Calling predict() on
# the same rows would re-score them as if they were new samples, which can
# disagree with the training scores held in `decision_scores_`.
# The array is in the row order of the frame passed to fit(), which matches
# df_nan_an (a copy of df_nan).
df_nan_an['anomaly'] = lf.labels_
df_nan_an

In [None]:
# NOTE(review): `fo2_1` and `somelogs` are not defined in the visible part of
# this notebook; this cell looks like a leftover from another example and will
# raise NameError unless they are created first — confirm whether it belongs here.
# The call was previously closed with ']' instead of ')', which is a syntax error.
fo2_1.put_log('Double GR', somelogs['dah']['m'].to_numpy(),
              somelogs['Gamma Ray']['API'].to_numpy()*2, 'API', 'GR', True)

In [None]:
df_nan_an["scores"] = lf.decision_scores_
df_nan_an

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df_nan_an["anomaly"].value_counts()

In [None]:
# Partition the null-free rows by the detector's flag column:
# flag 0 goes to `inlier`, flag 1 goes to `outlier`.
anomaly_flag = df_nan_an['anomaly']
inlier = df_nan.loc[anomaly_flag == 0]
outlier = df_nan.loc[anomaly_flag == 1]

In [None]:
inlier

In [None]:
outlier = outlier[["PEM2010_INPUT_DTC_ISO_1"]].sort_index()

In [None]:
inlier["PEM2010_INPUT_DTC_ISO_1"].sort_index().plot()
plt.scatter(outlier.index,outlier["PEM2010_INPUT_DTC_ISO_1"],c='red' )
plt.show()

In [None]:
df_inliers = inlier.copy()

#### Features and target (in this example we use SWE as the target)

In [None]:
df_inliers.columns

In [None]:
# Add a new 'RT_log' column holding the natural logarithm (np.log) of the
# 'PEM2010_INPUT_RT_1' column.
df_inliers['RT_log'] = df_inliers[['PEM2010_INPUT_RT_1']].apply(np.log)

In [None]:
df_inliers['RT_log'].sort_index().plot()

### Prepare data for Model Building

In [None]:
Xdata = df_inliers.drop(columns=["PEM2010_INPUT_SWT_1","PEM2010_INPUT_SWE_1","dah","WELL"])
ydata = df_inliers[["PEM2010_INPUT_SWE_1"]]

#### Split into training and test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The fixed seed makes the split, and
# every score computed against it, repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    Xdata, ydata, test_size=0.25, random_state=42
)


In [None]:
x_train

### Hyperparameter Optimization for Random Forest

In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error

# The objective below reads the module-level x_train, x_test, y_train and
# y_test variables, so the train/test split must have been run beforehand.

def objective(trial):
    """Optuna objective: fit a random forest and return its test-split MSE.

    Samples the forest hyperparameters and a feature scaler from ``trial``,
    fits both on the module-level ``x_train``/``y_train``, and scores the
    predictions against ``x_test``/``y_test``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean squared error of the fitted model on the test split
        (lower is better).
    """
    # Hyperparameters to optimize. The two min_samples_* values are sampled
    # as floats, which scikit-learn interprets as fractions of the samples.
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
    ccp_alpha = trial.suggest_float('ccp_alpha', 0.0, 0.5)  # Pruning parameter

    # Select the feature scaling technique by name; a fresh scaler instance
    # is created for every trial.
    scalers = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
        'RobustScaler': RobustScaler,
    }
    scaler_name = trial.suggest_categorical('scaler', list(scalers))
    scaler = scalers[scaler_name]()

    # Fit the scaler on the training features only, then reuse it for the
    # test features.
    X_train_scaled = scaler.fit_transform(x_train)
    X_test_scaled = scaler.transform(x_test)

    # Train Random Forest model with pruning
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        ccp_alpha=ccp_alpha,  # Include pruning parameter
        random_state=42
    )
    # y_train is a single-column table; flatten it to 1-D so scikit-learn
    # does not emit a DataConversionWarning on every trial.
    rf.fit(X_train_scaled, np.ravel(y_train))

    # NOTE(review): trials are scored on the test split, so the selected
    # hyperparameters are tuned to that split — consider a validation split
    # or cross-validation on the training data instead.
    y_pred = rf.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)

    return mse

# Run optimization process: 100 trials, keeping the lowest objective value.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Report the best trial. The objective returns a mean squared error, so the
# value is labelled as MSE (it is an error measure, not an accuracy).
print('Best trial:')
trial = study.best_trial
print('MSE:', trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f'    {key}: {value}')


In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_optimization_history(study)

### Hyperparameter Optimization for XGBoost

In [None]:
import optuna
import xgboost as xgb


def objective(trial):
    """Optuna objective: fit an XGBoost regressor and return its test-split RMSE.

    Samples the booster hyperparameters and a feature scaler from ``trial``,
    fits both on the module-level ``x_train``/``y_train``, and scores the
    predictions against ``x_test``/``y_test``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Root mean squared error of the fitted model on the test split
        (lower is better).
    """
    # Hyperparameters to optimize. Log-scaled ranges use
    # suggest_float(..., log=True), which replaces the deprecated
    # suggest_loguniform and samples from the same range.
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),
        'random_state': 42
    }

    # Select feature scaling technique
    scaler_name = trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'RobustScaler'])
    if scaler_name == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_name == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:
        scaler = RobustScaler()

    # Fit the scaler on the training features only, then reuse it for the
    # test features.
    X_train_scaled = scaler.fit_transform(x_train)
    X_test_scaled = scaler.transform(x_test)

    # Train XGBoostRegressor model
    model = xgb.XGBRegressor(**params)
    model.fit(X_train_scaled, y_train)

    # Evaluate model. The RMSE is taken as the square root of the MSE rather
    # than via mean_squared_error(squared=False): that keyword is deprecated
    # and removed in recent scikit-learn releases.
    y_pred = model.predict(X_test_scaled)
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    return rmse


In [None]:
# Run the XGBoost search: 100 trials, keeping the lowest objective value.
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective, n_trials=100)

# Report the best trial of *this* study. Reading `study.best_trial` here would
# print the result of the earlier study bound to `study` instead.
print('Best trial:')
trial = study_xgb.best_trial
print('RMSE:', trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f'    {key}: {value}')


In [None]:
import optuna
from sklearn import datasets
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline


# Split the data into training and testing sets.
# The feature/target tables in this notebook are `Xdata` and `ydata` (no `X`
# or `y` is defined), and the SVR objective reads the lowercase
# `x_train` / `y_train` names, so the split is bound to those names.
x_train, x_test, y_train, y_test = train_test_split(
    Xdata, ydata, test_size=0.2, random_state=42
)

def objective(trial):
    """Optuna objective: cross-validate a scaled SVR and return its mean score.

    Samples the SVR kernel, ``C``, ``epsilon`` and a feature scaler from
    ``trial``, wraps scaler and model in a pipeline, and evaluates it with
    5-fold cross-validation on the module-level ``x_train``/``y_train``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the ``neg_mean_squared_error`` cross-validation scores. The
        value is a negated MSE, so higher (closer to zero) is better and the
        study should maximize it.
    """
    # Define the hyperparameters to tune. C and epsilon are sampled on a log
    # scale; suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform.
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    C = trial.suggest_float('C', 1e-3, 1e2, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-3, 1e1, log=True)

    # Select feature scaling technique. 'scaler' must be suggested only once
    # per trial: suggesting the same name again with a different list of
    # choices makes Optuna raise a ValueError.
    scaler_name = trial.suggest_categorical('scaler', ['StandardScaler', 'MinMaxScaler', 'RobustScaler'])
    if scaler_name == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_name == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:
        scaler = RobustScaler()

    # Create the SVR model with the suggested hyperparameters
    svr = SVR(kernel=kernel, C=C, epsilon=epsilon)

    # Create a pipeline that includes scaling and the SVR model, so the
    # scaler is refitted inside every cross-validation fold.
    pipeline = Pipeline([
        ('scaler', scaler),
        ('svr', svr)
    ])

    # Use cross-validation to evaluate the model.
    # Note: 'neg_mean_squared_error' yields negative MSE scores.
    # y_train is a single-column table; flatten it to 1-D so the estimator
    # does not warn about a column-vector target in every fold.
    score = cross_val_score(
        pipeline, x_train, np.ravel(y_train), cv=5, scoring='neg_mean_squared_error'
    )
    mean_score = score.mean()

    return mean_score




In [None]:
# Create and run the Optuna study for the SVR objective (100 trials).
# Note: the objective returns the mean negated MSE from cross-validation, so
# maximizing it minimizes the actual MSE.
# This rebinds the name `study`, replacing whatever study it referred to before.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the best score and hyperparameters. The printed score is the negated
# MSE returned by the objective, so it is expected to be <= 0.
print(f"Best trial: {study.best_trial.value}")
print("Best hyperparameters: ")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")