# Importing Libraries

* System Append to set proper path

In [None]:
# Put the parent directory on the import path (presumably so the `Source`
# package imported further down can be found -- confirm the repo layout).
# `sys` is imported here because this cell runs before the main import cell.
import sys

if '../' not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append('../')

* Default

In [None]:
import lasio
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

* Pandas Options

In [None]:
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the short form 'max_columns' matches more than one option in
# recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)

* Source Code

In [None]:
from Source.Utils import welllog
from Source.Utils import multi_df
from Source.Utils import well_plot

* Tqdm Progress Bar

In [None]:
%%capture
from tqdm import tqdm_notebook

# Checkpoint import

In [None]:
# Load the gzip-compressed checkpoint written by an earlier processing step.
df_main = pd.read_csv(
    '../checkpoints/df_main.csv.gz',
    compression='gzip',
)

In [None]:
# Preview the first rows of the loaded checkpoint.
df_main.head()

## NPHI Prediction

* NPHI non NULL data

In [None]:
# Keep only the rows where NPHI is present, then drop the columns that are not
# used in the correlation analysis / regression below.
nphi_data = (
    df_main
    .loc[df_main['NPHI'].notna()]
    .drop(columns=['LITHOLOGY_GEOLINK', 'DEPTH', 'Number of Outliers', 'WELL_NAME'])
)

In [None]:
# Preview the first rows of the subset that has an NPHI value.
nphi_data.head()

In [None]:
# Number of rows that have an NPHI value.
len(nphi_data)

* Pearson Correlation Matrix

In [None]:
# Absolute pairwise correlations: only the strength of each relationship
# matters for feature selection here, not its sign.
corrmat_nphi = nphi_data.corr().abs()

plt.figure(figsize=(15, 10))

# Annotated heatmap; every cell shows the coefficient with two decimals.
sns.heatmap(
    corrmat_nphi,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cmap='Blues',
    square=True,
    cbar=True,
)

        * Using the second branch, only the three highest correlated variables

In [None]:
plt.figure(figsize=(10,10))

# Plot a random subset to keep rendering fast. The sample size is capped at the
# number of available rows (sampling more rows than exist raises ValueError)
# and the seed is fixed so the figure is reproducible across runs.
sns.scatterplot(
    x="NPHI",
    y="CALI",
    data=nphi_data.sample(n=min(len(nphi_data), 100000), random_state=42),
)

# Linear correlation (maybe)

In [None]:
plt.figure(figsize=(10,10))

# Plot a random subset to keep rendering fast. The sample size is capped at the
# number of available rows (sampling more rows than exist raises ValueError)
# and the seed is fixed so the figure is reproducible across runs.
sns.scatterplot(
    x="NPHI",
    y="RHOB",
    data=nphi_data.sample(n=min(len(nphi_data), 100000), random_state=42),
)

# negative correlation

In [None]:
plt.figure(figsize=(10,10))

# Plot a random subset to keep rendering fast. The sample size is capped at the
# number of available rows (sampling more rows than exist raises ValueError)
# and the seed is fixed so the figure is reproducible across runs.
sns.scatterplot(
    x="NPHI",
    y="DTC",
    data=nphi_data.sample(n=min(len(nphi_data), 100000), random_state=42),
)

# non-linear correlation (exponential)

* Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, KFold, RandomizedSearchCV

from sklearn.preprocessing import RobustScaler

        * Dataset Split

In [None]:
# Predictors: the three logs selected from the correlation analysis above.
X = nphi_data.loc[:, ['CALI', 'DTC', 'RHOB']]

# Target: NPHI as a plain NumPy array.
Y = nphi_data['NPHI'].values

# Hold out 40% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.40,
    random_state=42,
)

        * Model Selection

In [None]:
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

                * Pipelines for Models (using RobustScaler because outliers may be present -- it reduces the sensitivity to them)

In [None]:
# Candidate regressors, each wrapped in a pipeline that applies RobustScaler
# first. Every entry is a (label, pipeline) pair where the label is 'Scaled'
# followed by the name of the estimator step. Despite the variable name, the
# list also contains non-linear models (KNN, tree, boosting, random forest).
linear_pipelines = [
    ('Scaled' + step_name, Pipeline([('Scaler', RobustScaler()), (step_name, estimator)]))
    for step_name, estimator in [
        ('LASSO', linear_model.Lasso()),
        ('EN', linear_model.ElasticNet()),
        ('KNN', KNeighborsRegressor()),
        ('CART', DecisionTreeRegressor()),
        ('GBM', GradientBoostingRegressor()),
        ('Ridge', linear_model.Ridge()),
        ('OMP', linear_model.OrthogonalMatchingPursuit()),
        ('BAYRID', linear_model.BayesianRidge()),
        ('SGD', linear_model.SGDRegressor()),
        ('RANDOMFOREST', RandomForestRegressor(n_jobs=30)),
    ]
]



                * Cross-Validation        

In [None]:
# Per-model cross-validation scores: `results` holds one array of per-fold
# RMSE values per pipeline, `names` holds the matching labels.
results = []
names = []

# Unshuffled 5-fold splitter, built once and reused for every model.
# `random_state` is deliberately not passed: it only has an effect together
# with shuffle=True, and scikit-learn >= 0.24 raises ValueError when it is set
# while shuffle is False. The folds are deterministic either way.
kfold = KFold(n_splits=5)

for name, model in tqdm_notebook(linear_pipelines, desc='Cross-Validation Procedure'):
    # The scorer returns the negated MSE of each fold; flip the sign and take
    # the square root to obtain the per-fold RMSE.
    rmse = np.sqrt(-cross_val_score(model, x_train, y_train, cv=kfold, scoring='neg_mean_squared_error'))
    results.append(rmse)
    names.append(name)
    # Report the mean RMSE with its standard deviation across folds.
    print("%s: %f (%f)" % (name, rmse.mean(), rmse.std()))

        * Hyperparameter Tuning

            Because this task is time-consuming (approximately 5 hours on a 64-core server using multiprocessing), we only present the best set of parameters. The result of the cell below can be verified against the files in the model's result folder.

In [None]:
# NOTE: this randomized search is intentionally left commented out because of
# its run time; uncomment the whole cell to reproduce the search.

# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
#rf = RandomForestRegressor(n_jobs=20)

# random_state is omitted: without shuffle=True it has no effect, and recent
# scikit-learn versions raise ValueError when it is passed that way.
#kfold = KFold(n_splits=3)

#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = kfold, verbose=10, random_state=42, n_jobs = 32, scoring='neg_mean_squared_error')

#rf_random.fit(RobustScaler().fit_transform(x_train), y_train)


        * Test Prediction

In [None]:
# Random forest configured with the best parameters reported by the randomized
# search; every parameter not listed keeps its default. The seed is fixed so
# that the fitted model and all downstream predictions are reproducible.
best_rf = RandomForestRegressor(
    max_depth=30,
    max_features='sqrt',
    min_samples_split=5,
    n_estimators=400,
    n_jobs=30,
    random_state=42,
)

# Train on robust-scaled training features.
best_rf.fit(RobustScaler().fit_transform(x_train), y_train)

In [None]:
# Scale the test features with statistics learned from the TRAINING set.
# Fitting a new scaler on x_test would scale the test data differently from
# the data the model was trained on and would use test-set statistics.
y_predict = best_rf.predict(RobustScaler().fit(x_train).transform(x_test))

        * Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Arguments follow the scikit-learn convention (y_true, y_pred). RMSE is the
# square root of the MSE, computed explicitly because the `squared=False`
# keyword is deprecated and removed in recent scikit-learn releases.
print('MAE: ', str(mean_absolute_error(y_test, y_predict)), '\n')
print('########################', '\n')
print('RMSE: ', str(np.sqrt(mean_squared_error(y_test, y_predict))))

        * Null values Prediction

            * Null values Dataset

In [None]:
# Rows where NPHI is missing, with the same columns dropped as for the
# non-null subset so both frames share one layout.
null_nphi_dataset = (
    df_main
    .loc[df_main['NPHI'].isna()]
    .drop(columns=['LITHOLOGY_GEOLINK', 'DEPTH', 'Number of Outliers', 'WELL_NAME'])
)

null_nphi_dataset.head()

            * Features to Predict (Highest correlation with target variable)

In [None]:
# Same three predictors the model was trained on, taken from the rows whose
# NPHI is missing.
features_for_predict = null_nphi_dataset.loc[:, ['CALI', 'DTC', 'RHOB']]

features_for_predict.head()

            * Prediction

In [None]:
# Scale the rows to impute with statistics learned from the TRAINING set, so
# they go through the same transformation as the data the model was fitted on.
# Fitting a fresh scaler on these rows would shift and rescale them differently.
nphi_prediction = best_rf.predict(RobustScaler().fit(x_train).transform(features_for_predict))

            * Check boundary constraints

In [None]:
# NOTE(review): `ranges` is not defined anywhere in the visible part of this
# notebook, so this cell raises NameError on a fresh kernel. It presumably maps
# a log name to a (min, max) pair established in an earlier notebook -- define
# or load it before this cell. TODO confirm.
print('Maximal Range Respected: ', nphi_prediction.max() < ranges['NPHI'][1], '\n', 'Minimal Range Respected: ', nphi_prediction.min() > ranges['NPHI'][0]) # Check if the prediction is respecting the previous established interval

            * Replacing Null values for Predicted ones

In [None]:
# Build a NEW frame with the missing NPHI values filled by the predictions.
# `.assign` returns a copy, so `null_nphi_dataset` keeps its original nulls and
# this cell can be re-run safely (plain assignment would only alias the frame
# and mutate it in place).
predicted_nphi_dataset = null_nphi_dataset.assign(NPHI=nphi_prediction)

predicted_nphi_dataset.head()

        * Final Dataframe

In [None]:
# Stack the measured and the imputed rows, then restore the original row order
# by sorting on the index inherited from df_main. `pd.concat` is used because
# `DataFrame.append` was removed in pandas 2.0.
total_df = pd.concat([nphi_data, predicted_nphi_dataset]).sort_index()

# Re-attach the columns that were dropped before modelling. The values are
# assigned positionally, which relies on total_df now having exactly the same
# rows, in the same order, as df_main.
total_df['LITHOLOGY_GEOLINK'] = df_main['LITHOLOGY_GEOLINK'].values
total_df['DEPTH'] = df_main['DEPTH'].values
total_df['WELL_NAME'] = df_main['WELL_NAME'].values

total_df.head()

# Checkpoint

In [None]:
# Destination of the gzip-compressed checkpoint holding the completed frame.
path_file_total_df = '../checkpoints/total_df.csv.gz'

# Write without the index column.
total_df.to_csv(
    path_file_total_df,
    compression='gzip',
    index=False,
)