In [None]:
# Author: Pierre Jeanne
# Project Name: Hole Deviation Prediction 
# Date Created: 11 April 2021
# from: https://www.kaggle.com/cboychinedu/starter-hole-deviation-prediction-eda

## INTRODUCTION

Hole deviation is the unintentional departure of the drill bit from a preselected borehole trajectory, whether it involves drilling a straight or a curved-hole section. The tendency of the drill bit to walk away from the desired path can lead to drilling problems such as higher drilling costs and lease-boundary legal problems.

### Causes of hole deviation:
It is not exactly known what causes a drill bit to deviate from its intended path. It is generally agreed that one or a combination of the following factors may be responsible for deviation.

1- Heterogeneous nature of formation and dip angle

2- Drill string characteristics, specifically the bottomhole assembly (BHA) makeup

3- Applied weight on bit (WOB)

4- Stabilizers (location, number, clearances)

5- Hole-inclination angle from the vertical

6- Hydraulics at the bit

7- Improper hole cleaning

N.B.: It is known that some resultant force acting on a drill bit causes hole deviation to occur. The mechanics of this resultant force are complex and are governed mainly by the mechanics of the bottomhole assembly (BHA) makeup.

In [None]:
# Importing the necessary packages
import numpy as np
import pandas as pd
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import seaborn as sns;

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>1- load the dataset</center></h3>

In [None]:
# read the well-log CSV, discard the leftover 'Unnamed: 0' column
# and use 'Depth' as the row index
df = (
    pd.read_csv(r"../input/hole-deviation/well_log.csv")
    .drop(columns=['Unnamed: 0'])
    .set_index('Depth')
)
# preview the first 3 rows
df.head(3)

The provided well log dataset contains parameters for drilling which are:
- Gamma-ray = natural radioactivity of the formation along the borehole.
- Resistivity = ability to impede the flow of electric current.
- Density = density along the length of a borehole (bulk density)
- Density_Calculated = This logging measures the calculated density.
- Classification: 0 not deviate from its pre selected trajectory, 1 deviated from its pre selected trajectory.

In [None]:
# horizontal rule printed under each heading
separator = '-' * 30

# dimensions of the table as (rows, columns)
print('Shape of the file')
print(separator)
print(df.shape)
print('')
# number of missing values found in each column
print('Number of nan values')
print(separator)
print(df.isnull().sum())

In [None]:
# plot every column against the dataframe index, one subplot per column,
# to get a first visual impression of each log
df.plot(subplots=True, figsize=(15,35))
plt.show()

In [None]:
# 'Neuron_Porosity' holds a constant value, so it carries no information
# for the classification and is removed
df = df.drop(columns=['Neuron_Porosity'])

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>2- feature engineering</center></h3>

In [None]:
# Derive elastic properties from the density and the P-/S-wave velocities.
# The squared velocities are reused by several formulas below.
vp_sq = df['Vp']**2
vs_sq = df['Vs']**2

# Vp/Vs velocity ratio
df['ratio_vp/vs'] = df['Vp'] / df['Vs']
# rigidity (shear modulus): density * Vs^2
df['Rigidity'] = df['Density'] * vs_sq
# Young's modulus: rigidity * (3*Vp^2 - 4*Vs^2) / (Vp^2 - Vs^2)
df["Young’s modulus"] = df['Rigidity'] * ((3*vp_sq - 4*vs_sq) / (vp_sq - vs_sq))
# bulk modulus: E / (3 * (1 - 2*nu)), with nu read from 'Possions_Ratio'
df["Bulk's modulus"] = df["Young’s modulus"] / (3*(1 - 2*df['Possions_Ratio']))
# Lame's first parameter: K - 2*rigidity/3
df['Lame'] = df["Bulk's modulus"] - 2*df['Rigidity']/3

In [None]:
# plot the engineered mechanical parameters, one subplot per parameter
mechanical_cols = ['ratio_vp/vs', 'Rigidity', "Young’s modulus", "Bulk's modulus", 'Lame']
df1 = df[mechanical_cols]
df1.plot(subplots=True, figsize=(15, 15))
plt.show()

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>3- correlations</center></h3>

In [None]:
# Pearson correlation between every pair of columns
corr_df = df.corr(method='pearson')

# annotated heatmap drawn on a fixed [-1, 1] colour scale
fig, axes = plt.subplots(1, figsize=(15, 15))
heatmap_style = dict(annot=True, linewidths=.5, annot_kws={"size": 14},
                     vmin=-1.0, vmax=1.0, square=True, cbar=True)
sns.heatmap(corr_df, ax=axes, **heatmap_style)
plt.show()

In [None]:
# remove the variables that are highly correlated with one another
redundant_cols = [
    'Gamma-ray', 'Delta T', 'Density_Calculated', 'Vp', 'Vs',
    'ratio_vp/vs', 'Rigidity', "Bulk's modulus", 'Lame', 'Density_Porosity',
]
df = df.drop(columns=redundant_cols)

In [None]:
# Pearson correlation of the columns that are left in `df`
corr_df = df.corr(method='pearson')

# smaller annotated heatmap, same fixed [-1, 1] colour scale
fig, axes = plt.subplots(1, figsize=(6, 6))
heatmap_style = dict(annot=True, linewidths=.5, annot_kws={"size": 14},
                     vmin=-1.0, vmax=1.0, square=True, cbar=True)
sns.heatmap(corr_df, ax=axes, **heatmap_style)
plt.show()

In [None]:
# scatter selected feature pairs on a 2x2 grid, coloured by 'Classification'
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
fig.subplots_adjust(hspace=0.3)

# (x column, y column) drawn in each panel, in row-major panel order
feature_pairs = [
    ('Shale_Volume', 'Restivity'),
    ('Density', 'Restivity'),
    ('Possions_Ratio', "Young’s modulus"),
    ('Density', "Young’s modulus"),
]
for panel, (x_col, y_col) in zip(axes.flatten(), feature_pairs):
    sns.scatterplot(data=df, x=x_col, y=y_col, hue='Classification', ax=panel)

plt.show()

In [None]:
# Count the rows of each class straight from the target column.
# Summing a boolean mask gives the number of matching rows; it avoids the
# positional `[0]` lookup on a label-indexed Series (deprecated in pandas) and
# does not depend on whether the first column contains missing values.
# Number of non-deviated holes (Classification == 0)
majority_class = int((df['Classification'] == 0).sum())

# Number of deviated holes (Classification == 1)
minority_class = int((df['Classification'] == 1).sum())

# Print the size of both classes
print('Non Deviated Class (Classification = 0): {}'.format(majority_class))
print('Deviated Class (Classification = 1) : {}'.format(minority_class))


# Bar chart of the class frequencies
sns.countplot(x="Classification", data=df)
plt.show()

The target data is imbalanced 

The challenge of working with imbalanced datasets is that most machine learning techniques will ignore, and in turn have poor performance on, the minority class, although typically it is performance on the minority class that is most important.

One approach to addressing imbalanced datasets is to use **SMOTE**.

SMOTE is an oversampling method. It works by creating synthetic samples from the minority class instead of creating copies. The algorithm selects two or more similar instances (using a distance measure) and perturbs an instance one attribute at a time by a random amount within the difference to the neighboring instances. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or **SMOTE** for short.

In [None]:
# pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# feature matrix: every column except the target
X = df.drop('Classification',axis=1).values
# target as a flat 1-D array
y = df['Classification'].values

# Use SMOTE to balance the imbalanced data.
# SMOTE draws random synthetic samples, so the seed is fixed to make the
# resampled set (and everything computed from it) reproducible between runs.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)

In [None]:
# wrap the resampled target in a DataFrame so it can be counted and plotted
df_y_resampled = pd.DataFrame(y_resampled,columns=['Classification'])

# Count the rows of each class with a summed boolean mask; this replaces the
# positional `.count()[0]` lookup on a label-indexed Series, which pandas
# has deprecated.
# Number of non-deviated holes (Classification == 0)
majority_class = int((df_y_resampled['Classification'] == 0).sum())

# Number of deviated holes (Classification == 1)
minority_class = int((df_y_resampled['Classification'] == 1).sum())

# Print the size of both classes after resampling
print('Non Deviated Class (Classification = 0): {}'.format(majority_class))
print('Deviated Class (Classification = 1) : {}'.format(minority_class))


# Bar chart of the class frequencies after resampling
sns.countplot(x="Classification", data=df_y_resampled)
plt.show()

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>4- Machine learning</center></h3>

In [None]:
# split the data
from sklearn.model_selection import train_test_split
# scale the data
from sklearn.preprocessing import StandardScaler
# cross validation
from sklearn.model_selection import cross_val_score
# classification model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# model evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## 4.1 Preprocessing
### 4.1.1: Split the data

In [None]:
# Split the ORIGINAL data first (stratified, so both parts keep the class
# ratio) and only then oversample the training part.
# Running SMOTE before the split builds synthetic rows by interpolating
# between samples that can later land in the test set, which leaks test
# information into training and makes the test scores over-optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=0, stratify=y)
# balance the classes of the training data only; the test data stays untouched
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

### 4.1.2: Scale the data

In [None]:
# learn the scaling statistics from the training features only
scaler = StandardScaler().fit(X_train)
# apply the same fitted scaling to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 4.2: Model selection
### 4.2.1: Classification with logistic regression
Binary logistic regression requires the dependent variable to be binary.
- Only the meaningful variables should be included.
- The independent variables should be independent of each other. That is, the model should have little or no multicollinearity.
- The independent variables are linearly related to the log odds.
- Logistic regression requires quite large sample sizes.

In [None]:
clf_lr = LogisticRegression()

# 5-fold cross-validated accuracy on the training split
cv_scores_train = cross_val_score(clf_lr, X_train, y_train, cv=5)
# 5-fold cross-validated accuracy on the test split
# (the model is re-fitted inside the folds of the test split)
cv_scores_test = cross_val_score(clf_lr, X_test, y_test, cv=5)

# per-fold scores
print(cv_scores_train)
print(cv_scores_test)

# mean score over the folds, kept for the model comparison
clf_lr_mean_train = cv_scores_train.mean()
clf_lr_mean_test = cv_scores_test.mean()

### 4.2.2: Classification with k-Nearest Neighbors

In [None]:
clf_knn = KNeighborsClassifier(n_neighbors=2)

# 5-fold cross-validated accuracy on the training split
cv_scores_train = cross_val_score(clf_knn, X_train, y_train, cv=5)
# 5-fold cross-validated accuracy on the test split
# (the model is re-fitted inside the folds of the test split)
cv_scores_test = cross_val_score(clf_knn, X_test, y_test, cv=5)

# per-fold scores
print(cv_scores_train)
print(cv_scores_test)

# mean score over the folds, kept for the model comparison
clf_knn_mean_train = cv_scores_train.mean()
clf_knn_mean_test = cv_scores_test.mean()

### 4.2.3: Classification with Support Vector Machines

In [None]:
clf_svm = svm.SVC()

# 5-fold cross-validated accuracy on the training split
cv_scores_train = cross_val_score(clf_svm, X_train, y_train, cv=5)
# 5-fold cross-validated accuracy on the test split
# (the model is re-fitted inside the folds of the test split)
cv_scores_test = cross_val_score(clf_svm, X_test, y_test, cv=5)

# per-fold scores
print(cv_scores_train)
print(cv_scores_test)

# mean score over the folds, kept for the model comparison
clf_svm_mean_train = cv_scores_train.mean()
clf_svm_mean_test = cv_scores_test.mean()

### 4.2.4: Classification with random Forest classifier
#### random search

In [None]:
# Candidate values sampled by the random search.
# 'max_features' no longer lists 'auto': recent scikit-learn releases reject
# it for RandomForestClassifier, and where it was accepted it meant the same
# as 'sqrt'. 'log2' is offered instead so that two distinct settings are tried.
parameters = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = parameters, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# view the best parameters from fitting the random search
# (the sampled combination with the best cross-validated score):
rf_random.best_params_

#### Grid Search with Cross Validation
Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define. To use Grid Search, we make another grid based on the best values provided by random search:

In [None]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [25,27,29,30,32,34],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1],
    'min_samples_split': [2,3],
    # n_estimators must be integers: the former `170.185` was a typo that
    # merged the two candidates 170 and 185 into one invalid float
    'n_estimators': [170,185,200,215,230]
}
# Create a base model
rf = RandomForestClassifier()
# Instantiate the grid search model: every combination of the grid is
# evaluated with 3-fold cross validation on all available cores
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the training data
grid_search.fit(X_train, y_train)
# show the parameter combination selected by the grid search
grid_search.best_params_

#### best model

In [None]:
# Random forest configured with the tuned hyperparameters.
# `bootstrap` must be the boolean True: the string 'True' is rejected by
# scikit-learn's parameter validation.
clf_rf = RandomForestClassifier(bootstrap = True,max_depth = 25,max_features = 'sqrt',
                                min_samples_leaf = 1, min_samples_split = 2, n_estimators = 200)

# Compute accuracy on the training set with 5-fold cross-validation
cv_scores_train = cross_val_score(clf_rf,X_train, y_train,cv=5)
# Compute accuracy on the test set with 5-fold cross-validation
# (the model is re-fitted inside the folds of the test split)
cv_scores_test = cross_val_score(clf_rf,X_test, y_test,cv=5)

# per-fold scores
print(cv_scores_train)
print(cv_scores_test)

# mean score over the folds, kept for the model comparison
clf_rf_mean_train = np.mean(cv_scores_train)
clf_rf_mean_test = np.mean(cv_scores_test)

### 4.2.5: Classification with Neural network

In [None]:
# Import necessary modules
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import History 
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from keras.wrappers.scikit_learn import KerasClassifier

#### find the best model architecture

In [None]:
def create_model(unit1,unit2):
    """Build and compile a small feed-forward binary classifier.

    The network stacks two hidden ReLU layers and a single sigmoid output
    unit. It is compiled with the Adam optimizer and binary cross-entropy
    loss, and tracks accuracy as its metric.

    Parameters
    ----------
    unit1 : number of units in the first hidden layer.
    unit2 : number of units in the second hidden layer.

    Returns
    -------
    The compiled `Sequential` model (not yet fitted).
    """
    # layers in forward order: hidden 1 -> hidden 2 -> sigmoid output
    layers = [
        Dense(units=unit1, activation='relu'),
        Dense(units=unit2, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [None]:
# # Import KerasClassifier from keras scikit learn wrappers
# from sklearn.model_selection import KFold

# # Create a KerasClassifier
# model = KerasClassifier(build_fn = create_model)

# # Define the parameters to try out
# params={'batch_size':[5, 15, 20],
#         'unit1':[40,50,60],
#         'unit2':[20,30,40,50]
#         }
# gs=GridSearchCV(estimator=model, param_grid=params, cv=10)
# # now fit the dataset to the GridSearchCV object. 
# gs = gs.fit(X_train, y_train)
# early_stopping_monitor = EarlyStopping(monitor='accuracy', patience=3)
# gs = gs.fit(X_train, y_train,epochs=500, callbacks = [early_stopping_monitor], verbose=1)

In [None]:
# Print results
# print("Best: {} using {}".format(gs.best_score_,gs.best_params_))

#### best model

In [None]:
# Wrap the Keras model so it can be used like a scikit-learn estimator.
# unit1/unit2 are forwarded to create_model; no `epochs` value is set here.
nn_settings = dict(unit1=60, unit2=40, batch_size=5)
clf_nn = KerasClassifier(build_fn=create_model, **nn_settings)

# 5-fold cross-validated score on the training split
cv_scores_train = cross_val_score(clf_nn, X_train, y_train, cv=5)

# 5-fold cross-validated score on the test split
# (the model is re-fitted inside the folds of the test split)
cv_scores_test = cross_val_score(clf_nn, X_test, y_test, cv=5)

# per-fold scores
print(cv_scores_train)
print(cv_scores_test)

# mean score over the folds, kept for the model comparison
clf_nn_mean_train = cv_scores_train.mean()
clf_nn_mean_test = cv_scores_test.mean()

#### add other hidden layer

## summary

In [None]:
# mean cross-validated score of every model, in the order
# logistic regression, k-NN, SVM, random forest, neural network
score_train = [clf_lr_mean_train, clf_knn_mean_train, clf_svm_mean_train,
               clf_rf_mean_train, clf_nn_mean_train]
score_test = [clf_lr_mean_test, clf_knn_mean_test, clf_svm_mean_test,
              clf_rf_mean_test, clf_nn_mean_test]

In [None]:
# model labels, in the same order as the score lists
list_regressors = ['Log reg','knn','svm','rf','nn']

# one row per model with its mean train and test score
dic_score = pd.DataFrame({
    'model': list_regressors,
    'score_train': score_train,
    'score_test': score_test,
})
dic_score

In [None]:
# Plot the mean score of every model on the train and on the test data
fig, ax = plt.subplots(1,figsize=(15,5))

# both series are drawn on the same axes so they can be compared directly
sns.pointplot(x = "model", y = "score_train", data = dic_score,label = 'accuracy on train data', ax=ax)
sns.pointplot(x = "model", y = "score_test", data = dic_score,color='red',label = 'accuracy on test data', ax=ax)
ax.legend()
ax.set_ylabel('Score (accuracy)', size=20, labelpad=12.5)
ax.set_xlabel('Model', size=20, labelpad=12.5)
ax.tick_params(labelsize=14)

# Write each score next to its point: train value just above, test value just
# below. Only `fontsize` is passed: `size` is an alias of it and Matplotlib
# raises a TypeError when both are given.
for ind in dic_score.index:
    train_score = dic_score['score_train'][ind]
    test_score = dic_score['score_test'][ind]
    ax.text(ind, train_score+0.01, '{:.5f}'.format(train_score),
            horizontalalignment='left', color='blue', weight='semibold', fontsize=12)
    ax.text(ind, test_score-0.01, '{:.5f}'.format(test_score),
            horizontalalignment='left', color='red', weight='semibold', fontsize=12)

plt.title('Scores of Models', size=20)

plt.show()

### see how best model is performing on the original data set

In [None]:
# Random forest with the tuned hyperparameters.
# `bootstrap` must be the boolean True: the string 'True' is rejected by
# scikit-learn's parameter validation.
clf_rf = RandomForestClassifier(bootstrap = True,max_depth = 25,max_features = 'sqrt',
                                min_samples_leaf = 1, min_samples_split = 2, n_estimators = 200)
# train on the training split, then predict the classes of the test split
clf_rf.fit(X_train,y_train)
y_pred = clf_rf.predict(X_test)

In [None]:
# horizontal rule printed under each heading
separator = '-' * 30

# confusion matrix of true vs. predicted classes on the test split
print('confusion matrix')
print(separator)
print(confusion_matrix(y_test, y_pred))
print('')
# classification report for the same predictions
print('classification report')
print(separator)
print(classification_report(y_test, y_pred))