# Importing Libraries

* System Append to set proper path

In [None]:
# `sys` has to be imported here: this cell runs before the main import cell,
# and without it a fresh kernel fails with NameError.
import sys

# Put the parent directory on the module search path. The membership check
# keeps the cell idempotent when it is re-run in the same kernel.
if '../' not in sys.path:
    sys.path.append('../')

* Default

In [None]:
from collections import Counter

import lasio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import linear_model
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

* Pandas Options

In [None]:
# Display every column of a DataFrame instead of truncating wide frames.
# The fully-qualified key is used because the bare 'max_columns' pattern matches
# more than one option in newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)

* Source Code

In [None]:
from Source.Utils import welllog
from Source.Utils import multi_df
from Source.Utils import well_plot

* Tqdm Progress Bar

In [None]:
%%capture
from tqdm import tqdm_notebook

# Checkpoint import

In [None]:
# Load the checkpointed dataframe from a gzip-compressed CSV file.
total_df = pd.read_csv('../checkpoints/total_df.csv.gz', compression='gzip')

In [None]:
# Preview the first rows of the loaded checkpoint.
total_df.head()

# Lithology Code Prediction

* LITHOLOGY_GEOLINK non NULL data

In [None]:
# Keep only the rows whose LITHOLOGY_GEOLINK value is present, then discard
# the WELL_NAME column.
litho_data = (
    total_df
    .loc[total_df['LITHOLOGY_GEOLINK'].notna()]
    .drop(columns=['WELL_NAME'])
)

# Show the first five labelled rows.
litho_data.head(5)

In [None]:
# Number of rows in the labelled subset.
len(litho_data)

    * Converting LITHOLOGY_GEOLINK to int type

In [None]:
# Cast the lithology code column to integers; every other column is left as is.
litho_data = litho_data.astype({'LITHOLOGY_GEOLINK': int})

    * Checking classes balance

In [None]:
# Count how many samples belong to each lithology code.
unsorted_bal_class = dict(Counter(litho_data['LITHOLOGY_GEOLINK'].values))

# Same counts, ordered from the least to the most represented class.
sorted_bal_class = dict(
    sorted(unsorted_bal_class.items(), key=lambda code_count: code_count[1])
)

# Author's note: although some classes are considerably less represented than
# others, this distribution has to be respected to maintain the geological
# setting of the area.
sorted_bal_class

    * Pearson Correlation

In [None]:
# Absolute value of the pairwise column correlation (DataFrame.corr with its
# default method).
corrmat_litho = litho_data.corr().abs()

plt.figure(figsize=(15, 10))

# Annotated heatmap of the absolute correlation matrix.
sns.heatmap(
    corrmat_litho,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    square=True,
    cbar=True,
    cmap='Blues',
)

    * Dataset Creation

In [None]:
# Feature matrix: every column except the lithology code.
X = litho_data.drop(columns='LITHOLOGY_GEOLINK')

# Target vector: the lithology codes as a plain array.
Y = litho_data['LITHOLOGY_GEOLINK'].values

# Hold out 40% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.40,
    random_state=42,
)

    * Model Selection

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

# Candidate classifiers, keyed by the step name they get inside their pipeline.
# Estimators that draw random numbers are seeded so that repeated runs of the
# model comparison give the same scores.
candidate_classifiers = [
    ('Ridge', linear_model.RidgeClassifier()),
    ('SGDCls', linear_model.SGDClassifier(n_jobs=30, random_state=42)),
    ('KNNCls', KNeighborsClassifier(n_jobs=30)),
    ('DTC', DecisionTreeClassifier(random_state=42)),
    ('RFC', RandomForestClassifier(n_jobs=30, random_state=42)),
    ('ADA', AdaBoostClassifier(random_state=42)),
]

# List of (name, pipeline) pairs. Every pipeline applies a RobustScaler before
# the classifier, and its name is 'Scaled' + the classifier step name.
class_pipelines = [
    (
        'Scaled' + step_name,
        Pipeline([('Scaler', RobustScaler()), (step_name, classifier)]),
    )
    for step_name, classifier in candidate_classifiers
]


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score, make_scorer

# The splitter and the scorers do not depend on the model, so they are built
# once instead of on every iteration.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn releases raise ValueError when random_state is given without it.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scorers = {
    'accuracy': make_scorer(accuracy_score),
    'balanced_accuracy': make_scorer(balanced_accuracy_score),
    'precision': make_scorer(precision_score, average='micro'),
    'recall': make_scorer(recall_score, average='micro'),
    'f1': make_scorer(f1_score, average='weighted'),
}

# Per-model cross_validate outputs and the matching pipeline names (same order).
results = []

names = []

for name, model in tqdm_notebook(class_pipelines, desc='Cross-Validation Procedure'):

    final_scorers = cross_validate(model, x_train, y_train, cv=kfold, scoring=scorers)
    results.append(final_scorers)
    names.append(name)

    # Report mean and standard deviation across the folds for each metric.
    for metric_label, metric_key in (
        ('Acc: ', 'test_accuracy'),
        ('BalAcc: ', 'test_balanced_accuracy'),
        ('F1: ', 'test_f1'),
    ):
        fold_scores = final_scorers[metric_key]
        print(name, metric_label, fold_scores.mean(), '\\', fold_scores.std(), '\n')


    * Hyperparameter Tuning

In [None]:
# Randomized hyper-parameter search for the random forest, kept commented out.
# NOTE(review): before re-enabling, check it against the installed scikit-learn:
# recent releases reject KFold(random_state=...) without shuffle=True, and
# max_features='auto' has been deprecated/removed for forests — confirm.
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# rfc = RandomForestClassifier(n_jobs=40)

# kfold = KFold(n_splits=5, random_state=42)

# rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = kfold, verbose=10, random_state=42, scoring='balanced_accuracy')

# rfc_random.fit(RobustScaler().fit_transform(x_train), y_train)


    * Model Accuracy Test

In [None]:
# Random forest with the selected hyper-parameters; the rest of the best
# parameters are the default ones. The seed makes the fitted model repeatable.
best_rfc = RandomForestClassifier(
    bootstrap=False,
    max_depth=50,
    min_samples_split=2,
    n_estimators=1000,
    n_jobs=30,
    random_state=42,
)

# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test data would scale both splits with
# different statistics and leak test information into the evaluation.
feature_scaler = RobustScaler().fit(x_train)

best_rfc.fit(feature_scaler.transform(x_train), y_train)

y_predict = best_rfc.predict(feature_scaler.transform(x_test))

In [None]:
# Hold-out metrics for the tuned random forest.
print('Accuracy: ', accuracy_score(y_test, y_predict), '\n')
print('########################', '\n')
print('Balanced Accuracy: ', balanced_accuracy_score(y_test, y_predict))
print('########################', '\n')
# The score is computed with average='weighted', so the label says so
# (it previously read "Micro").
print('F1-Score Weighted: ', f1_score(y_test, y_predict, average='weighted'))

    * Confusion Matrix

In [None]:
# plot_confusion_matrix was removed from scikit-learn (1.2); confusion_matrix
# and ConfusionMatrixDisplay produce the same figure and are also available in
# the older releases that shipped plot_confusion_matrix.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

fig, ax = plt.subplots(figsize=(10, 10))

# Scale the test features with a scaler fitted on the training split, i.e. the
# same transformation the model was trained with.
x_test_scaled = RobustScaler().fit(x_train).transform(x_test)

# Row-normalised matrix: each row (true class) sums to 1.
# Passing the model's class list keeps the matrix rows and tick labels aligned.
test_conf_matrix = confusion_matrix(
    y_test,
    best_rfc.predict(x_test_scaled),
    labels=best_rfc.classes_,
    normalize='true',
)

ConfusionMatrixDisplay(
    confusion_matrix=test_conf_matrix,
    display_labels=best_rfc.classes_,
).plot(cmap=plt.cm.Blues, ax=ax, values_format='.1f')

plt.show()

In [None]:
# List the column labels of the labelled dataset.
litho_data.columns

    * Well log visualization

In [None]:
from Source.Utils import well_plot

In [None]:
# Plot the logs through the project's well_plot helper.
# NOTE(review): `df_main` is not assigned in any cell visible in this notebook,
# so a fresh kernel would raise NameError here — confirm which dataframe is
# meant. '35_11-1' is presumably a well name; verify against plot_well_logs.
well_plot.plot_well_logs(df_main, '35_11-1')