# Importing Libraries

* Append the parent directory to `sys.path` so that the project modules can be imported

In [None]:
import sys

# Put the parent directory on the import path (the `Source` package imported
# further down presumably lives there — TODO confirm).
# `sys` has to be imported in this cell: it runs before the main import cell,
# so a fresh kernel would otherwise fail here with a NameError.
# The membership guard keeps re-runs from adding duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

* Default

In [None]:
import lasio
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter
import multiprocessing

* Pandas Options

In [None]:
# Show every column when a wide frame is displayed.
# The fully-qualified key is required: the short form 'max_columns' also
# matches 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', None)

* Source Code

In [None]:
from Source.Utils import welllog
from Source.Utils import multi_df
from Source.Utils import well_plot

* Tqdm Progress Bar

In [None]:
%%capture
from tqdm import tqdm_notebook

# Checkpoint import

In [None]:
# Load the gzip-compressed checkpoint CSV into `total_df`.
total_df = pd.read_csv(
    filepath_or_buffer='../checkpoints/total_df.csv.gz',
    compression='gzip',
)

In [None]:
# Preview the first five rows of the loaded checkpoint.
total_df.head(n=5)

## Well clustering

* Preparing the clustering dataset

In [None]:
# Build one feature vector per well by stacking its log curves end to end,
# in this fixed order: GR, RHOB, NPHI, DTC, RDEP.
log_curves = ['GR', 'RHOB', 'NPHI', 'DTC', 'RDEP']

wells = total_df['WELL_NAME'].unique().tolist()

tmp_dict = {}

# A single groupby pass hands over each well's rows once, instead of
# re-scanning the whole table with a boolean mask five times per well.
# sort=False keeps wells in order of first appearance (same order as `wells`).
# Note: rows whose WELL_NAME is missing are skipped by groupby.
rows_by_well = total_df.groupby('WELL_NAME', sort=False)

for well, well_rows in tqdm_notebook(rows_by_well, desc='Process Progress'):
    tmp_dict[well] = np.concatenate([well_rows[curve].values for curve in log_curves])

# Wells have different sample counts; wrapping each vector in a Series lets
# pandas pad the shorter columns with NaN.
df_clust = pd.DataFrame({well: pd.Series(values) for well, values in tmp_dict.items()})

* Filtering Wells with High percentage of Null values

In [None]:
# Number of wells (one column per well) before filtering.
df_clust.shape[1]

In [None]:
# Count missing samples per well (column).
null_count = df_clust.isnull().sum()

# Threshold = median (50th percentile) of the per-well null counts.
tresh = int(np.percentile(null_count, 50))

# Keep only wells whose null count does not exceed the threshold.
# One boolean column mask replaces the per-column in-place drop loop;
# .copy() keeps df_clust_clean independent of df_clust.
df_clust_clean = df_clust.loc[:, null_count <= tresh].copy()


* Filling Null values

In [None]:
# Replace the remaining NaNs in each well's column with that column's mean.
# NOTE(review): each column holds a well's five curves concatenated, so the
# column mean blends all curves together — confirm this is intended.
df_clust_clean = df_clust_clean.fillna(value=df_clust_clean.mean())

In [None]:
# Largest per-column null count after imputation.
df_clust_clean.isna().sum().max()

* Importing the geographic locations of the drilled wells

In [None]:
# Load the wellbore exploration table from the Data directory.
well_exploration = pd.read_csv(
    filepath_or_buffer='../Data/wellbore_exploration_all.csv',
)

In [None]:
# Preview the first five rows of the wellbore table.
well_exploration.head(n=5)

In [None]:
# Keep only the wellbore name and its two UTM coordinate columns.
geo_well = well_exploration.loc[:, ['wlbWellboreName', 'wlbNsUtm', 'wlbEwUtm']]

In [None]:
# Preview the first five rows of the coordinate table.
geo_well.head(n=5)

* Transposing the dataframe to prepare for the clustering algorithm

In [None]:
# Preview only the first rows: with display.max_columns disabled, rendering
# the bare frame would print every well column of the full table.
df_clust_clean.head()

In [None]:
# Transpose so that each row is a well and each column a sample position.
df_clust_t = df_clust_clean.transpose()

In [None]:
# (number of wells, number of feature columns)
(len(df_clust_t.index), len(df_clust_t.columns))

* Looking up each well's Northing and Easting (UTM) coordinates and appending them to the clustering dataframe

In [None]:
# Well names that survived the null filter (the columns of df_clust_clean).
list_of_wells = df_clust_clean.keys()

In [None]:
# Add the two coordinate columns, initialised to 0.0; the value 0 is later
# used to identify wells that received no coordinates.
for utm_column in ('UTM-N', 'UTM-E'):
    df_clust_t[utm_column] = 0.0

In [None]:
# Build the name -> coordinates lookup once, instead of rebuilding the list of
# wellbore names and re-filtering geo_well on every iteration.
# drop_duplicates keeps the first record per wellbore name, so each lookup
# returns a scalar; the previous array assignment failed when a name occurred
# more than once.
utm_by_well = (
    geo_well
    .drop_duplicates(subset='wlbWellboreName')
    .set_index('wlbWellboreName')
)

for well_name in tqdm_notebook(list_of_wells, desc='Process Progress'):
    # Wells without a match keep the 0.0 placeholder in both columns.
    if well_name in utm_by_well.index:
        df_clust_t.loc[well_name, 'UTM-N'] = utm_by_well.at[well_name, 'wlbNsUtm']
        df_clust_t.loc[well_name, 'UTM-E'] = utm_by_well.at[well_name, 'wlbEwUtm']

In [None]:
# Inspect the coordinates assigned to each well.
df_clust_t.loc[:, ['UTM-N', 'UTM-E']]

In [None]:
# Discard wells whose Northing is still the 0 placeholder (no coordinates found).
has_coordinates = df_clust_t['UTM-N'].ne(0)
df_clust_t = df_clust_t.loc[has_coordinates]

* Normalizing columns 

In [None]:
# Work on an independent copy so that scaling leaves df_clust_t untouched.
df_clust_norm = df_clust_t.copy(deep=True)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column to the [0, 1] range.
# The scaler is given the raw ndarray rather than the frame: the column labels
# mix integers with the 'UTM-N'/'UTM-E' strings, and recent scikit-learn
# raises a TypeError on mixed-type feature names.
scaled_values = MinMaxScaler().fit_transform(df_clust_norm.values)

# Re-attach the labels of the frame that was scaled, so shapes always agree.
df_clust_norm = pd.DataFrame(
    scaled_values,
    index=df_clust_norm.index,
    columns=df_clust_norm.columns,
)

# NOTE(review): the K-means cells in this notebook fit on `df_clust_t`, not on
# this normalised frame — confirm which one is intended.

* Importing K-means algorithm

In [None]:
from sklearn.cluster import KMeans

* Evaluating the optimum number of clusters

In [None]:
# Within-cluster sum of squares (inertia) recorded for each candidate k.
wcss = []

# Candidate numbers of clusters: 1 through 10.
k_number_clusters = np.arange(1, 11)

# Feature matrix handed to K-means:
#   * a 'Cluster' column left by an earlier run of the final-clustering cell
#     is excluded, so this cell gives the same result when re-run;
#   * .values passes a plain ndarray, because the column labels mix integers
#     and strings and recent scikit-learn rejects mixed-type feature names.
# NOTE(review): this fits on the un-normalised `df_clust_t`; `df_clust_norm`
# is built earlier but not used here — confirm which one is intended.
cluster_features = df_clust_t.drop(columns='Cluster', errors='ignore').values

for k in tqdm_notebook(k_number_clusters, desc='K-Means Hyperparameter Tunning'):

    # `n_jobs` is no longer passed: it was deprecated and then removed from
    # KMeans in scikit-learn 1.0. n_init=10 pins the historical default.
    kmeans = KMeans(n_clusters=int(k), init="k-means++", random_state=42, max_iter=500, n_init=10)

    kmeans.fit(cluster_features)

    # Store the inertia for the elbow plot.
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: inertia (WCSS) against the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 6))
elbow_ax.plot(k_number_clusters, wcss, color="red", linewidth=2, marker="8")
elbow_ax.set_xlabel("K Clusters Value")
elbow_ax.set_xticks(np.arange(1, 11, 1))
elbow_ax.set_ylabel("WCSS")
elbow_ax.set_title('K-Means Elbow Plot Evaluation Method')
plt.show()

In [None]:
# Number of clusters used for the final K-means fit
# (presumably read off the elbow plot — TODO confirm).
N_CLUSTERS = 4

In [None]:
# Final K-means model with the chosen number of clusters.
# `n_jobs` is no longer passed: it was deprecated and then removed from KMeans
# in scikit-learn 1.0. n_init=10 pins the historical default.
optimum_clustering = KMeans(n_clusters=N_CLUSTERS, init="k-means++", random_state=42, max_iter=500, n_init=10)

# Fit on the feature columns only:
#   * an existing 'Cluster' column is excluded, so re-running this cell does
#     not feed the previous labels back in as a feature;
#   * .values passes a plain ndarray, because the column labels mix integers
#     and strings and recent scikit-learn rejects mixed-type feature names.
# NOTE(review): this fits on the un-normalised `df_clust_t`; `df_clust_norm`
# is built earlier but not used here — confirm which one is intended.
cluster_features = df_clust_t.drop(columns='Cluster', errors='ignore').values

df_clust_t['Cluster'] = optimum_clustering.fit_predict(cluster_features)

In [None]:
# Preview the first five wells together with their assigned cluster label.
df_clust_t.head(n=5)

In [None]:
# Plot wells at their UTM coordinates, coloured by cluster label.
cluster_fig, cluster_ax = plt.subplots(figsize=(12, 6))

sns.scatterplot(x="UTM-E", y="UTM-N", hue="Cluster", palette='bright', data=df_clust_t, ax=cluster_ax)


# Checkpoint

In [None]:
# Write the filtered, mean-filled well matrix to a gzip-compressed CSV.
# (The stray bare `df_clust_clean` expression was removed: it was not the last
# statement of the cell, so it displayed nothing.)
path_file_df_clust_clean = '../checkpoints/df_clust_clean.csv.gz'

# index=False: the row index is not written to the file.
df_clust_clean.to_csv(path_file_df_clust_clean, index=False, compression='gzip')