In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from tqdm import tqdm
import time

train = pd.read_csv("CSV_train.csv",low_memory=False,delimiter=';')
test=pd.read_csv("CSV_test.csv",low_memory=False,delimiter=',')
hidden=pd.read_csv("CSV_hidden_test.csv",low_memory=False,delimiter=',')

In [2]:
# storing length of datasets 
train_len = train.shape[0] 
test_len = test.shape[0]
All_data = pd.concat((train,test,hidden)).reset_index(drop=True) 

lithology_keys = {30000: 'Sandstone',
                 65030: 'Sandstone/Shale',
                 65000: 'Shale',
                 80000: 'Marl',
                 74000: 'Dolomite',
                 70000: 'Limestone',
                 70032: 'Chalk',
                 88000: 'Halite',
                 86000: 'Anhydrite',
                 99000: 'Tuff',
                 90000: 'Coal',
                 93000: 'Basement'}
All_data['Lithology'] = All_data['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_keys)
All_data

Unnamed: 0,WELL,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,GROUP,FORMATION,CALI,RSHA,RMED,...,DTS,DCAL,DRHO,MUDWEIGHT,RMIC,ROPA,RXO,FORCE_2020_LITHOFACIES_LITHOLOGY,FORCE_2020_LITHOFACIES_CONFIDENCE,Lithology
0,15/9-13,494.5280,437641.96875,6470972.5,-469.501831,NORDLAND GP.,,19.480835,,1.611410,...,,,-0.574928,,,,,65000,1.0,Shale
1,15/9-13,494.6800,437641.96875,6470972.5,-469.653809,NORDLAND GP.,,19.468800,,1.618070,...,,,-0.570188,,,,,65000,1.0,Shale
2,15/9-13,494.8320,437641.96875,6470972.5,-469.805786,NORDLAND GP.,,19.468800,,1.626459,...,,,-0.574245,,,,,65000,1.0,Shale
3,15/9-13,494.9840,437641.96875,6470972.5,-469.957794,NORDLAND GP.,,19.459282,,1.621594,...,,,-0.586315,,,,,65000,1.0,Shale
4,15/9-13,495.1360,437641.96875,6470972.5,-470.109772,NORDLAND GP.,,19.453100,,1.602679,...,,,-0.597914,,,,,65000,1.0,Shale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429689,35/9-7,2973.2988,536096.06250,6793022.0,-2943.444580,BAAT GP.,Etive Fm.,8.276272,,2.820439,...,136.911575,,0.502458,,2.311106,24.306124,,65000,2.0,Shale
1429690,35/9-7,2973.4508,536096.06250,6793022.0,-2943.595947,BAAT GP.,Etive Fm.,8.267273,,3.020778,...,137.583923,,0.374753,,1.853418,22.201078,,65000,2.0,Shale
1429691,35/9-7,2973.6028,536096.06250,6793022.0,-2943.747559,BAAT GP.,Etive Fm.,8.250099,,2.795711,...,138.310898,,0.211487,,1.325961,20.096741,,65000,2.0,Shale
1429692,35/9-7,2973.7548,536096.06250,6793022.0,-2943.899170,BAAT GP.,Etive Fm.,,,2.658694,...,137.592819,,0.147950,,1.260347,17.992323,,65000,2.0,Shale


In [3]:
#dropping columns with high missing values
drop_cols = ['SGR', 'ROPA', 'RXO', 'MUDWEIGHT','DCAL','RMIC','FORCE_2020_LITHOFACIES_CONFIDENCE']
All_data_drop = All_data.drop(drop_cols, axis=1)
# encoding categorical variables
All_data_drop['GROUP_encoded'] = All_data_drop['GROUP'].astype('category')
All_data_drop['GROUP_encoded'] = All_data_drop['GROUP_encoded'].cat.codes

All_data_drop['FORMATION_encoded'] = All_data_drop['FORMATION'].astype('category')
All_data_drop['FORMATION_encoded'] = All_data_drop['FORMATION_encoded'].cat.codes

All_data_drop['WELL_encoded'] = All_data_drop['WELL'].astype('category')
All_data_drop['WELL_encoded'] = All_data_drop['WELL_encoded'].cat.codes

All_data_drop['Lithology_encoded'] = All_data_drop['Lithology'].astype('category')
All_data_drop['Lithology_encoded'] = All_data_drop['Lithology_encoded'].cat.codes

In [4]:
All_data_drop

Unnamed: 0,WELL,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,GROUP,FORMATION,CALI,RSHA,RMED,...,BS,ROP,DTS,DRHO,FORCE_2020_LITHOFACIES_LITHOLOGY,Lithology,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,15/9-13,494.5280,437641.96875,6470972.5,-469.501831,NORDLAND GP.,,19.480835,,1.611410,...,,34.636410,,-0.574928,65000,Shale,6,-1,1,10
1,15/9-13,494.6800,437641.96875,6470972.5,-469.653809,NORDLAND GP.,,19.468800,,1.618070,...,,34.636410,,-0.570188,65000,Shale,6,-1,1,10
2,15/9-13,494.8320,437641.96875,6470972.5,-469.805786,NORDLAND GP.,,19.468800,,1.626459,...,,34.779556,,-0.574245,65000,Shale,6,-1,1,10
3,15/9-13,494.9840,437641.96875,6470972.5,-469.957794,NORDLAND GP.,,19.459282,,1.621594,...,,39.965164,,-0.586315,65000,Shale,6,-1,1,10
4,15/9-13,495.1360,437641.96875,6470972.5,-470.109772,NORDLAND GP.,,19.453100,,1.602679,...,,57.483765,,-0.597914,65000,Shale,6,-1,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429689,35/9-7,2973.2988,536096.06250,6793022.0,-2943.444580,BAAT GP.,Etive Fm.,8.276272,,2.820439,...,8.5,15.195305,136.911575,0.502458,65000,Shale,0,16,113,10
1429690,35/9-7,2973.4508,536096.06250,6793022.0,-2943.595947,BAAT GP.,Etive Fm.,8.267273,,3.020778,...,8.5,15.770223,137.583923,0.374753,65000,Shale,0,16,113,10
1429691,35/9-7,2973.6028,536096.06250,6793022.0,-2943.747559,BAAT GP.,Etive Fm.,8.250099,,2.795711,...,8.5,16.418465,138.310898,0.211487,65000,Shale,0,16,113,10
1429692,35/9-7,2973.7548,536096.06250,6793022.0,-2943.899170,BAAT GP.,Etive Fm.,,,2.658694,...,8.5,17.037945,137.592819,0.147950,65000,Shale,0,16,113,10


In [5]:
#dropping categorial features replaces beforehan by encoded features
drop2 = All_data_drop.drop(['GROUP', 'FORMATION','WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','Lithology'], axis=1)

# splitting dataset into training, test, and hidden sets
train_prep = drop2[:train_len].copy()
test_prep = drop2[train_len:(train_len+test_len)].copy()
hidden_prep = drop2[(train_len+test_len):].copy()

In [6]:
drop2

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,494.5280,437641.96875,6470972.5,-469.501831,19.480835,,1.611410,1.798681,1.884186,80.200851,...,161.131180,24.612379,,34.636410,,-0.574928,6,-1,1,10
1,494.6800,437641.96875,6470972.5,-469.653809,19.468800,,1.618070,1.795641,1.889794,79.262886,...,160.603470,23.895531,,34.636410,,-0.570188,6,-1,1,10
2,494.8320,437641.96875,6470972.5,-469.805786,19.468800,,1.626459,1.800733,1.896523,74.821999,...,160.173615,23.916357,,34.779556,,-0.574245,6,-1,1,10
3,494.9840,437641.96875,6470972.5,-469.957794,19.459282,,1.621594,1.801517,1.891913,72.878922,...,160.149429,23.793688,,39.965164,,-0.586315,6,-1,1,10
4,495.1360,437641.96875,6470972.5,-470.109772,19.453100,,1.602679,1.795299,1.880034,71.729141,...,160.128342,24.104078,,57.483765,,-0.597914,6,-1,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429689,2973.2988,536096.06250,6793022.0,-2943.444580,8.276272,,2.820439,3.158570,,90.720284,...,75.260658,,8.5,15.195305,136.911575,0.502458,0,16,113,10
1429690,2973.4508,536096.06250,6793022.0,-2943.595947,8.267273,,3.020778,3.332977,,87.062027,...,74.868301,,8.5,15.770223,137.583923,0.374753,0,16,113,10
1429691,2973.6028,536096.06250,6793022.0,-2943.747559,8.250099,,2.795711,3.044179,,86.115921,...,74.848122,,8.5,16.418465,138.310898,0.211487,0,16,113,10
1429692,2973.7548,536096.06250,6793022.0,-2943.899170,,,2.658694,2.847681,,89.497131,...,74.964027,,8.5,17.037945,137.592819,0.147950,0,16,113,10


In [7]:
train_prep1= train_prep.copy()
test_prep1= test_prep.copy()
hidden_prep1= hidden_prep.copy()

In [8]:
#Inputing missing values by introducing median 
from sklearn.impute import SimpleImputer
miss = SimpleImputer(missing_values=np.nan, strategy='median')
miss.fit(drop2)
All_imp = miss.fit_transform(drop2)
All_imp=pd.DataFrame(All_imp, columns=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO',
         'GROUP_encoded',
       'FORMATION_encoded', 'WELL_encoded','Lithology_encoded'])
All_imp

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,494.5280,437641.96875,6470972.5,-469.501831,19.480835,1.398049,1.611410,1.798681,1.884186,80.200851,...,161.131180,24.612379,12.250001,34.636410,189.362198,-0.574928,6.0,-1.0,1.0,10.0
1,494.6800,437641.96875,6470972.5,-469.653809,19.468800,1.398049,1.618070,1.795641,1.889794,79.262886,...,160.603470,23.895531,12.250001,34.636410,189.362198,-0.570188,6.0,-1.0,1.0,10.0
2,494.8320,437641.96875,6470972.5,-469.805786,19.468800,1.398049,1.626459,1.800733,1.896523,74.821999,...,160.173615,23.916357,12.250001,34.779556,189.362198,-0.574245,6.0,-1.0,1.0,10.0
3,494.9840,437641.96875,6470972.5,-469.957794,19.459282,1.398049,1.621594,1.801517,1.891913,72.878922,...,160.149429,23.793688,12.250001,39.965164,189.362198,-0.586315,6.0,-1.0,1.0,10.0
4,495.1360,437641.96875,6470972.5,-470.109772,19.453100,1.398049,1.602679,1.795299,1.880034,71.729141,...,160.128342,24.104078,12.250001,57.483765,189.362198,-0.597914,6.0,-1.0,1.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429689,2973.2988,536096.06250,6793022.0,-2943.444580,8.276272,1.398049,2.820439,3.158570,2.331407,90.720284,...,75.260658,54.270451,8.500000,15.195305,136.911575,0.502458,0.0,16.0,113.0,10.0
1429690,2973.4508,536096.06250,6793022.0,-2943.595947,8.267273,1.398049,3.020778,3.332977,2.331407,87.062027,...,74.868301,54.270451,8.500000,15.770223,137.583923,0.374753,0.0,16.0,113.0,10.0
1429691,2973.6028,536096.06250,6793022.0,-2943.747559,8.250099,1.398049,2.795711,3.044179,2.331407,86.115921,...,74.848122,54.270451,8.500000,16.418465,138.310898,0.211487,0.0,16.0,113.0,10.0
1429692,2973.7548,536096.06250,6793022.0,-2943.899170,12.515673,1.398049,2.658694,2.847681,2.331407,89.497131,...,74.964027,54.270451,8.500000,17.037945,137.592819,0.147950,0.0,16.0,113.0,10.0


In [9]:
# print(All_imp['GR'].quantile(0.50)) 
# print(All_imp['GR'].quantile(0.95)) 
# All_imp['GR'] = np.where(All_imp['GR'] > 150, 67, All_imp['GR'])
# All_imp['GR'].describe()

In [10]:
All_imp.describe()

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
count,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,...,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0,1429694.0
mean,2216.632,484685.7,6680881.0,-2166.097,13.06219,5.929846,4.633407,11.05773,2.297625,70.47302,...,112.2697,56.15993,11.9803,64.89179,193.5923,0.01135653,6.6813,36.83317,56.81314,9.139322
std,992.9555,35097.49,131117.2,955.4058,3.575119,70.92686,50.0156,106.7479,0.2336318,34.08658,...,28.86977,61.74737,2.396487,943.5014,36.22618,6.21591,3.148458,23.68314,32.79013,1.501331
min,136.086,423237.5,6406641.0,-5395.563,2.344,0.0001,-0.008418695,0.03170056,0.7209712,0.1092843,...,7.415132,-999.0,6.0,-0.117977,69.16318,-7429.339,-1.0,-1.0,0.0,0.0
25%,1457.365,454469.5,6590310.0,-2833.309,9.973806,1.398049,0.9265758,0.910423,2.131772,47.37605,...,88.42327,42.87162,12.25,19.29947,189.3622,-0.005890843,5.0,18.0,32.0,9.0
50%,2121.598,478648.8,6737678.0,-2088.677,12.51567,1.398049,1.455281,1.45352,2.331407,67.86042,...,108.3101,54.27045,12.25,19.29947,189.3622,0.003207336,6.0,37.0,55.0,10.0
75%,2895.131,520153.2,6784878.0,-1435.2,15.21438,1.398049,2.593216,2.59318,2.471782,89.24216,...,138.7327,65.11991,12.25,19.29947,189.3622,0.01877124,9.0,58.0,83.0,10.0
max,5436.632,572632.8,6856661.0,-111.086,28.279,2193.905,1988.616,1999.887,3.45782,1141.292,...,320.4789,526.5473,26.0,47015.12,676.5781,2.836938,13.0,69.0,117.0,11.0


In [11]:
All_imp.clip(lower=All_imp.quantile(0.05), upper=All_imp.quantile(0.98), axis=1)

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,729.48095,437780.34375,6470972.5,-518.136710,19.480835,1.398049,1.611410,1.798681,1.899048,80.200851,...,161.131180,24.612379,12.250001,34.636410,189.362198,-0.031164,6.0,-1.0,3.0,10.0
1,729.48095,437780.34375,6470972.5,-518.136710,19.468800,1.398049,1.618070,1.795641,1.899048,79.262886,...,160.603470,23.895531,12.250001,34.636410,189.362198,-0.031164,6.0,-1.0,3.0,10.0
2,729.48095,437780.34375,6470972.5,-518.136710,19.468800,1.398049,1.626459,1.800733,1.899048,74.821999,...,160.173615,23.916357,12.250001,34.779556,189.362198,-0.031164,6.0,-1.0,3.0,10.0
3,729.48095,437780.34375,6470972.5,-518.136710,19.459282,1.398049,1.621594,1.801517,1.899048,72.878922,...,160.149429,23.793688,12.250001,39.965164,189.362198,-0.031164,6.0,-1.0,3.0,10.0
4,729.48095,437780.34375,6470972.5,-518.136710,19.453100,1.398049,1.602679,1.795299,1.899048,71.729141,...,160.128342,24.104078,12.250001,57.483765,189.362198,-0.031164,6.0,-1.0,3.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429689,2973.29880,536096.06250,6793022.0,-2943.444580,8.409431,1.398049,2.820439,3.158570,2.331407,90.720284,...,75.260658,54.270451,8.500000,15.195305,154.237827,0.121060,2.0,16.0,113.0,10.0
1429690,2973.45080,536096.06250,6793022.0,-2943.595947,8.409431,1.398049,3.020778,3.332977,2.331407,87.062027,...,74.868301,54.270451,8.500000,15.770223,154.237827,0.121060,2.0,16.0,113.0,10.0
1429691,2973.60280,536096.06250,6793022.0,-2943.747559,8.409431,1.398049,2.795711,3.044179,2.331407,86.115921,...,74.848122,54.270451,8.500000,16.418465,154.237827,0.121060,2.0,16.0,113.0,10.0
1429692,2973.75480,536096.06250,6793022.0,-2943.899170,12.515673,1.398049,2.658694,2.847681,2.331407,89.497131,...,74.964027,54.270451,8.500000,17.037945,154.237827,0.121060,2.0,16.0,113.0,10.0


In [12]:
train_imp = All_imp[:train_len].copy()
test_imp = All_imp[train_len:(train_len+test_len)].copy()
hidden_imp = All_imp[(train_len+test_len):].copy()

In [13]:
print(train_imp.shape)
print(test_imp.shape)
print(hidden_imp.shape)

(1170511, 22)
(136786, 22)
(122397, 22)


In [14]:
train_imp

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,494.5280,437641.96875,6470972.5,-469.501831,19.480835,1.398049,1.611410,1.798681,1.884186,80.200851,...,161.131180,24.612379,12.250001,34.636410,189.362198,-0.574928,6.0,-1.0,1.0,10.0
1,494.6800,437641.96875,6470972.5,-469.653809,19.468800,1.398049,1.618070,1.795641,1.889794,79.262886,...,160.603470,23.895531,12.250001,34.636410,189.362198,-0.570188,6.0,-1.0,1.0,10.0
2,494.8320,437641.96875,6470972.5,-469.805786,19.468800,1.398049,1.626459,1.800733,1.896523,74.821999,...,160.173615,23.916357,12.250001,34.779556,189.362198,-0.574245,6.0,-1.0,1.0,10.0
3,494.9840,437641.96875,6470972.5,-469.957794,19.459282,1.398049,1.621594,1.801517,1.891913,72.878922,...,160.149429,23.793688,12.250001,39.965164,189.362198,-0.586315,6.0,-1.0,1.0,10.0
4,495.1360,437641.96875,6470972.5,-470.109772,19.453100,1.398049,1.602679,1.795299,1.880034,71.729141,...,160.128342,24.104078,12.250001,57.483765,189.362198,-0.597914,6.0,-1.0,1.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170506,3169.3124,478648.81250,6737678.5,-2088.677002,8.423170,1.398049,1.455281,1.453520,2.527984,77.654900,...,108.310127,54.270451,8.500000,27.674368,189.362198,-0.001763,11.0,7.0,117.0,8.0
1170507,3169.4644,478648.81250,6737678.5,-2088.677002,8.379244,1.398049,1.455281,1.453520,2.537613,75.363937,...,108.310127,54.270451,8.500000,28.024338,189.362198,-0.007600,11.0,7.0,117.0,9.0
1170508,3169.6164,478648.81250,6737678.5,-2088.677002,8.350248,1.398049,1.455281,1.453520,2.491860,66.452843,...,108.310127,54.270451,8.500000,28.091282,189.362198,-0.018297,11.0,7.0,117.0,9.0
1170509,3169.7684,478648.81250,6737678.5,-2088.677002,8.313779,1.398049,1.455281,1.453520,2.447539,55.784817,...,108.310127,54.270451,8.500000,28.019775,189.362198,-0.011438,11.0,7.0,117.0,9.0


In [15]:
train_imp.describe()

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
count,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,...,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0,1170511.0
mean,2184.087,485566.8,6681795.0,-2138.068,13.13538,6.406896,4.869328,10.6041,2.291382,70.9137,...,113.0094,58.52595,12.06421,73.27165,191.6435,0.01079367,6.61994,36.44059,57.52154,9.169452
std,997.1821,34403.46,127674.9,966.475,3.657786,74.01863,53.75806,113.4141,0.2357325,34.23149,...,28.96434,65.84702,2.548096,1042.454,27.98447,6.869625,3.136,23.94241,31.96306,1.455014
min,136.086,426898.8,6406641.0,-5395.563,2.344,0.0001,-0.008418695,0.03170056,0.7209712,0.1092843,...,7.415132,-999.0,6.0,-0.117977,69.16318,-7429.339,-1.0,-1.0,1.0,0.0
25%,1418.597,454801.9,6593126.0,-2804.552,9.882808,1.296196,0.9290146,0.9148633,2.123109,47.62722,...,89.07137,42.05721,12.25,19.29947,189.3622,-0.006882701,5.0,18.0,34.0,9.0
50%,2076.605,477769.9,6737314.0,-2055.283,12.51567,1.398049,1.455281,1.44979,2.331407,68.36763,...,108.3101,54.27045,12.25,19.29947,189.3622,0.003207336,6.0,37.0,56.0,10.0
75%,2864.393,520131.4,6784877.0,-1397.964,15.74931,1.510922,2.5871,2.537876,2.462978,89.03551,...,139.3171,70.39606,12.25,19.29947,189.3622,0.01555134,9.0,58.0,85.0,10.0
max,5436.632,572632.8,6856661.0,-111.086,28.279,2193.905,1988.616,1999.887,3.45782,1076.964,...,320.4789,526.5473,26.0,47015.12,676.5781,2.836938,13.0,69.0,117.0,11.0


In [16]:


# calculating p-impedance
train_imp['PI'] = train_imp.RHOB * (1e6/train_imp.DTC)
test_imp['PI'] = test_imp.RHOB * (1e6/test_imp.DTC)
hidden_imp['PI'] = hidden_imp.RHOB * (1e6/hidden_imp.DTC)

# calculating s-impedance
train_imp['SI'] = train_imp.RHOB * (1e6/train_imp.DTS) 
test_imp['SI'] = test_imp.RHOB * (1e6/test_imp.DTS) 
hidden_imp['SI'] = hidden_imp.RHOB * (1e6/hidden_imp.DTS) 

#calculating Shear modulus (G)
train_imp['G'] = ((1e6/train_imp.DTS)**2) * train_imp.RHOB
test_imp['G'] = ((1e6/test_imp.DTS)**2) * test_imp.RHOB
hidden_imp['G'] = ((1e6/hidden_imp.DTS)**2) * hidden_imp.RHOB

#calculating Bulk modulus (K)
train_imp['K'] = (((1e6/train_imp.DTC)**2) * train_imp.RHOB) - (4 * train_imp.G/3)
test_imp['K'] = (((1e6/test_imp.DTC)**2) * test_imp.RHOB) - (4 * test_imp.G/3)
hidden_imp['K'] = (((1e6/hidden_imp.DTC)**2) * hidden_imp.RHOB) - (4 * hidden_imp.G/3)

# calculate the shale volume
train_imp["VSHALE"] = (train_imp.GR - np.min(train_imp.GR)) / (np.max(train_imp.GR) - np.min(train_imp.GR))
test_imp["VSHALE"] = (test_imp.GR - np.min(test_imp.GR)) / (np.max(test_imp.GR) - np.min(test_imp.GR))
hidden_imp["VSHALE"] = (hidden_imp.GR - np.min(hidden_imp.GR)) / (np.max(hidden_imp.GR) - np.min(hidden_imp.GR))
#train_imp1.head()

# calculate the total porosity
train_imp['PHIT'] = np.sqrt(((((train_imp.NPHI)*(train_imp.NPHI)+(train_imp.RHOB)*(train_imp.RHOB))))/2)
test_imp['PHIT'] = np.sqrt(((((test_imp.NPHI)*(test_imp.NPHI)+(test_imp.RHOB)*(test_imp.RHOB))))/2)
hidden_imp['PHIT'] = np.sqrt(((((hidden_imp.NPHI)*(hidden_imp.NPHI)+(hidden_imp.RHOB)*(hidden_imp.RHOB))))/2)
#train_imp1.tail()

# calculate effective porosity
train_imp['PHIE'] = train_imp.PHIT*(1-train_imp.VSHALE)
train_imp = train_imp[train_imp['PHIE'] !=0]
train_imp['PHIE'] = train_imp['PHIE'].abs() 

test_imp['PHIE'] = test_imp.PHIT*(1-test_imp.VSHALE)
test_imp = test_imp[test_imp['PHIE'] !=0]
test_imp['PHIE'] = test_imp['PHIE'].abs() 

hidden_imp['PHIE'] = hidden_imp.PHIT*(1-hidden_imp.VSHALE)
hidden_imp = hidden_imp[hidden_imp['PHIE'] !=0]
hidden_imp['PHIE'] = hidden_imp['PHIE'].abs() 
#train_imp1.tail()

# # display the log for preview
# plt.figure(figsize=(5,10))
# plt.subplot(122)
# plt.title('PHIE')
# plt.plot('PHIE', 'SI','PI','G','K','VSHALE','DEPTH_MD', data=train_imp)
# plt.gca().invert_yaxis()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_imp['PHIE'] = train_imp['PHIE'].abs()


In [17]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
x_header=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO','K','G','VSHALE','PHIT','PHIE','SI','PI',
       'GROUP_encoded', 'FORMATION_encoded', 'WELL_encoded']
y_header=['Lithology_encoded']
x_train = train_imp[x_header]
y_train = train_imp[y_header]
x_test = test_imp[x_header]
y_test = test_imp[y_header]
x_hidden = hidden_imp[x_header]
y_hidden = hidden_imp[y_header]

##Min-Max scaler 
scaler = MinMaxScaler()
x_train_scaled = x_train.copy()
x_test_scaled = x_test.copy()
x_hidden_scaled = x_hidden.copy()

x_train_scaled.iloc[:,:25] = scaler.fit_transform(x_train_scaled.iloc[:,:25])
x_test_scaled.iloc[:,:25] = scaler.transform(x_test_scaled.iloc[:,:25])
x_hidden_scaled.iloc[:,:25] = scaler.transform(x_hidden_scaled.iloc[:,:25])

In [18]:
#Supervised Algorithms
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsRegressor
from pprint import pprint
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
import xgboost
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
#Comparing base models accuracies by using k-fold cross validation - 10 folds

from sklearn.model_selection import cross_val_score



model_knn = KNeighborsClassifier(leaf_size= 20, metric= 'minkowski', n_neighbors= 1, p=1, weights='uniform')

model_knn.fit(x_train_scaled, y_train.values.ravel())

train_pred_knn = model_knn.predict(x_train_scaled)
open_pred_knn = model_knn.predict(x_test_scaled)
hidden_pred_knn = model_knn.predict(x_hidden_scaled)
#Printing Reports 


#Printing Reports


  from pandas import MultiIndex, Int64Index


In [19]:
A = np.load('penalty_matrix.npy')
def score(y_true, y_pred):
    S = 0.0
    y_true = y_true.astype(int)
    y_pred = y_pred.astype(int)
    for i in range(0, y_true.shape[0]):
        S -= A[y_true[i], y_pred[i]]
    return S/y_true.shape[0]

In [20]:
from sklearn.metrics import classification_report, accuracy_score
print('-----------------------TRAIN SET REPORT---------------------')
print("Open set RMSE:", np.sqrt(mean_squared_error(y_train, train_pred_knn)))
print('Open set penalty matrix score:', score(y_train.values, train_pred_knn))
print('Open set report:', classification_report(y_train, train_pred_knn))
print('-----------------------OPEN SET REPORT---------------------')
print("Open set RMSE:", np.sqrt(mean_squared_error(y_test, open_pred_knn)))
print('Open set penalty matrix score:', score(y_test.values, open_pred_knn))
print('Open set report:', classification_report(y_test, open_pred_knn))
print('-----------------------HIDDEN SET REPORT---------------------')
print("Hidden set RMSE:", np.sqrt(mean_squared_error(y_hidden, hidden_pred_knn)))
print('Hidden set penalty matrix score:', score(y_hidden.values, hidden_pred_knn))
print('Hidden set report:', classification_report(y_hidden, hidden_pred_knn))

-----------------------TRAIN SET REPORT---------------------
Open set RMSE: 0.0
Open set penalty matrix score: [0.]
Open set report:               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1085
         1.0       1.00      1.00      1.00       103
         2.0       1.00      1.00      1.00     10513
         3.0       1.00      1.00      1.00      3820
         4.0       1.00      1.00      1.00      1688
         5.0       1.00      1.00      1.00      8213
         6.0       1.00      1.00      1.00     56320
         7.0       1.00      1.00      1.00     33329
         8.0       1.00      1.00      1.00    168937
         9.0       1.00      1.00      1.00    150455
        10.0       1.00      1.00      1.00    720802
        11.0       1.00      1.00      1.00     15245

    accuracy                           1.00   1170510
   macro avg       1.00      1.00      1.00   1170510
weighted avg       1.00      1.00      1.00   1170510



In [21]:
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.colors as colors
facies_colors = ['#F4D03F','#7ccc19','#196F3D','#160599','#2756c4','#3891f0','#80d4ff','#87039e','#ec90fc','#FF4500','#000000','#DC7633']
facies_labels = ['SS', 'S-S', 'SH', 'MR', 'DOL','LIM', 'CH','HAL', 'AN', 'TF', 'CO', 'BS']


#Facies_color_map
facies_color_map = {}
for ind, label in enumerate(facies_labels):
    facies_color_map[label] = facies_colors[ind]
    
def pred_log(logs, well_num, facies_colors, n_pred):
    wells = logs['WELL'].unique()
    logs = logs[logs['WELL'] == wells[well_num]]
    logs = logs.sort_values(by='DEPTH_MD')        #Sorting log by depth
    cmap_facies = colors.ListedColormap(facies_colors[0:len(facies_colors)], 'indexed')
    
    top = logs.DEPTH_MD.min()
    bot = logs.DEPTH_MD.max()
       
    f, ax = plt.subplots(nrows=1, ncols=(12+n_pred), figsize=(15, 12))
    log_colors = ['black', 'red', 'blue', 'green', 'purple','black', 'red', 'blue', 'green', 'purple', 'black', 'red', 'blue', 'green', 'purple', 'black', 'black', 'red', 'blue', 'green', 'purple', 'black', 'black', 'red', 'blue', 'green', 'purple', 'black']

    for i in range(7,18):
      ax[i-7].plot(logs.iloc[:,i], logs.DEPTH_MD, color=log_colors[i])
      ax[i-7].set_ylim(top, bot)
      #ax[i-7].set_xlim(logs.iloc[:,i].min(), logs.iloc[:,i].max())

      ax[i-7].set_xlabel(str(logs.columns[i]))
      ax[i-7].invert_yaxis()
      ax[i-7].grid()

    for j in range((-1-n_pred), 0):
      label = np.repeat(np.expand_dims(logs.iloc[:,j].values, 1), 100, 0)
      im = ax[j].imshow(label, interpolation='none', aspect='auto', cmap=cmap_facies, vmin=0, vmax=12)
      ax[j].set_xlabel(str(logs.columns[j]))

    divider = make_axes_locatable(ax[-1])
    cax = divider.append_axes("right", size="20%", pad=0.05)
    cbar=plt.colorbar(im, cax=cax)
    cbar.set_label((12*' ').join(['SS', 'S-S', 'SH', 'MR', 'DOL','LIM', 'CH','HAL', 'AN', 'TF', 'CO', 'BS']))
    cbar.set_ticks(range(0,1)); cbar.set_ticklabels('')
        
    f.suptitle('WELL LOGS '+str(wells[well_num]), fontsize=14,y=0.94)

In [26]:
#Storing results
test_knn = test_imp.copy()
hidden_knn = hidden_imp.copy()
#Saving Results
test_knn['KNN_ADD'] = open_pred_knn
hidden_knn['KNN_ADD'] = hidden_pred_knn

test_knn.to_csv('test_knn.csv')
hidden_knn.to_csv('hidden_knn.csv')

In [27]:
test_knn = pd.read_csv('test_knn.csv')
hidden_knn = pd.read_csv('hidden_knn.csv')

In [24]:
x_train_scaled

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,K,G,VSHALE,PHIT,PHIE,SI,PI,GROUP_encoded,FORMATION_encoded,WELL_encoded
0,0.067624,0.073718,0.142953,0.932176,0.660761,0.000637,0.000815,0.000884,0.425020,0.077168,...,0.008466,0.103092,0.077168,0.389828,0.504086,0.232190,0.023277,6.0,-1.0,1.0
1,0.067652,0.073718,0.142953,0.932147,0.660297,0.000637,0.000818,0.000882,0.427069,0.076264,...,0.008479,0.103427,0.076264,0.391988,0.506116,0.233183,0.023562,6.0,-1.0,1.0
2,0.067681,0.073718,0.142953,0.932118,0.660297,0.000637,0.000822,0.000885,0.429527,0.071986,...,0.008491,0.103829,0.071986,0.394580,0.510327,0.234373,0.023849,6.0,-1.0,1.0
3,0.067710,0.073718,0.142953,0.932089,0.659930,0.000637,0.000820,0.000885,0.427843,0.070113,...,0.008491,0.103554,0.070113,0.392804,0.510098,0.233558,0.023744,6.0,-1.0,1.0
4,0.067738,0.073718,0.142953,0.932061,0.659692,0.000637,0.000810,0.000882,0.423503,0.069006,...,0.008491,0.102843,0.069006,0.388228,0.507442,0.231456,0.023461,6.0,-1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170506,0.572248,0.355099,0.735606,0.625774,0.234400,0.000637,0.000736,0.000711,0.660253,0.074715,...,0.011803,0.141586,0.074715,0.638733,0.682184,0.346096,0.068605,11.0,7.0,117.0
1170507,0.572277,0.355099,0.735606,0.625774,0.232707,0.000637,0.000736,0.000711,0.663771,0.072508,...,0.011816,0.142162,0.072508,0.642466,0.686463,0.347800,0.068951,11.0,7.0,117.0
1170508,0.572305,0.355099,0.735606,0.625774,0.231589,0.000637,0.000736,0.000711,0.647054,0.063922,...,0.011755,0.139426,0.063922,0.624730,0.680059,0.339705,0.067307,11.0,7.0,117.0
1170509,0.572334,0.355099,0.735606,0.625774,0.230182,0.000637,0.000736,0.000711,0.630860,0.053643,...,0.011695,0.136776,0.053643,0.607554,0.675040,0.331863,0.065714,11.0,7.0,117.0


In [28]:
#Plotting predictions - HIDDEN DATASET
for i in range(1):
  pred_log(hidden_knn, i, facies_colors, 2)

KeyError: 'WELL'