In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from tqdm import tqdm
import time

train = pd.read_csv("CSV_train.csv",low_memory=False,delimiter=';')
test=pd.read_csv("CSV_test.csv",low_memory=False,delimiter=',')
hidden=pd.read_csv("CSV_hidden_test.csv",low_memory=False,delimiter=',')

In [2]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136786 entries, 0 to 136785
Data columns (total 28 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   WELL                              136786 non-null  object 
 1   DEPTH_MD                          136786 non-null  float64
 2   X_LOC                             136727 non-null  float64
 3   Y_LOC                             136727 non-null  float64
 4   Z_LOC                             136727 non-null  float64
 5   GROUP                             136786 non-null  object 
 6   FORMATION                         129712 non-null  object 
 7   CALI                              131141 non-null  float64
 8   RSHA                              39097 non-null   float64
 9   RMED                              136199 non-null  float64
 10  RDEP                              136727 non-null  float64
 11  RHOB                              119826 non-null  f

In [3]:
# storing length of datasets 
train_len = train.shape[0] 
test_len = test.shape[0]
#All_data = pd.concat((train,test,hidden)).reset_index(drop=True) 

lithology_keys = {30000: 'Sandstone',
                 65030: 'Sandstone/Shale',
                 65000: 'Shale',
                 80000: 'Marl',
                 74000: 'Dolomite',
                 70000: 'Limestone',
                 70032: 'Chalk',
                 88000: 'Halite',
                 86000: 'Anhydrite',
                 99000: 'Tuff',
                 90000: 'Coal',
                 93000: 'Basement'}
train['Lithology'] = train['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_keys)
test['Lithology'] = test['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_keys)
hidden['Lithology'] = hidden['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(lithology_keys)

In [4]:
#dropping columns with high missing values
drop_cols = ['SGR', 'ROPA', 'RXO', 'MUDWEIGHT','DCAL','RMIC','FORCE_2020_LITHOFACIES_CONFIDENCE']
drop_test=['SGR', 'ROPA', 'RXO', 'MUDWEIGHT','DCAL','RMIC']
train_drop = train.drop(drop_cols, axis=1)
test_drop = test.drop(drop_test, axis=1)
hidden_drop = hidden.drop(drop_cols, axis=1)
# encoding train categorical variables
train_drop['GROUP_encoded'] = train_drop['GROUP'].astype('category')
train_drop['GROUP_encoded'] = train_drop['GROUP_encoded'].cat.codes

train_drop['FORMATION_encoded'] = train_drop['FORMATION'].astype('category')
train_drop['FORMATION_encoded'] = train_drop['FORMATION_encoded'].cat.codes

train_drop['WELL_encoded'] = train_drop['WELL'].astype('category')
train_drop['WELL_encoded'] = train_drop['WELL_encoded'].cat.codes

train_drop['Lithology_encoded'] = train_drop['FORCE_2020_LITHOFACIES_LITHOLOGY'].astype('category')
train_drop['Lithology_encoded'] = train_drop['Lithology_encoded'].cat.codes

# encoding test categorical variables
test_drop['GROUP_encoded'] = test_drop['GROUP'].astype('category')
test_drop['GROUP_encoded'] = test_drop['GROUP_encoded'].cat.codes

test_drop['FORMATION_encoded'] = test_drop['FORMATION'].astype('category')
test_drop['FORMATION_encoded'] = test_drop['FORMATION_encoded'].cat.codes

test_drop['WELL_encoded'] = test_drop['WELL'].astype('category')
test_drop['WELL_encoded'] = test_drop['WELL_encoded'].cat.codes

test_drop['Lithology_encoded'] = test_drop['FORCE_2020_LITHOFACIES_LITHOLOGY'].astype('category')
test_drop['Lithology_encoded'] = test_drop['Lithology_encoded'].cat.codes
hidden_drop['GROUP_encoded'] = hidden_drop['GROUP'].astype('category')
hidden_drop['GROUP_encoded'] = hidden_drop['GROUP_encoded'].cat.codes

hidden_drop['FORMATION_encoded'] = hidden_drop['FORMATION'].astype('category')
hidden_drop['FORMATION_encoded'] = hidden_drop['FORMATION_encoded'].cat.codes

hidden_drop['WELL_encoded'] = hidden_drop['WELL'].astype('category')
hidden_drop['WELL_encoded'] = hidden_drop['WELL_encoded'].cat.codes

hidden_drop['Lithology_encoded'] = hidden_drop['FORCE_2020_LITHOFACIES_LITHOLOGY'].astype('category')
hidden_drop['Lithology_encoded'] = hidden_drop['Lithology_encoded'].cat.codes

In [5]:
#dropping categorial features replaces beforehan by encoded features
train_drop1 = train_drop.drop(['GROUP', 'FORMATION','WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','Lithology'], axis=1)
test_drop1 = test_drop.drop(['GROUP', 'FORMATION','WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','Lithology'], axis=1)
hidden_drop1 = hidden_drop.drop(['GROUP', 'FORMATION','WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','Lithology'], axis=1)

In [32]:
print(train_drop1.quantile(0.10))
print(train_drop1.quantile(0.90))

DEPTH_MD             9.414652e+02
X_LOC                4.441583e+05
Y_LOC                6.468386e+06
Z_LOC               -3.461883e+03
CALI                 8.527271e+00
RSHA                 6.091504e-01
RMED                 6.818662e-01
RDEP                 6.625254e-01
RHOB                 1.966032e+00
GR                   3.246619e+01
NPHI                 1.675192e-01
PEF                  2.718434e+00
DTC                  7.592603e+01
SP                   6.876202e+00
BS                   8.500000e+00
ROP                  2.150400e+00
DTS                  1.387309e+02
DRHO                -2.196863e-02
GROUP_encoded        3.000000e+00
FORMATION_encoded   -1.000000e+00
WELL_encoded         1.000000e+01
Lithology_encoded    0.000000e+00
Name: 0.1, dtype: float64
DEPTH_MD             3.585008e+03
X_LOC                5.368416e+05
Y_LOC                6.812832e+06
Z_LOC               -9.225270e+02
CALI                 1.837270e+01
RSHA                 7.989742e+00
RMED                 6

In [6]:
# trainQ1 = train_drop1.quantile(0.25)
# trainQ3 = train_drop1.quantile(0.75)
# trainIQR = trainQ3 - trainQ1
# print(trainIQR)

# testQ1 = test_drop1.quantile(0.25)
# testQ3 = test_drop1.quantile(0.75)
# testIQR = testQ3 - testQ1
# print(testIQR)

# hiddenQ1 = hidden_drop1.quantile(0.25)
# hiddenQ3 = hidden_drop1.quantile(0.75)
# hiddenIQR = hiddenQ3 - hiddenQ1
# print(hiddenIQR)

DEPTH_MD               1445.795801
X_LOC                 65353.625000
Y_LOC                193558.500000
Z_LOC                  1419.636230
CALI                      7.281036
RSHA                      2.245227
RMED                      1.766843
RDEP                      1.646980
RHOB                      0.396377
GR                       41.408297
NPHI                      0.181587
PEF                       2.554354
DTC                      52.948624
SP                       50.990318
BS                        3.750001
ROP                      29.184794
DTS                      68.708374
DRHO                      0.030955
GROUP_encoded             4.000000
FORMATION_encoded        40.000000
WELL_encoded             45.000000
Lithology_encoded         0.000000
dtype: float64
DEPTH_MD               1586.694090
X_LOC                 42703.453100
Y_LOC                217317.500000
Z_LOC                  1490.240417
CALI                      5.128100
RSHA                      1.728497
RMED 

In [7]:

# train_drop2 = train_drop1[~((train_drop1 < (trainQ1 - 1.5 * trainIQR)) |(train_drop1 > (trainQ3 + 1.5 * trainIQR))).any(axis=1)]
# train_drop2=train_drop2.reset_index(drop=True)
# print(train_drop2.shape)
# train_drop2


(481520, 22)


Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,501.9760,437642.00000,6470972.5,-476.949677,21.002375,,1.446267,1.808564,1.696347,71.916695,...,159.475830,24.342602,,27.574333,,-0.026934,6,-1,0,1
1,502.1280,437642.00000,6470972.5,-477.101654,22.255486,,1.395273,1.716676,1.655434,70.890160,...,159.490387,24.464191,,27.431999,,-0.009692,6,-1,0,1
2,502.2800,437642.00000,6470972.5,-477.253662,22.294195,,1.402481,1.590261,1.641988,71.705124,...,159.489990,24.619265,,27.431999,,0.002679,6,-1,0,1
3,502.4320,437642.00000,6470972.5,-477.405640,22.391655,,1.426419,1.433956,1.600092,73.166786,...,159.480789,24.368204,,27.431999,,0.007460,6,-1,0,1
4,502.5840,437642.00000,6470972.5,-477.557648,23.167938,,1.380926,1.279809,1.530632,73.454750,...,159.471573,24.181400,,27.431999,,0.001195,6,-1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481515,3140.1284,444920.34375,6421578.5,-3079.271484,8.484242,3.950208,3.946853,3.424120,2.471713,76.297417,...,76.412384,,8.5,30.570990,134.650909,-0.017750,11,7,97,1
481516,3140.2804,444920.34375,6421578.5,-3079.423584,8.465464,3.895838,3.751034,3.197002,2.489856,70.762009,...,77.060547,,8.5,30.535135,132.248093,-0.001506,11,7,97,1
481517,3140.4324,444920.34375,6421578.5,-3079.575439,8.455131,3.311376,3.077763,2.632942,2.492218,66.463943,...,79.642212,,8.5,30.482046,129.424530,0.009678,11,7,97,1
481518,3140.5844,444920.34375,6421578.5,-3079.727539,8.448524,2.449606,2.242409,1.958930,2.495520,59.788101,...,82.965523,,8.5,29.961433,127.326607,0.000727,11,7,97,1


In [8]:

# test_drop2 = test_drop1[~((test_drop1 < (testQ1 - 1.5 * testIQR)) |(test_drop1 > (testQ3 + 1.5 * testIQR))).any(axis=1)]
# test_drop2=test_drop2.reset_index(drop=True)
# print(test_drop2.shape)
# test_drop2


(66464, 22)


Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,517.868001,423244.4688,6461862.5,-492.863861,19.258232,,1.207627,1.496608,1.983400,67.158707,...,174.164764,36.087242,,53.864590,,0.027439,5,-1,0,1
1,518.020001,423244.4688,6461862.5,-493.015839,19.277550,,1.210276,1.513636,1.993067,65.859154,...,174.290649,35.655830,,6.205420,,0.039036,5,-1,0,1
2,518.172001,423244.4688,6461862.5,-493.167816,19.079758,,1.214100,1.524275,1.984950,66.941612,...,174.331513,35.488075,,6.359530,,0.040724,5,-1,0,1
3,518.324001,423244.4688,6461862.5,-493.319824,18.814838,,1.222963,1.538192,1.976260,67.594482,...,174.165421,35.927845,,6.502960,,0.037430,5,-1,0,1
4,518.476001,423244.4688,6461862.5,-493.471832,18.785326,,1.231475,1.552319,1.965034,70.590813,...,174.334335,35.799774,,6.646305,,0.027787,5,-1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66459,3175.293600,536227.5000,6794881.5,-3150.820068,8.537357,,3.992167,3.775654,2.332617,125.523697,...,99.617767,,8.5,17.475340,201.776520,0.026362,0,24,9,1
66460,3175.445600,536227.5000,6794881.5,-3150.971924,8.541270,,4.925763,4.675759,2.247326,118.711121,...,95.182381,,8.5,17.382217,196.369003,0.008995,0,24,9,1
66461,3191.101600,536227.1250,6794881.5,-3166.618896,8.495884,,4.163665,4.184748,2.467505,79.260956,...,77.925369,,8.5,31.494883,132.442749,-0.000772,0,24,9,1
66462,3191.253600,536227.1250,6794881.0,-3166.770752,8.483252,,4.385100,4.339803,2.444234,78.630280,...,76.714081,,8.5,30.027096,129.641937,-0.006190,0,24,9,1


In [9]:

# hidden_drop2 = hidden_drop1[~((hidden_drop1 < (hiddenQ1 - 1.5 * hiddenIQR)) |(hidden_drop1 > (hiddenQ3 + 1.5 * hiddenIQR))).any(axis=1)]
# hidden_drop2=hidden_drop2.reset_index(drop=True)
# print(hidden_drop2.shape)
# hidden_drop2


(42108, 22)


Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,1521.3200,433906.7813,6460000.5,-1496.281494,12.507532,,1.000478,0.931597,,128.556168,...,142.312378,,17.5,69.766304,330.911133,-0.090620,4,33,0,1
1,1523.4480,433906.8125,6460000.5,-1498.409424,11.999999,,0.974516,0.897710,,121.076324,...,149.432892,,17.5,74.583710,316.585113,-0.059700,4,33,0,1
2,1523.6000,433906.8125,6460000.5,-1498.561401,11.999999,,0.979522,0.901183,,135.356125,...,147.225769,,17.5,66.182564,331.246674,-0.059700,4,33,0,1
3,1523.7520,433906.8125,6460000.5,-1498.713379,11.999999,,0.978163,0.902526,,140.511200,...,147.876144,,17.5,61.021481,332.420471,-0.059626,4,33,0,1
4,1523.9040,433906.8125,6460000.5,-1498.865479,11.999999,,0.977485,0.901335,,136.948639,...,150.419540,,17.5,63.334373,323.817871,-0.058049,4,33,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42103,2449.3548,536112.1250,6793043.5,-2420.286621,10.289483,,2.505939,2.468475,2.289250,87.797913,...,96.275513,,8.5,45.813919,201.809769,0.021886,8,25,9,1
42104,2591.3228,536110.5625,6793040.0,-2562.190918,9.944988,,3.267919,3.191356,2.318670,124.924362,...,89.716644,,8.5,22.359880,199.252151,0.093039,1,31,9,1
42105,2591.4748,536110.5625,6793040.0,-2562.343018,9.978121,,2.851122,2.703501,2.312745,128.821487,...,89.891899,,8.5,22.957935,198.519165,0.116662,1,31,9,1
42106,2591.6268,536110.5000,6793040.0,-2562.494629,9.982432,,2.364331,2.273278,2.320783,131.541443,...,90.862244,,8.5,23.925800,197.370819,0.129587,1,31,9,1


In [10]:
#dropping categorial features replaces beforehan by encoded features
#drop2 = All_data_drop.drop(['GROUP', 'FORMATION','WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','Lithology'], axis=1)

# splitting dataset into training, test, and hidden sets
# train_prep = drop2[:train_len].copy()
# test_prep = drop2[train_len:(train_len+test_len)].copy()
# hidden_prep = drop2[(train_len+test_len):].copy()

In [11]:
# train_prep1= train_prep.copy()
# test_prep1= test_prep.copy()
# hidden_prep1= hidden_prep.copy()

In [12]:
#Inputing missing values by introducing median 
from sklearn.impute import SimpleImputer
miss = SimpleImputer(missing_values=np.nan, strategy='median')
miss.fit(train_drop2)
train_imp = miss.fit_transform(train_drop2)
train_imp=pd.DataFrame(train_imp, columns=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO',
         'GROUP_encoded',
       'FORMATION_encoded', 'WELL_encoded','Lithology_encoded'])
train_imp

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,501.9760,437642.00000,6470972.5,-476.949677,21.002375,1.175413,1.446267,1.808564,1.696347,71.916695,...,159.475830,24.342602,12.250001,27.574333,206.386459,-0.026934,6.0,-1.0,0.0,1.0
1,502.1280,437642.00000,6470972.5,-477.101654,22.255486,1.175413,1.395273,1.716676,1.655434,70.890160,...,159.490387,24.464191,12.250001,27.431999,206.386459,-0.009692,6.0,-1.0,0.0,1.0
2,502.2800,437642.00000,6470972.5,-477.253662,22.294195,1.175413,1.402481,1.590261,1.641988,71.705124,...,159.489990,24.619265,12.250001,27.431999,206.386459,0.002679,6.0,-1.0,0.0,1.0
3,502.4320,437642.00000,6470972.5,-477.405640,22.391655,1.175413,1.426419,1.433956,1.600092,73.166786,...,159.480789,24.368204,12.250001,27.431999,206.386459,0.007460,6.0,-1.0,0.0,1.0
4,502.5840,437642.00000,6470972.5,-477.557648,23.167938,1.175413,1.380926,1.279809,1.530632,73.454750,...,159.471573,24.181400,12.250001,27.431999,206.386459,0.001195,6.0,-1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481515,3140.1284,444920.34375,6421578.5,-3079.271484,8.484242,3.950208,3.946853,3.424120,2.471713,76.297417,...,76.412384,53.897419,8.500000,30.570990,134.650909,-0.017750,11.0,7.0,97.0,1.0
481516,3140.2804,444920.34375,6421578.5,-3079.423584,8.465464,3.895838,3.751034,3.197002,2.489856,70.762009,...,77.060547,53.897419,8.500000,30.535135,132.248093,-0.001506,11.0,7.0,97.0,1.0
481517,3140.4324,444920.34375,6421578.5,-3079.575439,8.455131,3.311376,3.077763,2.632942,2.492218,66.463943,...,79.642212,53.897419,8.500000,30.482046,129.424530,0.009678,11.0,7.0,97.0,1.0
481518,3140.5844,444920.34375,6421578.5,-3079.727539,8.448524,2.449606,2.242409,1.958930,2.495520,59.788101,...,82.965523,53.897419,8.500000,29.961433,127.326607,0.000727,11.0,7.0,97.0,1.0


In [13]:
#Inputing missing values by introducing median 
from sklearn.impute import SimpleImputer
miss1 = SimpleImputer(missing_values=np.nan, strategy='median')
miss1.fit(test_drop2)
test_imp = miss.fit_transform(test_drop2)
test_imp=pd.DataFrame(test_imp, columns=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO',
         'GROUP_encoded',
       'FORMATION_encoded', 'WELL_encoded','Lithology_encoded'])
test_imp

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,517.868001,423244.4688,6461862.5,-492.863861,19.258232,1.252154,1.207627,1.496608,1.983400,67.158707,...,174.164764,36.087242,12.250001,53.864590,189.010284,0.027439,5.0,-1.0,0.0,1.0
1,518.020001,423244.4688,6461862.5,-493.015839,19.277550,1.252154,1.210276,1.513636,1.993067,65.859154,...,174.290649,35.655830,12.250001,6.205420,189.010284,0.039036,5.0,-1.0,0.0,1.0
2,518.172001,423244.4688,6461862.5,-493.167816,19.079758,1.252154,1.214100,1.524275,1.984950,66.941612,...,174.331513,35.488075,12.250001,6.359530,189.010284,0.040724,5.0,-1.0,0.0,1.0
3,518.324001,423244.4688,6461862.5,-493.319824,18.814838,1.252154,1.222963,1.538192,1.976260,67.594482,...,174.165421,35.927845,12.250001,6.502960,189.010284,0.037430,5.0,-1.0,0.0,1.0
4,518.476001,423244.4688,6461862.5,-493.471832,18.785326,1.252154,1.231475,1.552319,1.965034,70.590813,...,174.334335,35.799774,12.250001,6.646305,189.010284,0.027787,5.0,-1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66459,3175.293600,536227.5000,6794881.5,-3150.820068,8.537357,1.252154,3.992167,3.775654,2.332617,125.523697,...,99.617767,54.058868,8.500000,17.475340,201.776520,0.026362,0.0,24.0,9.0,1.0
66460,3175.445600,536227.5000,6794881.5,-3150.971924,8.541270,1.252154,4.925763,4.675759,2.247326,118.711121,...,95.182381,54.058868,8.500000,17.382217,196.369003,0.008995,0.0,24.0,9.0,1.0
66461,3191.101600,536227.1250,6794881.5,-3166.618896,8.495884,1.252154,4.163665,4.184748,2.467505,79.260956,...,77.925369,54.058868,8.500000,31.494883,132.442749,-0.000772,0.0,24.0,9.0,1.0
66462,3191.253600,536227.1250,6794881.0,-3166.770752,8.483252,1.252154,4.385100,4.339803,2.444234,78.630280,...,76.714081,54.058868,8.500000,30.027096,129.641937,-0.006190,0.0,24.0,9.0,1.0


In [14]:
#Inputing missing values by introducing median 
from sklearn.impute import SimpleImputer
miss2 = SimpleImputer(missing_values=np.nan, strategy='median')
miss2.fit(hidden_drop2)
hidden_imp = miss.fit_transform(hidden_drop2)
hidden_imp=pd.DataFrame(hidden_imp, columns=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO',
         'GROUP_encoded',
       'FORMATION_encoded', 'WELL_encoded','Lithology_encoded'])
hidden_imp

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,DTC,SP,BS,ROP,DTS,DRHO,GROUP_encoded,FORMATION_encoded,WELL_encoded,Lithology_encoded
0,1521.3200,433906.7813,6460000.5,-1496.281494,12.507532,1.336012,1.000478,0.931597,2.391550,128.556168,...,142.312378,30.856518,17.5,69.766304,330.911133,-0.090620,4.0,33.0,0.0,1.0
1,1523.4480,433906.8125,6460000.5,-1498.409424,11.999999,1.336012,0.974516,0.897710,2.391550,121.076324,...,149.432892,30.856518,17.5,74.583710,316.585113,-0.059700,4.0,33.0,0.0,1.0
2,1523.6000,433906.8125,6460000.5,-1498.561401,11.999999,1.336012,0.979522,0.901183,2.391550,135.356125,...,147.225769,30.856518,17.5,66.182564,331.246674,-0.059700,4.0,33.0,0.0,1.0
3,1523.7520,433906.8125,6460000.5,-1498.713379,11.999999,1.336012,0.978163,0.902526,2.391550,140.511200,...,147.876144,30.856518,17.5,61.021481,332.420471,-0.059626,4.0,33.0,0.0,1.0
4,1523.9040,433906.8125,6460000.5,-1498.865479,11.999999,1.336012,0.977485,0.901335,2.391550,136.948639,...,150.419540,30.856518,17.5,63.334373,323.817871,-0.058049,4.0,33.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42103,2449.3548,536112.1250,6793043.5,-2420.286621,10.289483,1.336012,2.505939,2.468475,2.289250,87.797913,...,96.275513,30.856518,8.5,45.813919,201.809769,0.021886,8.0,25.0,9.0,1.0
42104,2591.3228,536110.5625,6793040.0,-2562.190918,9.944988,1.336012,3.267919,3.191356,2.318670,124.924362,...,89.716644,30.856518,8.5,22.359880,199.252151,0.093039,1.0,31.0,9.0,1.0
42105,2591.4748,536110.5625,6793040.0,-2562.343018,9.978121,1.336012,2.851122,2.703501,2.312745,128.821487,...,89.891899,30.856518,8.5,22.957935,198.519165,0.116662,1.0,31.0,9.0,1.0
42106,2591.6268,536110.5000,6793040.0,-2562.494629,9.982432,1.336012,2.364331,2.273278,2.320783,131.541443,...,90.862244,30.856518,8.5,23.925800,197.370819,0.129587,1.0,31.0,9.0,1.0


In [15]:
print(train_imp.shape)
print(test_imp.shape)
print(hidden_imp.shape)

(481520, 22)
(66464, 22)
(42108, 22)


In [19]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

from sklearn.model_selection import train_test_split
x_header=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
       'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO',
       'GROUP_encoded', 'FORMATION_encoded', 'WELL_encoded']
y_header=['Lithology_encoded']

x_train1 = train_imp[x_header]

x_test1 = test_imp[x_header]
# y_test = test_imp[y_header]


x_hidden = hidden_imp[x_header]
y_hidden = hidden_imp[y_header]

##Min-Max scaler 
scaler = MinMaxScaler()
x_train_scaled = x_train1.copy()
x_test_scaled = x_test1.copy()
x_hidden_scaled = x_hidden.copy()

x_train_scaled.iloc[:,:18] = scaler.fit_transform(x_train_scaled.iloc[:,:18])
x_test_scaled.iloc[:,:18] = scaler.transform(x_test_scaled.iloc[:,:18])
x_hidden_scaled.iloc[:,:18] = scaler.transform(x_hidden_scaled.iloc[:,:18])

traintest1=pd.concat((train_imp,test_imp)).reset_index(drop=True) 
# x = traintest[x_header]
y = traintest1[y_header]

traintest2=pd.concat((x_train_scaled,x_test_scaled)).reset_index(drop=True) 
x = traintest2[x_header]
# y = traintest[y_header]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 12)

In [25]:
y_train['Lithology_encoded'].unique()

array([1.])

In [29]:
traintest1['Lithology_encoded'].unique()

array([1.])

In [20]:
#Supervised Algorithms
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsRegressor
from pprint import pprint
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
import xgboost
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
#Comparing base models accuracies by using k-fold cross validation - 10 folds

from sklearn.model_selection import cross_val_score

# new_train = pd.concat((x_train_scaled, pd.DataFrame(y_train, columns=["Lithology_encoded"])), axis=1)

# #Randomly sampling data
# sampled_train = new_train.sample(n=100000, random_state=0)


# #Spliting training data
# x_train_sam = sampled_train.drop(["Lithology_encoded"], axis=1)
# y_train_sam = sampled_train["Lithology_encoded"]



  from pandas import MultiIndex, Int64Index


In [21]:
estimator = LogisticRegression(C=1e-3, solver='saga', max_iter=4000)
score = cross_val_score(estimator, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score

Traceback (most recent call last):
  File "C:\Users\Ayori\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ayori\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1374, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0

Traceback (most recent call last):
  File "C:\Users\Ayori\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ayori\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1374, in fit
    raise ValueError("This solver needs samples of at least 2 classes"
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0

Traceb

nan

In [22]:
estimator1 = DecisionTreeClassifier()
score1 = cross_val_score(estimator1, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score1

1.0

In [None]:
estimator2 = RandomForestClassifier()
score2 = cross_val_score(estimator2, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score2

In [None]:
estimator3 = XGBClassifier()
score3 = cross_val_score(estimator3, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score3

In [None]:
estimator4 = GradientBoostingClassifier()
score4 = cross_val_score(estimator4, x_train y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score4


In [None]:
from sklearn.neighbors import KNeighborsClassifier
estimator5 =KNeighborsClassifier()
score5 = cross_val_score(estimator5, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score5


In [None]:
from sklearn.neighbors import KNeighborsClassifier
estimator6 =SVC()
score6 = cross_val_score(estimator6, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score6


In [None]:
from catboost import CatBoostClassifier
estimator7 =CatBoostClassifier()
score7 = cross_val_score(estimator7, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score7


In [None]:
from lightgbm import LGBMClassifier
estimator8 =LGBMClassifier()
score8 = cross_val_score(estimator8, x_train, y_train.values.ravel(), cv=10, scoring='f1_weighted').mean()
score8


In [None]:

# estimators = [LogisticRegression(C=1e-3, solver='saga', max_iter=4000), DecisionTreeClassifier(), RandomForestClassifier(), XGBClassifier(), SVC(), GradientBoostingClassifier()]
# f1_train_scores = []
# i=1
# for estimator in estimators:
#   f1_score_i = cross_val_score(estimator, x_train_sam, y_train_sam.values.ravel(), cv=10, scoring='f1_weighted').mean()
#   f1_train_scores.append(f1_score_i)

#   print('---------------MODEL {} ACCURACY {}---------------'.format(i, f1_score_i))
#   i += 1
# #print(f1_train_scores)

In [None]:
# #Merging train data
# new_train = pd.concat((x_train_scaled, pd.DataFrame(y_train, columns=["Lithology_encoded"])), axis=1)

# #Randomly sampling data
# sampled_train = new_train.sample(n=819358, random_state=0)


# #Spliting training data
# x_train_sam = sampled_train.drop(["Lithology_encoded"], axis=1)
# y_train_sam = sampled_train["Lithology_encoded"]

# new_test = pd.concat((x_test_scaled, pd.DataFrame(y_test, columns=["Lithology_encoded"])), axis=1)

# #Randomly sampling data
# sampled_test = new_test.sample(n=120000, random_state=None)