In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from tqdm import tqdm
import time

# Read the three dataset splits from disk.
# The training file is semicolon-separated; the two test files are comma-separated.
train = pd.read_csv("CSV_train.csv", sep=';', low_memory=False)
test = pd.read_csv("CSV_test.csv", sep=',', low_memory=False)
hidden = pd.read_csv("CSV_hidden_test.csv", sep=',', low_memory=False)

KeyboardInterrupt: 

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the frame read from CSV_test.csv.
test.info()

In [None]:
# Remember how many rows the train and test frames contribute so the
# combined frame can be cut back into its splits by row position later.
train_len = len(train)
test_len = len(test)

# Stack train, test and hidden rows into one frame with a fresh 0..n-1 index.
All_data = pd.concat([train, test, hidden], ignore_index=True)

# Human-readable name for each raw lithology code.
lithology_keys = {
    30000: 'Sandstone',
    65030: 'Sandstone/Shale',
    65000: 'Shale',
    80000: 'Marl',
    74000: 'Dolomite',
    70000: 'Limestone',
    70032: 'Chalk',
    88000: 'Halite',
    86000: 'Anhydrite',
    99000: 'Tuff',
    90000: 'Coal',
    93000: 'Basement',
}

# Add a text label column next to the numeric lithology code.
lithology_code = All_data['FORCE_2020_LITHOFACIES_LITHOLOGY']
All_data['Lithology'] = lithology_code.map(lithology_keys)
All_data

In [None]:
# Remove the columns with a high share of missing values (plus the label-confidence column).
drop_cols = [
    'SGR', 'ROPA', 'RXO', 'MUDWEIGHT', 'DCAL', 'RMIC',
    'FORCE_2020_LITHOFACIES_CONFIDENCE',
]
All_data_drop = All_data.drop(columns=drop_cols)


In [None]:
# Display the combined frame after the drop_cols columns were removed.
All_data_drop

In [None]:
# drop2

In [None]:
# List the remaining column names; the imputation step depends on this layout.
All_data_drop.columns

In [None]:
# Imputing missing values: median for numeric columns, most frequent value for the rest
from sklearn.impute import SimpleImputer

# Expected column layout after the drop step. Kept as a reference (and for any
# later cell that reads these names); the imputed frames below take their
# column names from the data itself so values can never be mislabelled.
numeric_header=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC',
       'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC',
       'SP', 'BS', 'ROP', 'DTS', 'DRHO', 'FORCE_2020_LITHOFACIES_LITHOLOGY'
       ]
categorical_header=['WELL','GROUP', 'FORMATION','Lithology']

# Split the frame by dtype: numeric columns vs. everything else.
numeric=All_data_drop.select_dtypes(include=[np.number])
categorical= All_data_drop.select_dtypes(exclude=[np.number])

# NOTE(review): both imputers are fitted on the concatenated train/test/hidden
# rows, so test-set statistics influence the fill values. Consider fitting on
# the training rows only.
# NOTE(review): the numeric columns include the label column
# FORCE_2020_LITHOFACIES_LITHOLOGY, so a missing label would be median-filled.
# Confirm that the label has no missing values.
miss = SimpleImputer(missing_values=np.nan, strategy='median')
numeric_imp = pd.DataFrame(
    miss.fit_transform(numeric),   # fit_transform already fits; no separate fit() needed
    columns=numeric.columns,
    index=numeric.index,
)

miss2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
categorical_imp = pd.DataFrame(
    miss2.fit_transform(categorical),
    columns=categorical.columns,
    index=categorical.index,
)

# Put the numeric and categorical parts back side by side.
frames = [numeric_imp,categorical_imp]
result = pd.concat(frames,axis=1, join='inner')
result

In [None]:
# Encoding categorical variables as integer codes.
# GROUP / FORMATION / WELL are plain features, so arbitrary (alphabetical)
# category codes are fine for them.
for source_col in ('GROUP', 'FORMATION', 'WELL'):
    result[source_col + '_encoded'] = result[source_col].astype('category').cat.codes

# The target needs a FIXED class index, not cat.codes: cat.codes numbers the
# classes by sorted raw code (30000, 65000, 65030, 70000, ...), while the
# penalty matrix used by score() is looked up as A[true_class, predicted_class].
# NOTE(review): the order below is the FORCE 2020 competition class order, which
# is assumed to be the row/column order of penalty_matrix.npy. Confirm against
# the matrix file actually used.
LITHOLOGY_CLASS_INDEX = {30000: 0,    # Sandstone
                         65030: 1,    # Sandstone/Shale
                         65000: 2,    # Shale
                         80000: 3,    # Marl
                         74000: 4,    # Dolomite
                         70000: 5,    # Limestone
                         70032: 6,    # Chalk
                         88000: 7,    # Halite
                         86000: 8,    # Anhydrite
                         99000: 9,    # Tuff
                         90000: 10,   # Coal
                         93000: 11}   # Basement

lithology_index = result['FORCE_2020_LITHOFACIES_LITHOLOGY'].map(LITHOLOGY_CLASS_INDEX)

# Fail loudly on codes outside the known set instead of assigning them an
# arbitrary class (e.g. a median-imputed value that lies between two real codes).
unmapped_rows = lithology_index.isna()
if unmapped_rows.any():
    unknown_codes = sorted(result.loc[unmapped_rows, 'FORCE_2020_LITHOFACIES_LITHOLOGY'].unique())
    raise ValueError("Unknown lithology codes, cannot encode: {}".format(unknown_codes))

# int8 matches the dtype cat.codes produced before.
result['Lithology_encoded'] = lithology_index.astype('int8')

In [None]:
#dropping categorical features that were replaced beforehand by encoded features
# drop2 = All_data_drop.drop(['GROUP', 'FORMATION','WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','Lithology'], axis=1)

# # splitting dataset into training, test, and hidden sets
# train_prep = drop2[:train_len].copy()
# test_prep = drop2[train_len:(train_len+test_len)].copy()
# hidden_prep = drop2[(train_len+test_len):].copy()

In [None]:
# train_prep1= train_prep.copy()
# test_prep1= test_prep.copy()
# hidden_prep1= hidden_prep.copy()

In [None]:
# Cut the combined, imputed frame back into its three source splits by row
# position: first the training rows, then the test rows, then the hidden rows.
test_end = train_len + test_len
train_imp = result.iloc[:train_len].copy()
test_imp = result.iloc[train_len:test_end].copy()
hidden_imp = result.iloc[test_end:].copy()

In [None]:
# Show (rows, columns) of each split, in train / test / hidden order.
for split_frame in (train_imp, test_imp, hidden_imp):
    print(split_frame.shape)

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

# Model inputs: the 18 numeric log/position columns followed by the three
# integer-encoded categorical columns.
x_header = [
    'DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'RSHA', 'RMED', 'RDEP',
    'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DRHO',
    'GROUP_encoded', 'FORMATION_encoded', 'WELL_encoded',
]
# Target column (selected with a list, so each y_* is a one-column DataFrame).
y_header = ['Lithology_encoded']

# Features / target for every split.
x_train, y_train = train_imp[x_header], train_imp[y_header]
x_test, y_test = test_imp[x_header], test_imp[y_header]
x_hidden, y_hidden = hidden_imp[x_header], hidden_imp[y_header]

## Min-Max scaler
scaler = MinMaxScaler()

# Work on copies so the unscaled frames stay available.
x_train_scaled, x_test_scaled, x_hidden_scaled = (
    x_train.copy(), x_test.copy(), x_hidden.copy()
)

# Only the first 18 columns (everything before the *_encoded columns) are scaled.
# The scaler is fitted on the training split and re-used for the other two.
x_train_scaled.iloc[:, :18] = scaler.fit_transform(x_train_scaled.iloc[:, :18])
x_test_scaled.iloc[:, :18] = scaler.transform(x_test_scaled.iloc[:, :18])
x_hidden_scaled.iloc[:, :18] = scaler.transform(x_hidden_scaled.iloc[:, :18])

In [None]:
# Display the training features after min-max scaling of the first 18 columns.
x_train_scaled

In [None]:
# Display the training target (the one-column Lithology_encoded frame).
y_train

In [None]:
#  import numpy as np
#     matrix_path = '/content/drive/MyDrive/Thesis_data/penalty_matrix.npy'
#     A = np.load(matrix_path)
#     S = 0.0
#     y_true = y_true.astype(int)
#     y_pred = y_pred.astype(int)
#     for i in range(0, y_true.shape[0]):
#         S -= A[y_true[i], y_pred[i]]
#     return S/y_true.shape[0]
    
# # Confusion Matrix Function

# def confusion_matrix(y_true, y_pred):
      
#     """Plots a confusion matrix normalized by the number of predictions a particular
#     machine learning algorithm has. By normalizing we look at the number of predictions
#     the model gets right.
#     Parameters
#     ----------
#     y_true: list
#       The actual lithologies given by the datasets provider.
#     y_pred: list
#       The predicted lithofacies obtained by a particular machine learning model.
#     Returns

In [None]:
# Penalty matrix loaded from disk; score() looks it up as A[true_class, predicted_class].
A = np.load('penalty_matrix.npy')
def score(y_true, y_pred, penalty_matrix=None):
    """Return the mean negated penalty of a set of class predictions.

    For every sample the penalty ``penalty_matrix[true_class, predicted_class]``
    is looked up; the result is minus the average of those penalties, so values
    closer to zero mean cheaper mistakes.

    Parameters
    ----------
    y_true : array-like
        True class indices. Any shape is accepted and flattened, so an
        ``(n, 1)`` column such as ``DataFrame.values`` works like a 1-D array.
    y_pred : array-like
        Predicted class indices, same number of elements as ``y_true``.
    penalty_matrix : array-like, optional
        Square matrix indexed as ``[true_class, predicted_class]``. Defaults to
        the module-level matrix ``A``.

    Returns
    -------
    float
        ``-mean(penalty_matrix[y_true, y_pred])``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Fall back to the notebook-level matrix only when none is supplied.
    matrix = A if penalty_matrix is None else np.asarray(penalty_matrix)

    # Flatten so column vectors do not turn the result into a 1-element array.
    true_idx = np.asarray(y_true).astype(int).ravel()
    pred_idx = np.asarray(y_pred).astype(int).ravel()

    if true_idx.shape[0] != pred_idx.shape[0]:
        raise ValueError(
            "y_true and y_pred must have the same number of samples "
            "({} != {})".format(true_idx.shape[0], pred_idx.shape[0])
        )
    if true_idx.shape[0] == 0:
        raise ValueError("cannot score an empty set of predictions")

    # One vectorised lookup replaces the per-sample Python loop.
    return -float(np.mean(matrix[true_idx, pred_idx]))

In [None]:
#Supervised Algorithms
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from pprint import pprint
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
import xgboost
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
#Comparing base models accuracies by using k-fold cross validation - 10 folds

from sklearn.model_selection import cross_val_score
# Re-attach the target to the scaled features so both are sampled together.
new_train = pd.concat((x_train_scaled, pd.DataFrame(y_train, columns=["Lithology_encoded"])), axis=1)

# Randomly sampling data (capped at the number of rows actually available,
# so a smaller training set does not make .sample() raise)
sampled_train = new_train.sample(n=min(50000, len(new_train)), random_state=0)


# Splitting the sampled training data back into features and target
x_train_sam = sampled_train.drop(["Lithology_encoded"], axis=1)
y_train_sam = sampled_train["Lithology_encoded"]

model_xgb = XGBClassifier(n_estimators=1000, max_depth=4,
                                 booster='gbtree', objective='multi:softprob',
                                 learning_rate=0.075, random_state=42,
                                 subsample=1, colsample_bytree=1,
                                 tree_method='gpu_hist', predictor='gpu_predictor',
                                 verbose=2020, reg_lambda=1500)

# The model is trained on min-max scaled features, so the early-stopping
# evaluation set must be scaled the same way (it previously used the unscaled
# x_test, which made the monitored metric meaningless).
# NOTE(review): early stopping is driven by the open test set, which is also
# used for reporting below. A separate validation split would avoid
# optimistic test scores.
model_xgb.fit(x_train_sam, y_train_sam.values.ravel(), early_stopping_rounds=100, eval_set=[(x_test_scaled, y_test)], verbose=100)

# Predict class labels for every split from the scaled features.
train_pred_xgb = model_xgb.predict(x_train_scaled)
open_pred_xgb = model_xgb.predict(x_test_scaled)
hidden_pred_xgb = model_xgb.predict(x_hidden_scaled)
#Printing Reports 
#Printing Reports 



In [None]:
def print_set_report(set_name, y_true, y_pred):
    """Print RMSE, penalty-matrix score and classification report for one split.

    Parameters
    ----------
    set_name : str
        Label used in the printed lines, e.g. ``'Train'``, ``'Open'``, ``'Hidden'``.
    y_true : pandas.DataFrame or pandas.Series
        True encoded lithology classes of the split.
    y_pred : array-like
        Predicted encoded lithology classes for the same rows.
    """
    print('-----------------------{} SET REPORT---------------------'.format(set_name.upper()))
    # NOTE(review): RMSE is computed on class indices, which have no natural
    # ordering, so the number is only a rough sanity check.
    print("{} set RMSE:".format(set_name), np.sqrt(mean_squared_error(y_true, y_pred)))
    print('{} set penalty matrix score:'.format(set_name), score(y_true.values, y_pred))
    print('{} set report:'.format(set_name), classification_report(y_true, y_pred))


# The train section used to be labelled "Open set ..." (copy-paste slip);
# each split now prints under its own name.
print_set_report('Train', y_train, train_pred_xgb)
print_set_report('Open', y_test, open_pred_xgb)
print_set_report('Hidden', y_hidden, hidden_pred_xgb)

In [None]:


# def grid_search(model):
#     params = {'max_depth': [3, 6, 10, 15],
#               'learning_rate': [0.01, 0.1, 0.2],
#               'subsample': np.arange(0.5, 1.0, 0.1),
#               'colsample_bytree': np.arange(0.5, 1.0, 0.1),
#               'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
#               'n_estimators': [250,500,750],
#               'num_class': [10]
#               }
#     model_cv = model_selection.GridSearchCV(estimator=model, param_grid=params,
#                                           scoring='f1_weighted', verbose=10, n_jobs=1, cv=10)
#     model_cv.fit(x_train_sam, y_train_sam)

#     print("Best score is: {}".format(model_cv.best_score_))
#     print("Tuned Model Parameter: {}".format(model_cv.best_params_))
    


In [None]:
# grid_search(model_xgb)

In [None]:
# #Supervised Algorithms
# from sklearn import model_selection
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score
# from sklearn.neighbors import KNeighborsRegressor
# from pprint import pprint
# from sklearn.model_selection import StratifiedKFold
# from sklearn.naive_bayes import GaussianNB
# import xgboost
# from xgboost import XGBClassifier
# from sklearn.neighbors import KNeighborsClassifier
# #Comparing base models accuracies by using k-fold cross validation - 10 folds

# from sklearn.model_selection import cross_val_score

# #Sampling Standardized Training Data to Optimize Time - DEPLOYING BASE MODEL
# #Merging train data
# new_train = pd.concat((x_train_scaled, pd.DataFrame(y_train, columns=["Lithology_encoded"])), axis=1)

# #Randomly sampling data
# sampled_train = new_train.sample(n=819358, random_state=0)


# #Spliting training data
# x_train_sam = sampled_train.drop(["Lithology_encoded"], axis=1)
# y_train_sam = sampled_train["Lithology_encoded"]

# new_test = pd.concat((x_test_scaled, pd.DataFrame(y_test, columns=["Lithology_encoded"])), axis=1)

# #Randomly sampling data
# sampled_test = new_test.sample(n=120000, random_state=None)


# #Spliting test data
# x_test_sam = sampled_test.drop(["Lithology_encoded"], axis=1)
# y_test_sam = sampled_test["Lithology_encoded"]
# model_xgb = XGBClassifier()

# model_xgb.fit(x_train_sam, y_train_sam.values.ravel(), early_stopping_rounds=100, eval_set=[(x_test_sam, y_test_sam)], verbose=100)

# train_pred_xgb = model_xgb.predict(x_train_sam)
# open_pred_xgb = model_xgb.predict(x_test_sam)
# hidden_pred_xgb = model_xgb.predict(x_hidden)
# #Printing Reports 

# print('-----------------------TRAIN SET REPORT---------------------')
# print("Open set RMSE:", np.sqrt(mean_squared_error(y_train, train_pred_xgb)))
# print('Open set penalty matrix score:', score(y_train.values, train_pred_xgb))
# print('Open set report:', metrics.classification_report(y_train, train_pred_xgb))
# print('-----------------------OPEN SET REPORT---------------------')
# print("Open set RMSE:", np.sqrt(mean_squared_error(y_test, open_pred_xgb)))
# print('Open set penalty matrix score:', score(y_test.values, open_pred_xgb))
# print('Open set report:', metrics.classification_report(y_test, open_pred_xgb))
# print('-----------------------HIDDEN SET REPORT---------------------')
# print("Hidden set RMSE:", np.sqrt(mean_squared_error(y_hidden, hidden_pred_xgb)))
# print('Hidden set penalty matrix score:', score(y_hidden.values, hidden_pred_xgb))
# print('Hidden set report:', metrics.classification_report(y_hidden, hidden_pred_xgb))