# Load Data (Prepare the Test Dataset)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Render figures at 150 dpi.
mpl.rcParams.update({'figure.dpi': 150})

# df  : table read from the Excel workbook; later cells read its LatDegree,
#       LongDegree, HeatFlow, SurfTemp and 'T<n>' columns.
# df2 : table read from the CSV; later cells read its lat, lon and depth columns.
df = pd.read_excel(io='AASG_Thermed_AllThicksAndConds.xlsx')
df2 = pd.read_csv(filepath_or_buffer='clean_new_well_data_fixed.csv')

In [2]:
def outlierDrop(df, df_toCheck, std_cut_off):
    """Drop rows whose values fall outside mean +/- std_cut_off * std.

    The columns named in ``df_toCheck`` are processed one after another, and
    the mean / standard deviation of each column are computed on the rows that
    survived the previous columns. Both bounds are strict, so values sitting
    exactly on a bound (and NaN values, which compare False) are dropped too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    df_toCheck : iterable of str
        Names of the columns to screen for outliers.
    std_cut_off : float
        Number of standard deviations allowed on each side of the mean.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.

    Side effects
    ------------
    Prints the number of rows removed.
    """
    rows_before = len(df)
    kept = df
    for column in df_toCheck:
        centre = kept[column].mean()
        half_width = kept[column].std() * std_cut_off
        lower, upper = centre - half_width, centre + half_width
        inside = kept[column].gt(lower) & kept[column].lt(upper)
        kept = kept[inside]
    print("numbere of outliers removed: ", rows_before - len(kept))
    return kept
# Wider, multi-column filter kept for reference only (not applied):
#df = outlierDrop(df,['CorrBHT', 'HeatFlow','MeasureDepth_m'],4)

# Keep rows whose HeatFlow is strictly within 3 standard deviations of the
# mean, then keep only strictly positive HeatFlow values. The index is
# renumbered from 0 so that row labels and row positions coincide afterwards.
df = outlierDrop(df, ['HeatFlow'], 3)
df = df.loc[df['HeatFlow'] > 0].reset_index(drop=True)


# Find Closest Points for Estimation with Phys Model

# Number of rows drawn from df2 for the comparison.
num_sample=10000
# A fixed seed makes the draw (and every downstream number) reproducible on a
# fresh "Restart & Run All"; change the seed to obtain a different sample.
sample_seed = 42
sampled_df2 = df2.sample(num_sample, random_state=sample_seed)

import math
def roundup(x):
    """Round ``x`` up to the next multiple of 10 and return it as an int.

    Values that are already a multiple of 10 are returned unchanged.
    NOTE(review): this helper is not called in the visible cells; the depth
    column is rounded to the *nearest* 10 with ``np.round`` instead.
    """
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)

# Snap every depth to the nearest multiple of 10 and store it as an integer,
# so that depth / 10 is a whole number that can later be turned into a
# 'T<n>' column name (T1~T500).
sampled_df2['depth'] = np.round(sampled_df2['depth'], decimals=-1).astype('int')

sampled_df2

from sklearn.neighbors import NearestNeighbors

# For every sampled row, find the single closest row of df in (lat, lon) space.
# NOTE(review): the distance is computed directly on degree values with the
# estimator's default metric — confirm that this is acceptable for the region.
model_coords = np.column_stack((df.LatDegree, df.LongDegree))
well_coords = np.column_stack((sampled_df2.lat, sampled_df2.lon))
neigh = NearestNeighbors(n_neighbors=1).fit(model_coords)
pred_indices = neigh.kneighbors(well_coords, return_distance=False)

# Store the matching df row number next to each sampled row.
sampled_df2.insert(loc=6, column='closest', value=pred_indices)

# Renumber the sampled rows 0..n-1; the previous index is kept as an 'index' column.
sampled_df2 = sampled_df2.reset_index()

sampled_df2

# For each sampled row, take the matched df row ('closest') and read the
# column 'T<depth/10>' from it: that value is the physics-model estimate at
# the sampled (already rounded) depth.
predicted_values_by_physics_model = np.array([
    df.loc[sampled_df2.closest[k]]['T' + str(int(sampled_df2.depth[k] / 10))]
    for k in range(num_sample)
])

sampled_df2.insert(loc=8, column='physics_pred', value=predicted_values_by_physics_model)

sampled_df2

# Add Geological Information

# - Predict: Geological information of sampled_df2, cond1*thick1 + ...

lat_to_interpolate = sampled_df2.lat
lon_to_interpolate = sampled_df2.lon

# Two position-based slices of 49 columns each are multiplied element-wise.
# NOTE(review): presumably columns 52-100 hold layer thicknesses and columns
# 101-149 the matching conductivities — confirm against the workbook layout.
layers = df.iloc[:,52:101].values
conds = df.iloc[:,101:150].values
mult = np.multiply(layers,conds)

# Replace NaN products with 0. The previous call, np.nan_to_num(mult, 0),
# passed the 0 as the `copy` flag (not as a fill value) and discarded the
# return value, so it only worked through an accidental in-place update.
# Keeping copy=False and assigning the result gives the same cleaned array
# without relying on that accident.
mult = np.nan_to_num(mult, copy=False)
mult.shape

# Read optimal values: each non-empty line of optim_result.out is
# "<neighbour count>,<kernel width>[,...]".
optimal_neigh = []
optimal_width = []

# The context manager closes the file even if reading fails.
with open("optim_result.out", "r") as f:
    lines = f.readlines()

for line in lines:
    if not line.strip():
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        continue
    fields = line.split(',')
    # Keep the whole leading run of digits of the first field. Taking only
    # its first character (as before) truncated any count >= 10 to a single
    # digit; single-digit counts and values such as "5.0" parse as before.
    count_text = fields[0].strip()
    n_leading_digits = len(count_text) - len(count_text.lstrip('0123456789'))
    optimal_neigh.append(count_text[:n_leading_digits])
    optimal_width.append(fields[1])
optimal_neigh = np.array(optimal_neigh).astype('int')
optimal_width = np.array(optimal_width).astype('float')

# Predict 49 layers information for each sampled_df2 lat and lon
from sklearn.neighbors import KNeighborsRegressor


def _make_gaussian_kernel(kernel_width):
    """Return a weight function computing exp(-d**2 / kernel_width)."""
    def _kernel(distances):
        return np.exp(-(distances**2)/kernel_width)
    return _kernel


# The coordinate matrices are the same for every layer, so build them once.
layer_train_coords = np.column_stack((df.LatDegree, df.LongDegree))
layer_query_coords = np.column_stack((sampled_df2.lat, sampled_df2.lon))

# One distance-weighted KNN regressor per layer, each using the neighbour
# count and kernel width read for that layer.
per_layer_predictions = []
for layer_idx in range(49):
    knn = KNeighborsRegressor(
        n_neighbors=optimal_neigh[layer_idx],
        weights=_make_gaussian_kernel(optimal_width[layer_idx]),
    )
    knn.fit(layer_train_coords, mult[:, layer_idx])
    per_layer_predictions.append(knn.predict(layer_query_coords))

# Shape (number of sampled rows, 49): one column per layer.
predicted_mults = np.column_stack(per_layer_predictions)

# Predict T_SURF
def gaussian_kernel(distances):
    """Map neighbour distances to weights exp(-d**2 / 2.598).

    A distance of 0 gives weight 1; the weight decays towards 0 as the
    distance grows. The result has the same shape as ``distances``.
    """
    width = 2.598
    squared = distances ** 2
    return np.exp(-squared / width)
# Fit a 1-neighbour regressor on df's (lat, lon) -> SurfTemp and evaluate it
# at every sampled location.
tsurf_train_coords = np.column_stack((df.LatDegree, df.LongDegree))
tsurf_query_coords = np.column_stack((sampled_df2.lat, sampled_df2.lon))
knn = KNeighborsRegressor(n_neighbors=1, weights=gaussian_kernel)
knn.fit(tsurf_train_coords, df.SurfTemp)
predicted_tsurf = knn.predict(tsurf_query_coords)

numbere of outliers removed:  67


# Load XGB and test

In [4]:
import pickle
# load the model from disk
# NOTE: unpickling executes code embedded in the file, so only load
# 'xgbSaved.pkl' when it comes from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open('xgbSaved.pkl', 'rb') as model_file:
    gbm = pickle.load(model_file)

In [5]:
# Feature matrix, one row per sampled location:
# lat, lon, depth, predicted surface temperature, then the 49 per-layer values.
base_features = np.column_stack((
    sampled_df2.lat,
    sampled_df2.lon,
    sampled_df2.depth,
    predicted_tsurf,
))
X_test = np.hstack((base_features, predicted_mults))
y_pred = gbm.predict(X_test)

In [6]:
# Show the model output: one predicted value per row of X_test.
y_pred

array([36.3388  , 29.168438, 30.444153, ..., 63.45916 , 45.91786 ,
       26.305988], dtype=float32)