In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [163]:
df = pd.read_csv('AASG_Thermed_AllTempsThicksConds.csv', low_memory=False)

In [164]:
def remove_outliers_iqr(data, column):
    # Calculate the first and third quartiles
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    
    # Calculating the IQR (Interquartile Range)
    IQR = Q3 - Q1
    
    # Defining the lower and upper bounds to identify outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Creating masks for outliers and non-outliers
    outliers_mask = (data[column] < lower_bound) | (data[column] > upper_bound)
    # Creating a mask for non-outliers
    non_outliers_mask = (data[column] >= lower_bound) & (data[column] <= upper_bound)
    
    return data[non_outliers_mask].copy(), outliers_mask

In [165]:
df = remove_outliers_iqr(df, 'HeatFlow')[0]

In [166]:
df = df.reset_index(drop=True)

In [167]:

# Forming X and Y
X = np.transpose(np.array([df.LatDegree,
                                df.LongDegree,
                                df.MeasureDepth_m,
                                df.SurfTemp]))
Y = df.CorrBHT.values
# Adding Geological Layer information to X
layers = df.iloc[:,52:101].values
conds = df.iloc[:,101:150].values
mult = np.multiply(layers,conds)
np.nan_to_num(mult, 0)
X = np.concatenate((X, mult),axis=1)

In [168]:
df2 = pd.read_csv('clean_new_well_data_fixed.csv')
num_sample=10000
sampled_df2 = df2.sample(num_sample)
sampled_df2.reset_index(inplace=True,drop=True)
sampled_df2.head()

Unnamed: 0,id,depth,temp,lat,lon,corrtemp
0,4704701120,1175.3088,23.888889,37.478631,-81.862642,29.561902
1,4705900811,1144.6002,41.566667,37.572413,-81.844652,47.03946
2,4708509707,1127.3028,24.116667,39.22895,-81.061086,29.476681
3,4705900805,1030.6812,34.088889,37.904429,-82.169272,38.81893
4,4708300103,2684.2212,91.277778,38.707282,-79.969042,106.7889


In [169]:
# Interpolating Geological Layer information for X
lat_to_interpolate = sampled_df2.lat
lon_to_interpolate = sampled_df2.lon
layers = df.iloc[:,52:101].values
conds = df.iloc[:,101:150].values
mult = np.multiply(layers,conds)
np.nan_to_num(mult, 0)

f = open("optim_result.out", "r")
lines = f.readlines()

optimal_neigh = []
optimal_width = []
for line in lines:
    optimal_neigh.append(line.split(',')[0][0])
    optimal_width.append(line.split(',')[1])
optimal_neigh = np.array(optimal_neigh).astype('int')
optimal_width = np.array(optimal_width).astype('float')

# Predicting 49 layers information for each sampled_df2 lat and lon
from sklearn.neighbors import KNeighborsRegressor
predicted_mults = []
for i in range(0,49):
    def gaussian_kernel(distances):
                kernel_width = optimal_width[i]
                weights = np.exp(-(distances**2)/kernel_width)
                return weights
    knn = KNeighborsRegressor(n_neighbors=optimal_neigh[i],weights=gaussian_kernel)
    #knn = KNeighborsRegressor(n_neighbors=1,weights=gaussian_kernel)
    knn.fit(np.transpose(np.array([df.LatDegree, df.LongDegree])), mult[:,i])
    y_pred = knn.predict(np.transpose(np.array([sampled_df2.lat, sampled_df2.lon])))
    predicted_mults.append(y_pred)
    
predicted_mults = np.transpose(np.array(predicted_mults))

# Predicting T_SURF
def gaussian_kernel(distances):
            kernel_width = 2.598
            weights = np.exp(-(distances**2)/kernel_width)
            return weights
knn = KNeighborsRegressor(n_neighbors=1,weights=gaussian_kernel)
knn.fit(np.transpose(np.array([df.LatDegree, df.LongDegree])), df.SurfTemp)
predicted_tsurf = knn.predict(np.transpose(np.array([sampled_df2.lat, sampled_df2.lon])))

In [170]:
# Forming X and Y
new_X = np.transpose(np.array([sampled_df2.lat,
                                sampled_df2.lon,
                                sampled_df2.depth,
                                predicted_tsurf]))
new_X = np.concatenate((new_X, predicted_mults),axis=1)
new_Y = sampled_df2.corrtemp.values

In [171]:
import sklearn.metrics as m

In [172]:
def Rftest():
    model = RandomForestRegressor(max_depth=10, n_estimators=50)
    model.fit(X, Y)
    y_pred = model.predict(new_X)
    y_test = new_Y
    std = (np.nanstd(abs(y_test-y_pred)))
    return m.mean_absolute_error(y_test, y_pred), m.mean_squared_error(y_test, y_pred), std

In [173]:
mae, mse, std = Rftest()
print(mae, mse, std) 

7.595364895715875 91.99430014470853 5.8570241800453156
