In [23]:
from pathlib import Path

import lasio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sweetviz as sv

In [24]:
def las_read(file):
    """Load one LAS file and hand back its curves and its header.

    Parameters
    ----------
    file : path or file-like accepted by ``lasio.read``.

    Returns
    -------
    (data, header)
        ``data`` is the frame produced by ``las.df()`` with its index moved
        into an ordinary leading column (presumably the depth curve - confirm
        against the files), ``header`` is ``las.header`` as provided by lasio.
    """
    las_file = lasio.read(file)
    file_header = las_file.header
    # reset_index() turns the index into a regular column so it can be used as a merge key
    curves = las_file.df().reset_index()
    return curves, file_header

In [25]:
# Importing all the files
# Every las_read call returns (curve DataFrame, LAS header). Frames are
# numbered by well: 1 = TRM2, 2 = CCS1, 3 = VW1, 4 = CCS2; the hXY names keep
# the matching headers.
# TRM2
# NOTE(review): both TRM2 files are Elan outputs, so they presumably share
# curve names - that would explain the `_x` suffixes selected after the merge.
df1_sonic, h11 = las_read('TRM2_data/1202125650_20190118P_TRM2_Elan_Petrophysical_Analysis_w_EC_logs.las')
df1_elan, h12 = las_read('TRM2_data/1202125650_20190118P_TRM2_Elan_Petrophysical_Analysis.las')

# CCS1
df2_sonic, h21 = las_read('CCS1_data/1211523415_20090426m_Sonic_P_and_S_Output_7240ft_350ft.las')
df2_elan, h22 = las_read('CCS1_data/1211523415_2020_CCS1_Elan_CoreCal.las')
# NOTE(review): df2_rho is loaded here but not referenced by any later cell in view.
df2_rho, h23 = las_read('CCS1_data/1211523415_20200727_CCS1_Compiled_Raw_Logs_Petrel.las')

# VW1
df3_sonic, h31 = las_read('VW1_data/1211523460_20101116_VW1_PandS_5250-7228.las')
df3_elan, h32 = las_read('VW1_data/1211523460_2020_VW1_Elan_CoreCal.las')

# CCS2
df4_sonic, h41 = las_read('CCS2_data/1211523713_20150503_Sonic_Intermediate_Anisotropy_PnS_350_5120ft.las')
df4_elan, h42 = las_read('CCS2_data/1211523713_20150529_ELAN_PetrophysicalAnalysis_344-7200ft.las')

In [26]:
# Merging Datasets
# Each well's Elan frame is joined to its sonic frame on the depth column.
# The default (inner) join keeps only depths present in both files.

# TRM2: both frames key on 'MD'; rename to 'DEPT' afterwards to match the other wells
merged_df1 = df1_elan.merge(df1_sonic, on='MD').rename(columns={'MD': 'DEPT'})

# CCS1 / VW1 / CCS2: the Elan key is renamed from 'MD' to 'DEPT' first so it
# lines up with the sonic frames' 'DEPT' column
df2_elan_renamed = df2_elan.rename(columns={'MD': 'DEPT'})
merged_df2 = df2_elan_renamed.merge(df2_sonic, on='DEPT')

df3_elan_renamed = df3_elan.rename(columns={'MD': 'DEPT'})
merged_df3 = df3_elan_renamed.merge(df3_sonic, on='DEPT')

df4_elan_renamed = df4_elan.rename(columns={'MD': 'DEPT'})
merged_df4 = df4_elan_renamed.merge(df4_sonic, on='DEPT')

In [27]:
merged_df3.columns

Index(['DEPT', 'BOUND_WATER', 'CALCITE', 'CHLORITE', 'CLASSIFICATION_IPSOM',
       'DOLOMITE', 'DRY_WEIGHT_CALCITE', 'DRY_WEIGHT_CHLORITE',
       'DRY_WEIGHT_DOLOMITE', 'DRY_WEIGHT_HEMATITE', 'DRY_WEIGHT_ILLITE',
       'DRY_WEIGHT_K-FELDSPAR', 'DRY_WEIGHT_KAOLINITE',
       'DRY_WEIGHT_N-FELDSPAR', 'DRY_WEIGHT_PYRITE', 'DRY_WEIGHT_QUARTZ',
       'DRY_WEIGHT_SIDERITE', 'HEMATITE', 'HRA_CLASS', 'ILLITE', 'K-FELDSPAR',
       'KAOLINITE', 'KSDR_PY', 'N-FELDSPAR', 'PHIT', 'PIGE', 'PIGN', 'PYRITE',
       'QUARTZ', 'RHGA_GEO', 'SIDERITE', 'SW', 'SWE', 'SWI_GEO', 'UIWATER',
       'UWATER', 'VCL_GEO', 'XIWATER', 'XWATER', 'RLA1', 'RLA2', 'RLA3',
       'RLA4', 'RLA5', 'BS', 'HCAL', 'HDAR', 'HD1_PPC1', 'HD2_PPC1', 'TENS',
       'GR_EDTC', 'NPHI', 'NPOR', 'PEFZ', 'RHOZ', 'DPHZ', 'SPHI', 'VCL_HILT',
       'DTCO', 'DTSM_FAST', 'DTST', 'PR_FAST', 'VPVS_FAST', 'MAXXENE_OVERALL',
       'MINXENE_OVERALL', 'FLAG'],
      dtype='object')

In [28]:
# Excluding Hematite and making a df with columns we need
def get_proper_columns(df, cols_list):
    """Return only the columns named in ``cols_list``, in that order.

    ``df`` itself is not modified. A name in ``cols_list`` that is not a
    column of ``df`` makes pandas raise ``KeyError``.
    """
    selected = df[cols_list]
    return selected

In [29]:
# Mineral / fluid volume curves kept for every well (HEMATITE is deliberately left out)
mineral_cols = ['BOUND_WATER', 'CALCITE', 'CHLORITE', 'DOLOMITE',
                'ILLITE', 'K-FELDSPAR', 'KAOLINITE', 'QUARTZ', 'UWATER']

# TRM2: the merged frame carries the volume curves with an `_x` suffix and
# names the shear log DTSH_FAST; both are normalised to the common names
suffixed_to_plain = {f'{name}_x': name for name in mineral_cols}
cols_1 = list(suffixed_to_plain) + ['DEPT', 'DTCO', 'DTSH_FAST']
df1 = get_proper_columns(merged_df1, cols_1).rename(
    columns={'DTSH_FAST': 'DTSM_FAST', **suffixed_to_plain})

# CCS1: the only well for which the density log (RHOZ) is kept
cols_2 = mineral_cols + ['DEPT', 'DTCO', 'DTSM_FAST', 'RHOZ']
df2 = get_proper_columns(merged_df2, cols_2)

# VW1
cols_3 = mineral_cols + ['DEPT', 'DTCO', 'DTSM_FAST']
df3 = get_proper_columns(merged_df3, cols_3)

# CCS2: this file spells the feldspar curve with an underscore
cols_4 = (['K_FELDSPAR' if name == 'K-FELDSPAR' else name for name in mineral_cols]
          + ['DEPT', 'DTCO', 'DTSM_FAST'])
df4 = get_proper_columns(merged_df4, cols_4).rename(columns={'K_FELDSPAR': 'K-FELDSPAR'})

In [30]:
df1.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,DTCO,DTSM_FAST
0,,,,,,,,,,4357.25,49.79515,93.15175
1,,,,,,,,,,4357.75,50.36954,91.55038
2,,,,,,,,,,4358.25,50.84312,89.19648
3,,,,,,,,,,4358.75,50.95559,84.402
4,,,,,,,,,,4359.25,50.59177,81.83669


In [31]:
# Percent of missing cells: null values / (rows * columns) * 100
def missing_value_percent(df):
    """Print the percentage of cells in ``df`` that are null.

    The figure is the number of null cells divided by ``rows * columns``,
    multiplied by 100. The value is printed only; nothing is returned.
    """
    n_rows, n_cols = len(df), len(df.columns)
    null_cells = df.isnull().sum().sum()
    print((null_cells / (n_rows * n_cols)) * 100)

In [32]:
missing_value_percent(df1)

1.2934439178515007


In [33]:
missing_value_percent(df2)

2.803885534260961


In [34]:
missing_value_percent(df3)

0.0


In [35]:
df3.isnull().sum()

BOUND_WATER    0
CALCITE        0
CHLORITE       0
DOLOMITE       0
ILLITE         0
K-FELDSPAR     0
KAOLINITE      0
QUARTZ         0
UWATER         0
DEPT           0
DTCO           0
DTSM_FAST      0
dtype: int64

In [36]:
missing_value_percent(df4)

0.020574015019030965


In [41]:
def missing_drop(df):
    """Return ``df`` without any row that contains a null, re-indexed 0..n-1.

    The input frame is left untouched; a new frame is returned.

    ``reset_index(drop=True)`` discards the old index outright. The previous
    two-step approach (``reset_index()`` then ``drop(columns=['index'])``)
    raised ``KeyError`` when the index had a name, and removed a genuine data
    column called ``'index'`` (leaving a stray ``'level_0'``) when one existed.
    """
    return df.dropna().reset_index(drop=True)

In [42]:
# Drop every row that has a null in any of the selected columns, well by well,
# and renumber the remaining rows from 0.
df1 = missing_drop(df1)
df2 = missing_drop(df2)
df3 = missing_drop(df3)
df4 = missing_drop(df4)

In [43]:
def calculate_voigt_reuss_bounds(df, minerals, bulk_modulus, shear_modulus):
    """Add Voigt, Reuss and Voigt-Reuss-Hill moduli columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; must contain a volume-fraction column for every
        name in ``minerals``. The frame is modified in place.
    minerals : list of str
        Column names of the constituents to include.
    bulk_modulus, shear_modulus : mapping of str -> number
        Modulus of each constituent, keyed by the names in ``minerals``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with six columns added, in this order:
        ``Voigt_Bulk``, ``Reuss_Bulk``, ``Voigt_Shear``, ``Reuss_Shear``
        (in the units of the supplied moduli), then ``K_VRH`` and ``G_VRH``,
        the arithmetic means of the respective Voigt and Reuss values.

    Notes
    -----
    * Voigt value: sum of ``fraction * modulus``.
    * Reuss value: ``1 / sum(fraction / modulus)``. Constituents whose modulus
      is not positive are left out of that sum, and a sum that is not
      positive (including NaN) gives a Reuss value of 0.
    * Results are computed for all rows at once and written positionally, so
      frames with repeated index labels get a correct value per row and empty
      frames simply receive empty columns.

    Raises
    ------
    KeyError
        If a name in ``minerals`` is missing from ``df`` or from either
        modulus mapping.
    """
    fractions = df[minerals].to_numpy(dtype=float)
    k_moduli = np.array([bulk_modulus[name] for name in minerals], dtype=float)
    g_moduli = np.array([shear_modulus[name] for name in minerals], dtype=float)

    def _voigt(moduli):
        # Arithmetic (iso-strain) average, one value per row.
        return (fractions * moduli).sum(axis=1)

    def _reuss(moduli):
        # Harmonic (iso-stress) average; entries excluded by `where` keep the 0
        # they were initialised with, which reproduces the "avoid /0" guards.
        compliance = np.divide(fractions, moduli,
                               out=np.zeros_like(fractions), where=moduli > 0)
        total = compliance.sum(axis=1)
        return np.divide(1.0, total, out=np.zeros_like(total), where=total > 0)

    df['Voigt_Bulk'] = _voigt(k_moduli)
    df['Reuss_Bulk'] = _reuss(k_moduli)
    df['Voigt_Shear'] = _voigt(g_moduli)
    df['Reuss_Shear'] = _reuss(g_moduli)

    df['K_VRH'] = (df['Voigt_Bulk'] + df['Reuss_Bulk']) / 2
    df['G_VRH'] = (df['Voigt_Shear'] + df['Reuss_Shear']) / 2

    return df

In [44]:
# TEST CASE for V-R Boundaries Function
# A single synthetic sample is run through calculate_voigt_reuss_bounds and the
# six derived columns are checked against reference values, so a regression
# fails loudly instead of relying on someone eyeballing the displayed table.
minerals = [
    'Clay', 'Quartz', 'K-feldspar', 'P-feldspar', 'Calcite',
    'Dolomite', 'Siderite', 'Pyrite', 'Magnetite', 'Hematite', 'Ilmenite', 'Air'
]

# NOTE(review): 'Air' uses 2.2, the same bulk modulus the production table
# assigns to the water columns - confirm that this is intended.
bulk_modulus = {
    'Clay': 1.5, 'Quartz': 37, 'K-feldspar': 37.5, 'P-feldspar': 75.6,
    'Calcite': 76.8, 'Dolomite': 94.9, 'Siderite': 123.7, 'Pyrite': 147.4,
    'Magnetite': 161.4, 'Hematite': 100.2, 'Ilmenite': 168, 'Air': 2.2
}

shear_modulus = {
    'Clay': 1.4, 'Quartz': 44, 'K-feldspar': 15, 'P-feldspar': 25.6,
    'Calcite': 32, 'Dolomite': 45, 'Siderite': 51, 'Pyrite': 132.5,
    'Magnetite': 91.4, 'Hematite': 95.2, 'Ilmenite': 64, 'Air': 0.1
}

# Volume fractions of the synthetic sample (one row)
data = {
    'Clay': [0.07], 'Quartz': [0.32], 'K-feldspar': [0.1188], 'P-feldspar': [0.2574],
    'Calcite': [0.0099], 'Dolomite': [0.0198], 'Siderite': [0.0594], 'Pyrite': [0.0297],
    'Magnetite': [0.0495], 'Hematite': [0.0297], 'Ilmenite': [0.0396], 'Air': [0.01]
}
test_df = pd.DataFrame(data)

test_df = calculate_voigt_reuss_bounds(test_df, minerals, bulk_modulus, shear_modulus)

# Reference values to six decimals, hence the relative tolerance of 1e-5
expected_bounds = {
    'Voigt_Bulk': 67.86438, 'Reuss_Bulk': 14.643094,
    'Voigt_Shear': 40.60903, 'Reuss_Shear': 5.591029,
    'K_VRH': 41.253737, 'G_VRH': 23.10003,
}
np.testing.assert_allclose(
    test_df.loc[0, list(expected_bounds)].to_numpy(dtype=float),
    list(expected_bounds.values()),
    rtol=1e-5,
)
test_df

Unnamed: 0,Clay,Quartz,K-feldspar,P-feldspar,Calcite,Dolomite,Siderite,Pyrite,Magnetite,Hematite,Ilmenite,Air,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH
0,0.07,0.32,0.1188,0.2574,0.0099,0.0198,0.0594,0.0297,0.0495,0.0297,0.0396,0.01,67.86438,14.643094,40.60903,5.591029,41.253737,23.10003


In [45]:
# Constituent columns fed to calculate_voigt_reuss_bounds for the real wells.
minerals_list = ['CALCITE','DOLOMITE','UWATER', 'CHLORITE', 'ILLITE', 'KAOLINITE','BOUND_WATER'
                    ,'K-FELDSPAR','QUARTZ']

# Bulk modulus per constituent. These bindings replace the dicts of the same
# name defined by the test-case cell. Units are presumably GPa (the velocity
# helpers later multiply the derived moduli by 1e9 under a "GPa" comment).
# The three clay columns share one value, as do the two water columns.
bulk_modulus = {
    'KAOLINITE': 1.5,
    'CHLORITE':1.5, 
    'ILLITE':1.5,
    'QUARTZ': 37,
    'K-FELDSPAR': 37.5,
    'CALCITE': 76.8,
    'DOLOMITE': 94.9,
    #'Hematite': 206.6,
    'BOUND_WATER':2.2,
    'UWATER': 2.2
}

# Shear modulus per constituent. Water gets a tiny positive value rather than
# 0: calculate_voigt_reuss_bounds skips non-positive moduli in the Reuss sum,
# so an exact 0 would drop the fluid term instead of pulling the bound down.
shear_modulus = {
    'KAOLINITE': 1.4,
    'CHLORITE':1.4, 
    'ILLITE':1.4,
    'QUARTZ': 44,
    'K-FELDSPAR': 15,
    'CALCITE': 32,
    'DOLOMITE': 45,
    #'Hematite': 91.0,
    'BOUND_WATER': 0.0001,
    'UWATER': 0.0001
}

# Adds Voigt_*/Reuss_*/K_VRH/G_VRH columns to each well's frame.
df1 = calculate_voigt_reuss_bounds(df1, minerals_list, bulk_modulus, shear_modulus)
df2 = calculate_voigt_reuss_bounds(df2, minerals_list, bulk_modulus, shear_modulus)
df3 = calculate_voigt_reuss_bounds(df3, minerals_list, bulk_modulus, shear_modulus)
df4 = calculate_voigt_reuss_bounds(df4, minerals_list, bulk_modulus, shear_modulus)

In [46]:
# Train Test Split (one whole well is held out for testing)
# Train : TRM2 (df1), VW1 (df3), CCS2 (df4)
# Test  : CCS1 (df2) - the only frame that kept RHOZ, which the velocity-bound
#         calculations applied to the test frames require

# ignore_index=True: every per-well frame is numbered from 0, so concatenating
# them as-is would leave repeated row labels in the combined frames.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
train_df = pd.concat([df1, df3, df4], ignore_index=True)
test_df = df2

In [47]:
# Percent of Total Records Missing
missing_value_percent(df)

4.254993604838806


In [48]:
test_df.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,DTCO,DTSM_FAST,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,53.47908,103.45393,2.62311,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,54.30895,102.99964,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,54.61615,103.17477,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,55.44879,104.25208,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,57.11026,106.35674,2.58807,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809


In [49]:
# One pair of frames per prediction target: the *_dtco frames keep only the
# compressional log (DTCO), the *_dtsm frames keep only the shear log (DTSM_FAST).
train_df_dtco, test_df_dtco = (
    frame.drop(columns=['DTSM_FAST']) for frame in (train_df, test_df)
)

train_df_dtsm, test_df_dtsm = (
    frame.drop(columns=['DTCO']) for frame in (train_df, test_df)
)

In [50]:
def convert_velocity(df, target, col):
    """Replace a slowness column (microseconds/foot) by a velocity column.

    The values of ``df[target]`` are scaled by ``0.000001 / 0.3048`` to get
    seconds per metre, inverted, and stored in ``df[col]``; ``target`` is then
    removed. ``df`` is modified in place and the same object is returned.

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``df``.
    """
    has_target = target in df.columns
    if not has_target:
        raise ValueError(f"Column {target} not found in DataFrame.")

    # microseconds/foot -> seconds/metre
    us_per_ft_to_s_per_m = 0.000001 / 0.3048
    slowness_s_per_m = df[target] * us_per_ft_to_s_per_m
    df[col] = 1 / slowness_s_per_m
    df.drop(columns=[target], inplace=True)
    return df

# convert_velocity works in place and returns the frame it was given, so each
# *_vp / *_vs name below is the same object as the *_dtco / *_dtsm frame passed
# in. The cell is not re-runnable as-is: after one pass the slowness columns are
# gone and convert_velocity raises ValueError.
df = convert_velocity(df, 'DTCO', 'VP')
train_df_vp = convert_velocity(train_df_dtco, "DTCO", 'VP')
test_df_vp = convert_velocity(test_df_dtco, "DTCO", 'VP')

df = convert_velocity(df, 'DTSM_FAST', 'VS')
train_df_vs = convert_velocity(train_df_dtsm, 'DTSM_FAST', 'VS')
test_df_vs = convert_velocity(test_df_dtsm, 'DTSM_FAST', 'VS')

In [51]:
test_df_vs

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS
0,0.00026,0.08100,0.00000,0.47844,0.00000,0.23040,0.00381,0.10848,0.05685,5101.0,2.62311,64.409873,22.898393,32.356260,0.001751,43.654133,16.179005,2946.238968
1,0.00008,0.06900,0.00000,0.47355,0.00000,0.23773,0.00114,0.11768,0.05759,5101.5,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644
2,0.00003,0.04209,0.00000,0.48604,0.00000,0.24202,0.00041,0.12550,0.05397,5102.0,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608
3,0.00036,0.05095,0.00000,0.43499,0.00000,0.25269,0.00529,0.15076,0.04891,5102.5,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.819090,2923.682674
4,0.00088,0.08433,0.00000,0.29721,0.00000,0.27180,0.01301,0.23370,0.03773,5103.0,2.58807,53.625630,22.718279,30.451028,0.002590,38.171954,15.226809,2865.826839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4212,0.00940,0.00000,0.02083,0.12972,0.04289,0.16409,0.02395,0.51980,0.00001,7207.0,2.70370,37.848610,12.118974,31.292689,0.010617,24.983792,15.651653,3009.357443
4213,0.00931,0.00000,0.02140,0.19529,0.04213,0.14979,0.02296,0.47591,0.00001,7207.5,2.69806,41.909055,12.374546,32.096027,0.010720,27.141800,16.053373,3009.357443
4214,0.00981,0.00000,0.02457,0.21701,0.04374,0.14476,0.02161,0.46449,0.00399,7208.0,2.69366,43.374119,11.775766,32.500299,0.007242,27.574942,16.253771,3009.357443
4215,0.00992,0.00002,0.02629,0.22285,0.04463,0.11656,0.01845,0.49333,0.00600,7208.5,2.67802,43.943290,11.681370,33.608930,0.006278,27.812330,16.807604,3009.357443


In [54]:
def calculate_vp(df):
    """Add P-wave velocities for the Voigt and Reuss moduli to ``df``.

    Reads ``Voigt_Bulk``, ``Voigt_Shear``, ``Reuss_Bulk``, ``Reuss_Shear``
    (scaled from GPa to Pa) and ``RHOZ`` (scaled from g/cm^3 to kg/m^3), then
    writes ``VOIGT_VP`` and ``REUSS_VP`` as ``sqrt((K + 4/3 * G) / rho)``.
    ``df`` is modified in place and returned.
    """
    pa_per_gpa = 1e9  # 1 GPa = 1e9 Pa
    moduli_pa = {
        name: df[name] * pa_per_gpa
        for name in ('Voigt_Bulk', 'Voigt_Shear', 'Reuss_Bulk', 'Reuss_Shear')
    }

    density = df['RHOZ'] * 1000  # 1 g/cm^3 = 1000 kg/m^3

    voigt_p_modulus = moduli_pa['Voigt_Bulk'] + 4.0/3.0 * moduli_pa['Voigt_Shear']
    reuss_p_modulus = moduli_pa['Reuss_Bulk'] + 4.0/3.0 * moduli_pa['Reuss_Shear']
    df['VOIGT_VP'] = np.sqrt(voigt_p_modulus / density)
    df['REUSS_VP'] = np.sqrt(reuss_p_modulus / density)

    return df

# Only the hold-out (CCS1) frame is processed: it is the one that carries RHOZ.
# calculate_vp adds VOIGT_VP / REUSS_VP to it in place.
test_df_vp = calculate_vp(test_df_vp)
test_df_vp.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VP,VRH_VP,VOIGT_VP,REUSS_VP
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,5699.424897,4986.577661,6403.24458,2954.720357
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,5612.334615,4988.943592,6384.41977,3003.048539
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,5580.766861,4999.528863,6367.070757,3074.245872
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,5496.963955,4918.052674,6265.390794,3019.828207
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,5337.044517,4753.294818,6033.921752,2963.007179


In [55]:
def calculate_vpvrh(df):
    """Add the P-wave velocity of the Voigt-Reuss-Hill moduli to ``df``.

    Reads ``K_VRH`` and ``G_VRH`` (scaled from GPa to Pa) and ``RHOZ`` (scaled
    from g/cm^3 to kg/m^3) and writes ``VRH_VP = sqrt((K + 4/3 * G) / rho)``.
    ``df`` is modified in place and returned.
    """
    pa_per_gpa = 1e9  # 1 GPa = 1e9 Pa
    bulk_pa = df['K_VRH'] * pa_per_gpa
    shear_pa = df['G_VRH'] * pa_per_gpa

    density = df['RHOZ'] * 1000  # 1 g/cm^3 = 1000 kg/m^3

    p_wave_modulus = bulk_pa + 4.0/3.0 * shear_pa
    df['VRH_VP'] = np.sqrt(p_wave_modulus / density)
    return df

# Adds VRH_VP to the hold-out frame (in place), next to the Voigt/Reuss velocities.
test_df_vp = calculate_vpvrh(test_df_vp)
test_df_vp.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VP,VRH_VP,VOIGT_VP,REUSS_VP
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,5699.424897,4986.577661,6403.24458,2954.720357
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,5612.334615,4988.943592,6384.41977,3003.048539
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,5580.766861,4999.528863,6367.070757,3074.245872
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,5496.963955,4918.052674,6265.390794,3019.828207
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,5337.044517,4753.294818,6033.921752,2963.007179


In [56]:
def calculate_vs(df):
    """Add S-wave velocities for the Voigt and Reuss shear moduli to ``df``.

    Reads ``Voigt_Shear`` and ``Reuss_Shear`` (scaled from GPa to Pa) and
    ``RHOZ`` (scaled from g/cm^3 to kg/m^3), then writes ``VOIGT_VS`` and
    ``REUSS_VS`` as ``sqrt(G / rho)``. ``df`` is modified in place and returned.
    """
    pa_per_gpa = 1e9  # 1 GPa = 1e9 Pa
    voigt_shear_pa = df['Voigt_Shear'] * pa_per_gpa
    reuss_shear_pa = df['Reuss_Shear'] * pa_per_gpa

    density = df['RHOZ'] * 1000  # 1 g/cm^3 = 1000 kg/m^3

    for out_col, shear_pa in (('VOIGT_VS', voigt_shear_pa), ('REUSS_VS', reuss_shear_pa)):
        df[out_col] = np.sqrt(shear_pa / density)

    return df

# Adds VOIGT_VS / REUSS_VS to the hold-out shear frame (in place); as with the
# P-wave bounds, this needs RHOZ, which only the hold-out well kept.
test_df_vs = calculate_vs(test_df_vs)
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS,VOIGT_VS,REUSS_VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,2.62311,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968,3512.132604,25.835856
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644,3511.446274,25.742169
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608,3512.65488,26.567101
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674,3480.021737,27.872973
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,2.58807,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839,3430.14887,31.632862


In [57]:
def calculate_vsvrh(df):
    """Add the S-wave velocity of the Voigt-Reuss-Hill shear modulus to ``df``.

    Reads ``G_VRH`` (scaled from GPa to Pa) and ``RHOZ`` (scaled from g/cm^3
    to kg/m^3) and writes ``VRH_VS = sqrt(G / rho)``. ``df`` is modified in
    place and returned.
    """
    shear_pa = df['G_VRH'] * 1e9    # 1 GPa = 1e9 Pa
    density = df['RHOZ'] * 1000     # 1 g/cm^3 = 1000 kg/m^3

    shear_over_density = shear_pa / density
    df['VRH_VS'] = np.sqrt(shear_over_density)
    return df

# Adds VRH_VS to the hold-out shear frame (in place).
test_df_vs = calculate_vsvrh(test_df_vs)
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS,VOIGT_VS,REUSS_VS,VRH_VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968,3512.132604,25.835856,2483.519974
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644,3511.446274,25.742169,2483.034192
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608,3512.65488,26.567101,2483.893125
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674,3480.021737,27.872973,2460.825897
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839,3430.14887,31.632862,2425.584662


In [58]:
df.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,RHOZ,VP,VS
0,0.00139,0.92588,0.0,0.0,0.00081,0.04413,0.01871,0.00906,1e-05,4361.25,73.130039,36.863518,30.716078,0.071195,54.996778,15.393636,,6263.51006,3595.064147
1,0.00291,0.9129,0.0,0.0,0.01286,0.04058,0.01871,0.01203,1e-05,4361.75,72.131359,28.036393,30.395018,0.034183,50.083876,15.214601,,6481.686259,3574.159111
2,0.00468,0.89792,0.0,0.0,0.0269,0.03663,0.01871,0.01373,0.00144,4362.25,70.92377,21.631896,29.950865,0.016323,46.277833,14.983594,,6563.708892,3548.413197
3,0.00407,0.88793,0.0,0.0,0.02208,0.03811,0.01871,0.01604,0.01306,4362.75,70.3145,20.837212,29.748278,0.005836,45.575856,14.877057,,6334.250151,3545.193129
4,0.00134,0.88843,0.0,0.0,0.00042,0.04504,0.01871,0.01929,0.02676,4363.25,70.724669,25.762189,29.980905,0.003558,48.243429,14.992231,,6141.289554,3560.947343


In [59]:
train_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS
0,0.00139,0.92588,0.0,0.0,0.00081,0.04413,0.01871,0.00906,1e-05,4361.25,73.130039,36.863518,30.716078,0.071195,54.996778,15.393636,3595.064147
1,0.00291,0.9129,0.0,0.0,0.01286,0.04058,0.01871,0.01203,1e-05,4361.75,72.131359,28.036393,30.395018,0.034183,50.083876,15.214601,3574.159111
2,0.00468,0.89792,0.0,0.0,0.0269,0.03663,0.01871,0.01373,0.00144,4362.25,70.92377,21.631896,29.950865,0.016323,46.277833,14.983594,3548.413197
3,0.00407,0.88793,0.0,0.0,0.02208,0.03811,0.01871,0.01604,0.01306,4362.75,70.3145,20.837212,29.748278,0.005836,45.575856,14.877057,3545.193129
4,0.00134,0.88843,0.0,0.0,0.00042,0.04504,0.01871,0.01929,0.02676,4363.25,70.724669,25.762189,29.980905,0.003558,48.243429,14.992231,3560.947343


In [60]:
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS,VOIGT_VS,REUSS_VS,VRH_VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968,3512.132604,25.835856,2483.519974
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644,3511.446274,25.742169,2483.034192
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608,3512.65488,26.567101,2483.893125
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674,3480.021737,27.872973,2460.825897
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839,3430.14887,31.632862,2425.584662


## Exploratory Data Analysis

In [None]:
# Sweetviz reports are written as standalone HTML files next to the notebook.
# (A stray bare name at the top of this cell used to raise NameError before
# any report was produced; it has been removed.)

# Analyze the single dataframe 'df'
report_df = sv.analyze(df)
report_df.show_html('report_df.html')

# Analyze the training dataframe 'train_df_vs'
report_train = sv.analyze(train_df_vs)
report_train.show_html('report_train_df_vs.html')

# Analyze the test dataframe 'test_df_vs'
report_test = sv.analyze(test_df_vs)
report_test.show_html('report_test_df_vs.html')

# Comparative report between the two datasets
compare_report = sv.compare([train_df_vs, "Training Data"], [test_df_vs, "Test Data"])
compare_report.show_html('compare_report.html')

In [62]:
test_df_vs.shape

(4217, 21)

In [63]:
train_df_vp.shape

(17798, 17)

In [64]:
df.shape

(22015, 19)

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
Path("DATASETS").mkdir(parents=True, exist_ok=True)

# Combined frame (all four wells)
df.to_csv("DATASETS/df.csv", index=False)

# P-wave target: training wells and hold-out well
train_df_vp.to_csv("DATASETS/train_df_vp.csv", index=False)
test_df_vp.to_csv("DATASETS/test_df_vp.csv", index=False)

# S-wave target: training wells and hold-out well
train_df_vs.to_csv("DATASETS/train_df_vs.csv", index=False)
test_df_vs.to_csv("DATASETS/test_df_vs.csv", index=False)