In [47]:
import lasio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sweetviz as sv
import seaborn as sns

In [48]:
def las_read(file):
    las = lasio.read(file)
    header = las.header
    data = las.df()
    data.reset_index(inplace = True)
    return data, header

### Importing all the files

In [49]:
# TRM2
df1_sonic, h11 = las_read('Datasets/Input Data/TRM2/1202125650_20190118P_TRM2_Elan_Petrophysical_Analysis_w_EC_logs.las')
df1_elan, h12 = las_read('Datasets/Input Data/TRM2/1202125650_20190118P_TRM2_Elan_Petrophysical_Analysis.las')
df1_rho, h121 = las_read('Datasets/Input Data/TRM2/1202125650_20181203_TRM2_2A_PEX-AIT_MAIN_CUSTOMER_R1_E85I_03.las')

# CCS1
df2_sonic, h21 = las_read('Datasets/Input Data/CCS1/1211523415_20090426m_Sonic_P_and_S_Output_7240ft_350ft.las')
df2_elan, h22 = las_read('Datasets/Input Data/CCS1/1211523415_2020_CCS1_Elan_CoreCal.las')
df2_rho, h23 = las_read('Datasets/Input Data/CCS1/1211523415_20200727_CCS1_Compiled_Raw_Logs_Petrel.las')

# VW1
df3_sonic, h31 = las_read('Datasets/Input Data/VW1/1211523460_20101116_VW1_PandS_5250-7228.las')
df3_elan, h32 = las_read('Datasets/Input Data/VW1/1211523460_2020_VW1_Elan_CoreCal.las')

# CCS2
df4_sonic, h41 = las_read('Datasets/Input Data/CCS2/1211523713_20150503_Sonic_Intermediate_Anisotropy_PnS_350_5120ft.las')
df4_elan, h42 = las_read('Datasets/Input Data/CCS2/1211523713_20150529_ELAN_PetrophysicalAnalysis_344-7200ft.las')

# VW2
df5_sonic, h51 = las_read('Datasets/Input Data/VW2/1211523552_20121031_Sonic_PNS.las')
df5_elan, h52 = las_read('Datasets/Input Data/VW2/1211523552_20121030_ELAN_iteration_v20160209.las')

In [7]:
def adjust_depth(df, column, suffix_map):
    """
    Generalized function to adjust depth or measured depth values in a DataFrame.

    Parameters:
    -----------
    df (pandas.DataFrame): The input DataFrame.
    column (str): The name of the column to be adjusted (e.g., 'DEPT', 'MD').
    suffix_map (dict): A mapping of suffixes (as strings) to their replacements.

    Returns:
    --------
    pandas.DataFrame: The modified DataFrame with adjusted column values.
    """
    def adjust_value(value):
        value_str = str(value)
        for suffix, replacement in suffix_map.items():
            if value_str.endswith(suffix):
                return float(f"{int(value)}{replacement}")
        return value

    df[column] = df[column].apply(adjust_value)
    return df

# Adjusting 'DEPT' column in df1_rho
suffix_map_dept = {'.0': '.25', '.5': '.75'}
df1_rho = adjust_depth(df1_rho, column='DEPT', suffix_map=suffix_map_dept)

# Adjusting 'MD' column in df5_elan
suffix_map_md = {'.2': '.0', '.7': '.5'}
df5_elan = adjust_depth(df5_elan, column='MD', suffix_map=suffix_map_md)

### Dataset Rearrangement

In [8]:
# Merging Datasets
merged_df1 = pd.merge(df1_elan, df1_sonic, on='MD')
merged_df1 = merged_df1.rename(columns={'MD': 'DEPT'})
merged_df1 = pd.merge(df1_rho, merged_df1, on='DEPT')

df2_elan_renamed = df2_elan.rename(columns={'MD': 'DEPT'})
merged_df2 = pd.merge(df2_elan_renamed, df2_sonic, on='DEPT')

df3_elan_renamed = df3_elan.rename(columns={'MD': 'DEPT'})
merged_df3 = pd.merge(df3_elan_renamed, df3_sonic, on='DEPT')

df4_elan_renamed = df4_elan.rename(columns={'MD': 'DEPT'})
merged_df4 = pd.merge(df4_elan_renamed, df4_sonic, on='DEPT')

df5_elan_renamed = df5_elan.rename(columns={'MD': 'DEPT'})
merged_df5 = pd.merge(df5_elan_renamed, df5_sonic, on='DEPT')

In [9]:
# Excluding Hematite and making a df with columns we need
def get_proper_columns(df, cols_list):
    df_cols = cols_list
    df_a = df[df_cols]
    return df_a

In [10]:
merged_df4.columns

Index(['DEPT', 'ANHYDRITE', 'BFV', 'BOUND_WATER', 'BS_x', 'CALCITE',
       'CHLORITE', 'DOLOMITE', 'DRY_WEIGHT_ANHYDRITE', 'DRY_WEIGHT_CALCITE',
       'DRY_WEIGHT_CHLORITE', 'DRY_WEIGHT_DOLOMITE', 'DRY_WEIGHT_ILLITE',
       'DRY_WEIGHT_K_FELDSPAR', 'DRY_WEIGHT_KAOLINITE',
       'DRY_WEIGHT_N_FELDSPAR', 'DRY_WEIGHT_PYRITE', 'DRY_WEIGHT_QUARTZ',
       'DSOZ', 'GR_x', 'GR_EDTC', 'HCAL', 'HCGR', 'HSGR', 'ILLITE',
       'K_FELDSPAR', 'KAOLINITE', 'KINT_GEO', 'KSDRBB', 'N_FELDSPAR', 'NPOR',
       'NPOR_LIM_EC', 'NPOR_SAN_EC', 'PEFZ', 'PHIT', 'PIGE', 'PIGN', 'PYRITE',
       'QUARTZ', 'RHGA_GEO', 'RHOZ_x', 'RLA1', 'RLA2', 'RLA3', 'RLA4', 'RLA5',
       'RSOZ', 'RXOZ', 'SW', 'SXO', 'T2CUTOFF', 'T2LM', 'TCMR', 'UGAS',
       'UIWATER', 'UOIL', 'UWATER', 'UZ', 'XGAS', 'XIWATER', 'XOIL', 'XWATER',
       'GR_y', 'RHOZ_y', 'BS_y', 'P1AZ', 'SDEVM', 'HAZIM', 'RB', 'DEVI',
       'HD1_PPC1', 'HD1_PPC2', 'HD2_PPC1', 'HD2_PPC2', 'TENS', 'HDAR',
       'MINXENE_OVERALL', 'MAXXENE_OVERALL', 'FSH_A

In [11]:
# Reordering and renaming datasets to a consistent format
cols_1 = ['BOUND_WATER_x','CALCITE_x','CHLORITE_x','DOLOMITE_x',#'HEMATITE',
            'ILLITE_x','K-FELDSPAR_x','KAOLINITE_x','QUARTZ_x','UWATER_x', 
              'DEPT', 'DTCO', 'DTSH_FAST', 'RHOZ']
df1 = get_proper_columns(merged_df1, cols_1)
df1 = df1.rename(columns={'DTSH_FAST': 'DTSM_FAST', 'BOUND_WATER_x':'BOUND_WATER',
                          'CALCITE_x':'CALCITE','CHLORITE_x':'CHLORITE','DOLOMITE_x':
                          'DOLOMITE', 'ILLITE_x':'ILLITE','K-FELDSPAR_x':'K-FELDSPAR',
                          'KAOLINITE_x':'KAOLINITE','QUARTZ_x':'QUARTZ','UWATER_x':'UWATER'})

cols_2 = ['BOUND_WATER','CALCITE','CHLORITE','DOLOMITE',#'HEMATITE',
            'ILLITE','K-FELDSPAR','KAOLINITE','QUARTZ','UWATER', 'DEPT', 'DTCO', 'DTSM_FAST','RHOZ']
df2 = get_proper_columns(merged_df2, cols_2)

cols_3 = ['BOUND_WATER','CALCITE','CHLORITE','DOLOMITE',#'HEMATITE',
            'ILLITE','K-FELDSPAR','KAOLINITE','QUARTZ','UWATER', 'DEPT', 'DTCO', 'DTSM_FAST', 'RHOZ']
df3 = get_proper_columns(merged_df3, cols_3)

cols_4 = ['BOUND_WATER','CALCITE','CHLORITE','DOLOMITE',#'HEMATITE',
            'ILLITE','K_FELDSPAR','KAOLINITE','QUARTZ','UWATER', 'DEPT', 'DTCO', 'DTSM_FAST', 'RHOZ_x']
df4 = get_proper_columns(merged_df4, cols_4)
df4 = df4.rename(columns={'K_FELDSPAR': 'K-FELDSPAR', 'RHOZ_x': 'RHOZ'})

cols_5 = ['BOUND_WATER','CALCITE','CHLORITE','DOLOMITE',#'HEMATITE',
            'ILLITE','K_FELDSPAR','KAOLINITE','QUARTZ','UWATER', 'DEPT', 'DTCO', 'DTSM', 'RHOZ_x']
df5 = get_proper_columns(merged_df5, cols_5)
df5 = df5.rename(columns={'DTSM': 'DTSM_FAST', 'K_FELDSPAR': 'K-FELDSPAR', 'RHOZ_x': 'RHOZ'})

### Handling Missing Data

In [13]:
# Percent of Records Missing
def missing_value_percent(df):
    total_records = len(df)
    total_missing = df.isnull().sum().sum()
    percent_missing_total = (total_missing / (total_records * len(df.columns))) * 100
    print(percent_missing_total)

In [14]:
missing_value_percent(df1)

1.2085308056872037


In [15]:
missing_value_percent(df2)

2.803885534260961


In [16]:
missing_value_percent(df3)

0.0


In [17]:
missing_value_percent(df4)

0.018991398479105505


In [18]:
missing_value_percent(df5)

1.7743182267603863


In [19]:
def missing_drop(df):
    df = df.dropna()
    df.reset_index(inplace=True)
    df=df.drop(columns =['index'])
    return df

In [20]:
df1 = missing_drop(df1)
df2 = missing_drop(df2)
df3 = missing_drop(df3)
df4 = missing_drop(df4)
df5 = missing_drop(df5)

In [21]:
df5

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,DTCO,DTSM_FAST,RHOZ
0,0.03771,0.0,0.03154,0.47394,0.22472,0.02298,0.06837,0.09192,0.00000,5306.5,67.74281,121.77684,2.58802
1,0.03764,0.0,0.02957,0.45919,0.22500,0.02624,0.07058,0.10495,0.00000,5307.0,64.68021,122.02647,2.54553
2,0.03624,0.0,0.02738,0.48194,0.21705,0.02351,0.06929,0.09403,0.00000,5307.5,62.02827,121.66800,2.52025
3,0.03400,0.0,0.01668,0.55457,0.20693,0.01022,0.07604,0.04086,0.00000,5308.0,60.95219,121.04729,2.50908
4,0.03380,0.0,0.01240,0.59635,0.20720,0.00000,0.08069,0.00000,0.00000,5308.5,64.22356,120.51717,2.51096
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,0.00000,0.0,0.00000,0.23565,0.00000,0.33617,0.00000,0.31129,0.01499,7174.0,47.26740,89.87509,2.98712
3736,0.00000,0.0,0.00000,0.24785,0.00000,0.33411,0.00000,0.30520,0.01499,7174.5,47.31889,89.63716,2.99633
3737,0.00000,0.0,0.00000,0.24125,0.00000,0.33731,0.00000,0.30776,0.01499,7175.0,47.22272,89.27200,2.99191
3738,0.00801,0.0,0.01899,0.23096,0.04435,0.30294,0.00000,0.29553,0.01499,7175.5,47.76002,89.04203,2.97761


### Calculating Voigt-Reuss Boundaries

In [23]:
def calculate_voigt_reuss_bounds(df, minerals, bulk_modulus, shear_modulus):
    """
    Calculates the Voigt and Reuss bounds for bulk and shear moduli based on mineralogical composition.

    This function computes the Voigt and Reuss bounds for bulk and shear moduli 
    for a given dataset of mineral volume fractions. It also calculates the average 
    moduli (Voigt-Reuss-Hill averages) and adds the results as new columns to the input DataFrame.

    Parameters:
    -----------
    df (pandas.DataFrame): 
        A DataFrame containing the volume fractions of minerals for each sample.
    minerals (list): 
        A list of column names representing the minerals in the DataFrame.
    bulk_modulus (dict): 
        A dictionary mapping each mineral to its bulk modulus (GPa).
    shear_modulus (dict): 
        A dictionary mapping each mineral to its shear modulus (GPa).

    Returns:
    --------
    pandas.DataFrame:
        The input DataFrame with the following additional columns:
        - `Voigt_Bulk`: Voigt bound for bulk modulus.
        - `Reuss_Bulk`: Reuss bound for bulk modulus.
        - `Voigt_Shear`: Voigt bound for shear modulus.
        - `Reuss_Shear`: Reuss bound for shear modulus.
        - `K_VRH`: Average bulk modulus (Voigt-Reuss-Hill average).
        - `G_VRH`: Average shear modulus (Voigt-Reuss-Hill average).

    Calculation Details:
    --------------------
    - Voigt Bound: Assumes a uniform strain distribution and is calculated as the weighted sum 
      of the mineral moduli (bulk or shear) using their volume fractions.
    - Reuss Bound: Assumes a uniform stress distribution and is calculated as the reciprocal of 
      the weighted sum of the reciprocal mineral moduli (bulk or shear) using their volume fractions.
    - Voigt-Reuss-Hill Average (VRH): The average of the Voigt and Reuss bounds.

    Example:
    --------
    Given the following input:
    - `df` with columns `Mineral1`, `Mineral2`, ... containing volume fractions.
    - `bulk_modulus = {'Mineral1': 40, 'Mineral2': 60}`.
    - `shear_modulus = {'Mineral1': 20, 'Mineral2': 30}`.

    The function computes the Voigt, Reuss, and VRH averages for each row in the DataFrame.

    Usage:
    ------
    df = calculate_voigt_reuss_bounds(df, minerals, bulk_modulus, shear_modulus)
    """
    dataset = df[minerals]
    voigt_bulk = []
    reuss_bulk = []
    voigt_shear = []
    reuss_shear = []
    
    for index, row in dataset.iterrows():
        sum_voigt_bulk = sum_voigt_shear = 0
        sum_reuss_bulk = sum_reuss_shear = 0
        for mineral in minerals:
            volume_fraction = row[mineral]
            sum_voigt_bulk += volume_fraction * bulk_modulus[mineral]
            sum_voigt_shear += volume_fraction * shear_modulus[mineral]
            # Avoid division by zero
            if bulk_modulus[mineral] > 0:
                sum_reuss_bulk += volume_fraction / bulk_modulus[mineral]
            if shear_modulus[mineral] > 0:
                sum_reuss_shear += volume_fraction / shear_modulus[mineral]
        voigt_bulk.append(sum_voigt_bulk)
        voigt_shear.append(sum_voigt_shear)
        reuss_bulk_value = 1 / sum_reuss_bulk if sum_reuss_bulk > 0 else 0
        reuss_shear_value = 1 / sum_reuss_shear if sum_reuss_shear > 0 else 0
        reuss_bulk.append(reuss_bulk_value)
        reuss_shear.append(reuss_shear_value)
        
        df.loc[index, 'Voigt_Bulk'] = sum_voigt_bulk
        df.loc[index, 'Reuss_Bulk'] = reuss_bulk_value
        df.loc[index, 'Voigt_Shear'] = sum_voigt_shear
        df.loc[index, 'Reuss_Shear'] = reuss_shear_value
    
    df['K_VRH'] = (df['Voigt_Bulk'] + df['Reuss_Bulk']) / 2
    df['G_VRH'] = (df['Voigt_Shear'] + df['Reuss_Shear']) / 2
    
    return df

In [24]:
# TEST CASE for V-R Boundaries Function
minerals = [
    'Clay', 'Quartz', 'K-feldspar', 'P-feldspar', 'Calcite',
    'Dolomite', 'Siderite', 'Pyrite', 'Magnetite', 'Hematite', 'Ilmenite', 'Air'
]
bulk_modulus = {
    'Clay': 1.5, 'Quartz': 37, 'K-feldspar': 37.5, 'P-feldspar': 75.6,
    'Calcite': 76.8, 'Dolomite': 94.9, 'Siderite': 123.7, 'Pyrite': 147.4,
    'Magnetite': 161.4, 'Hematite': 100.2, 'Ilmenite': 168, 'Air': 2.2
}
shear_modulus = {
    'Clay': 1.4, 'Quartz': 44, 'K-feldspar': 15, 'P-feldspar': 25.6,
    'Calcite': 32, 'Dolomite': 45, 'Siderite': 51, 'Pyrite': 132.5,
    'Magnetite': 91.4, 'Hematite': 95.2, 'Ilmenite': 64, 'Air': 0.1
}
data = {
    'Clay': [0.07], 'Quartz': [0.32], 'K-feldspar': [0.1188], 'P-feldspar': [0.2574],
    'Calcite': [0.0099], 'Dolomite': [0.0198], 'Siderite': [0.0594], 'Pyrite': [0.0297],
    'Magnetite': [0.0495], 'Hematite': [0.0297], 'Ilmenite': [0.0396], 'Air': [0.01]
}
test_df = pd.DataFrame(data)

test_df = calculate_voigt_reuss_bounds(test_df, minerals, bulk_modulus, shear_modulus)
test_df

Unnamed: 0,Clay,Quartz,K-feldspar,P-feldspar,Calcite,Dolomite,Siderite,Pyrite,Magnetite,Hematite,Ilmenite,Air,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH
0,0.07,0.32,0.1188,0.2574,0.0099,0.0198,0.0594,0.0297,0.0495,0.0297,0.0396,0.01,67.86438,14.643094,40.60903,5.591029,41.253737,23.10003


In [25]:
minerals_list = ['CALCITE','DOLOMITE','UWATER', 'CHLORITE', 'ILLITE', 'KAOLINITE','BOUND_WATER'
                    ,'K-FELDSPAR','QUARTZ']

bulk_modulus = {
    'KAOLINITE': 1.5,
    'CHLORITE':1.5, 
    'ILLITE':1.5,
    'QUARTZ': 37,
    'K-FELDSPAR': 37.5,
    'CALCITE': 76.8,
    'DOLOMITE': 94.9,
    #'Hematite': 206.6,
    'BOUND_WATER':2.2,
    'UWATER': 2.2
}

shear_modulus = {
    'KAOLINITE': 1.4,
    'CHLORITE':1.4, 
    'ILLITE':1.4,
    'QUARTZ': 44,
    'K-FELDSPAR': 15,
    'CALCITE': 32,
    'DOLOMITE': 45,
    #'Hematite': 91.0,
    'BOUND_WATER': 0.0001,
    'UWATER': 0.0001
}

df1 = calculate_voigt_reuss_bounds(df1, minerals_list, bulk_modulus, shear_modulus)
df2 = calculate_voigt_reuss_bounds(df2, minerals_list, bulk_modulus, shear_modulus)
df3 = calculate_voigt_reuss_bounds(df3, minerals_list, bulk_modulus, shear_modulus)
df4 = calculate_voigt_reuss_bounds(df4, minerals_list, bulk_modulus, shear_modulus)
df5 = calculate_voigt_reuss_bounds(df5, minerals_list, bulk_modulus, shear_modulus)

### Train Test Split

In [26]:
# Train : TR2, CCS1, VW1
# Test1 : CCS2
# Test2 : VW2

df = pd.concat([df1, df2, df3, df4, df5])
train_df = pd.concat([df1, df3, df4])
test_df = df2
test_df2 = df5

In [27]:
# Percent of Total Records Missing
missing_value_percent(df)

0.0


In [28]:
test_df.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,DTCO,DTSM_FAST,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,53.47908,103.45393,2.62311,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,54.30895,102.99964,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,54.61615,103.17477,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,55.44879,104.25208,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,57.11026,106.35674,2.58807,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809


In [29]:
train_df_dtco = train_df.drop(columns = ['DTSM_FAST'])
test_df_dtco = test_df.drop(columns = ['DTSM_FAST'])
test_df2_dtco = test_df2.drop(columns = ['DTSM_FAST'])

train_df_dtsm = train_df.drop(columns = ['DTCO'])
test_df_dtsm = test_df.drop(columns = ['DTCO'])
test_df2_dtsm = test_df2.drop(columns = ['DTCO'])

### Generating Target Variables: S-wave Velocity and P-wave Velocity

In [30]:
def convert_velocity(df, target, col):
    """
    Converts velocity from microseconds per foot (µs/ft) to meters per second (m/s) 
    and adds the converted values as a new column in the DataFrame.

    The conversion formula is:
    Velocity (m/s) = 1 / (Travel time (µs/ft) * Conversion factor)
    
    The conversion factor used to go from microseconds/foot to seconds/meter is 0.000001 / 0.3048.

    Parameters:
    -----------
    df (pandas.DataFrame): 
        The input DataFrame containing the column with velocity in µs/ft.
    target (str): 
        The name of the column in the DataFrame containing velocity in µs/ft.
    col (str): 
        The name of the new column to store the converted velocity in m/s.

    Returns:
    --------
    pandas.DataFrame: 
        The updated DataFrame with the new column for velocity in m/s.
        The original column (`target`) is removed.

    Raises:
    -------
    ValueError: 
        If the specified `target` column is not found in the DataFrame.

    Example:
    --------
    Input DataFrame:
    ----------------
    | Depth | Velocity_µs_ft |
    |-------|----------------|
    | 100   | 189            |
    | 200   | 180            |

    Call:
    -----
    df = convert_velocity(df, target='Velocity_µs_ft', col='Velocity_m_s')

    Output DataFrame:
    -----------------
    | Depth | Velocity_m_s |
    |-------|--------------|
    | 100   | 1757.77      |
    | 200   | 1822.22      |

    Usage:
    -------
    df = convert_velocity(df, target='Travel_Time', col='Velocity')
    """
    if target not in df.columns:
        raise ValueError(f"Column {target} not found in DataFrame.")
    
    conversion_factor = 0.000001 / 0.3048  # µs/ft to s/m conversion factor
    df[col] = 1 / (df[target] * conversion_factor)  # Calculate velocity in m/s
    df.drop(columns=[target], inplace=True)
    return df

df = convert_velocity(df, 'DTCO', 'VP')
train_df_vp = convert_velocity(train_df_dtco, "DTCO", 'VP')
test_df_vp = convert_velocity(test_df_dtco, "DTCO", 'VP')
test_df2_vp = convert_velocity(test_df2_dtco, "DTCO", 'VP')

df = convert_velocity(df, 'DTSM_FAST', 'VS')
train_df_vs = convert_velocity(train_df_dtsm, 'DTSM_FAST', 'VS')
test_df_vs = convert_velocity(test_df_dtsm, 'DTSM_FAST', 'VS')
test_df2_vs = convert_velocity(test_df2_dtsm, 'DTSM_FAST', 'VS')

In [31]:
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,2.62311,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,2.58807,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839


In [32]:
def calculate_vp(df):
    # (1 GPa = 1e9 Pa)
    KV = df['Voigt_Bulk'] * 1e9
    GV = df['Voigt_Shear'] * 1e9
    KR = df['Reuss_Bulk'] * 1e9
    GR = df['Reuss_Shear'] * 1e9
    
    # (1 g/cm^3 = 1000 kg/m^3)
    rho = df['RHOZ'] * 1000
    
    df['VOIGT_VP'] = np.sqrt((KV + 4.0/3.0 * GV) / rho)
    df['REUSS_VP'] = np.sqrt((KR + 4.0/3.0 * GR) / rho)
    return df

def calculate_vs(df):
    # (1 GPa = 1e9 Pa)
    
    GV = df['Voigt_Shear'] * 1e9
    GR = df['Reuss_Shear'] * 1e9
    
    # (1 g/cm^3 = 1000 kg/m^3)
    rho = df['RHOZ'] * 1000
    
    df['VOIGT_VS'] = np.sqrt(GV / rho)
    df['REUSS_VS'] = np.sqrt(GR / rho)
    return df

In [33]:
test_df_vp = calculate_vp(test_df_vp)
test_df2_vp = calculate_vp(test_df2_vp)
test_df_vp.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VP,VOIGT_VP,REUSS_VP
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,2.62311,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,5699.424897,6403.24458,2954.720357
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,5612.334615,6384.41977,3003.048539
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,5580.766861,6367.070757,3074.245872
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,5496.963955,6265.390794,3019.828207
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,2.58807,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,5337.044517,6033.921752,2963.007179


In [34]:
test_df_vs = calculate_vs(test_df_vs)
test_df2_vs = calculate_vs(test_df2_vs)
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS,VOIGT_VS,REUSS_VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,2.62311,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968,3512.132604,25.835856
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,2.61659,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644,3511.446274,25.742169
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,2.62357,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608,3512.65488,26.567101
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,2.61228,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674,3480.021737,27.872973
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,2.58807,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839,3430.14887,31.632862


In [35]:
train_df_vp

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VP
0,0.00139,0.92588,0.0,0.00000,0.00081,0.04413,0.01871,0.00906,0.00001,4361.25,2.85980,73.130039,36.863518,30.716078,0.071195,54.996778,15.393636,6263.510060
1,0.00291,0.91290,0.0,0.00000,0.01286,0.04058,0.01871,0.01203,0.00001,4361.75,2.85420,72.131359,28.036393,30.395018,0.034183,50.083876,15.214601,6481.686259
2,0.00468,0.89792,0.0,0.00000,0.02690,0.03663,0.01871,0.01373,0.00144,4362.25,2.83240,70.923770,21.631896,29.950865,0.016323,46.277833,14.983594,6563.708892
3,0.00407,0.88793,0.0,0.00000,0.02208,0.03811,0.01871,0.01604,0.01306,4362.75,2.79990,70.314500,20.837212,29.748278,0.005836,45.575856,14.877057,6334.250151
4,0.00134,0.88843,0.0,0.00000,0.00042,0.04504,0.01871,0.01929,0.02676,4363.25,2.76980,70.724669,25.762189,29.980905,0.003558,48.243429,14.992231,6141.289554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9704,0.01027,0.72182,0.0,0.15156,0.08489,0.02756,0.00000,0.00000,0.00124,5202.00,2.78394,71.004977,13.595115,30.450687,0.008681,42.300046,15.229684,6308.483607
9705,0.01418,0.66856,0.0,0.17258,0.11716,0.01852,0.00000,0.00000,0.00124,5202.50,2.76674,68.627414,10.402214,29.601846,0.006480,39.514814,14.804163,6442.669751
9706,0.01688,0.64554,0.0,0.16899,0.13946,0.01769,0.00000,0.00000,0.00124,5203.00,2.76142,66.527052,8.939138,28.722426,0.005515,37.733095,14.363970,6389.007553
9707,0.01823,0.67021,0.0,0.12367,0.15066,0.02293,0.00000,0.00000,0.00124,5203.50,2.75383,64.337110,8.338105,27.566746,0.005133,36.337608,13.785939,6250.624191


### Calculating VRH Velocities 

In [36]:
def calculate_vpvrh(df):
    # (1 GPa = 1e9 Pa)
    KVRH = df['K_VRH'] * 1e9
    GVRH = df['G_VRH'] * 1e9
    
    # (1 g/cm^3 = 1000 kg/m^3)
    rho = df['RHOZ'] * 1000
    
    df['VRH_VP'] = np.sqrt((KVRH + 4.0/3.0 * GVRH) / rho)
    return df

def calculate_vsvrh(df):
    # (1 GPa = 1e9 Pa)
    GVRH = df['G_VRH'] * 1e9
    
    # (1 g/cm^3 = 1000 kg/m^3)
    rho = df['RHOZ'] * 1000
    
    df['VRH_VS'] = np.sqrt(GVRH / rho)
    return df

In [37]:
df = calculate_vpvrh(df)
train_df_vp = calculate_vpvrh(train_df_vp)
test_df_vp = calculate_vpvrh(test_df_vp)
test_df2_vp = calculate_vpvrh(test_df2_vp)
test_df_vp.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VP,VOIGT_VP,REUSS_VP,VRH_VP
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,5699.424897,6403.24458,2954.720357,4986.577661
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,5612.334615,6384.41977,3003.048539,4988.943592
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,5580.766861,6367.070757,3074.245872,4999.528863
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,5496.963955,6265.390794,3019.828207,4918.052674
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,5337.044517,6033.921752,2963.007179,4753.294818


In [38]:
test_df_vs = calculate_vsvrh(test_df_vs)
test_df2_vs = calculate_vsvrh(test_df2_vs)
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS,VOIGT_VS,REUSS_VS,VRH_VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968,3512.132604,25.835856,2483.519974
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644,3511.446274,25.742169,2483.034192
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608,3512.65488,26.567101,2483.893125
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674,3480.021737,27.872973,2460.825897
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839,3430.14887,31.632862,2425.584662


In [39]:
df.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VP,VS,VRH_VP
0,0.00139,0.92588,0.0,0.0,0.00081,0.04413,0.01871,0.00906,1e-05,4361.25,2.8598,73.130039,36.863518,30.716078,0.071195,54.996778,15.393636,6263.51006,3595.064147,5138.872438
1,0.00291,0.9129,0.0,0.0,0.01286,0.04058,0.01871,0.01203,1e-05,4361.75,2.8542,72.131359,28.036393,30.395018,0.034183,50.083876,15.214601,6481.686259,3574.159111,4965.36989
2,0.00468,0.89792,0.0,0.0,0.0269,0.03663,0.01871,0.01373,0.00144,4362.25,2.8324,70.92377,21.631896,29.950865,0.016323,46.277833,14.983594,6563.708892,3548.413197,4836.544372
3,0.00407,0.88793,0.0,0.0,0.02208,0.03811,0.01871,0.01604,0.01306,4362.75,2.7999,70.3145,20.837212,29.748278,0.005836,45.575856,14.877057,6334.250151,3545.193129,4833.449955
4,0.00134,0.88843,0.0,0.0,0.00042,0.04504,0.01871,0.01929,0.02676,4363.25,2.7698,70.724669,25.762189,29.980905,0.003558,48.243429,14.992231,6141.289554,3560.947343,4963.331164


In [40]:
train_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,RHOZ,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS
0,0.00139,0.92588,0.0,0.0,0.00081,0.04413,0.01871,0.00906,1e-05,4361.25,2.8598,73.130039,36.863518,30.716078,0.071195,54.996778,15.393636,3595.064147
1,0.00291,0.9129,0.0,0.0,0.01286,0.04058,0.01871,0.01203,1e-05,4361.75,2.8542,72.131359,28.036393,30.395018,0.034183,50.083876,15.214601,3574.159111
2,0.00468,0.89792,0.0,0.0,0.0269,0.03663,0.01871,0.01373,0.00144,4362.25,2.8324,70.92377,21.631896,29.950865,0.016323,46.277833,14.983594,3548.413197
3,0.00407,0.88793,0.0,0.0,0.02208,0.03811,0.01871,0.01604,0.01306,4362.75,2.7999,70.3145,20.837212,29.748278,0.005836,45.575856,14.877057,3545.193129
4,0.00134,0.88843,0.0,0.0,0.00042,0.04504,0.01871,0.01929,0.02676,4363.25,2.7698,70.724669,25.762189,29.980905,0.003558,48.243429,14.992231,3560.947343


In [41]:
test_df_vs.head()

Unnamed: 0,BOUND_WATER,CALCITE,CHLORITE,DOLOMITE,ILLITE,K-FELDSPAR,KAOLINITE,QUARTZ,UWATER,DEPT,...,Voigt_Bulk,Reuss_Bulk,Voigt_Shear,Reuss_Shear,K_VRH,G_VRH,VS,VOIGT_VS,REUSS_VS,VRH_VS
0,0.00026,0.081,0.0,0.47844,0.0,0.2304,0.00381,0.10848,0.05685,5101.0,...,64.409873,22.898393,32.35626,0.001751,43.654133,16.179005,2946.238968,3512.132604,25.835856,2483.519974
1,8e-05,0.069,0.0,0.47355,0.0,0.23773,0.00114,0.11768,0.05759,5101.5,...,63.636714,23.594883,32.263222,0.001734,43.615799,16.132478,2959.233644,3511.446274,25.742169,2483.034192
2,3e-05,0.04209,0.0,0.48604,0.0,0.24202,0.00041,0.1255,0.05397,5102.0,...,63.196373,24.792859,32.371559,0.001852,43.994616,16.186706,2954.210608,3512.65488,26.567101,2483.893125
3,0.00036,0.05095,0.0,0.43499,0.0,0.25269,0.00529,0.15076,0.04891,5102.5,...,60.363835,23.819622,31.636151,0.002029,42.091729,15.81909,2923.682674,3480.021737,27.872973,2460.825897
4,0.00088,0.08433,0.0,0.29721,0.0,0.2718,0.01301,0.2337,0.03773,5103.0,...,53.62563,22.718279,30.451028,0.00259,38.171954,15.226809,2865.826839,3430.14887,31.632862,2425.584662


## Exploratory Data Analysis

In [None]:

report_df = sv.analyze(df)
report_df.show_html('report_df.html')

report_train = sv.analyze(train_df_vs)
report_train.show_html('report_train_df_vs.html')

report_test = sv.analyze(test_df_vs)
report_test.show_html('report_test_df_vs.html')

report_test2 = sv.analyze(test_df2_vs)
report_test2.show_html('report_test_df2_vs.html')

compare_report = sv.compare([train_df_vs, "Training Data"], [test_df_vs, "Test Data"])
compare_report.show_html('compare_report.html')

In [42]:
test_df_vs.shape

(4217, 21)

In [43]:
test_df2_vs.shape

(3740, 21)

In [44]:
train_df_vp.shape

(17798, 19)

In [45]:
df.shape

(25755, 20)

### Saving the files to local 

In [46]:
df.to_csv("Datasets/Training and Testing data/df.csv", index=False)

train_df_vp.to_csv("Datasets/Training and Testing data/train_df_vp.csv", index=False)
test_df_vp.to_csv("Datasets/Training and Testing data/test_df_vp.csv", index=False)
test_df2_vp.to_csv("Datasets/Training and Testing data/test_df2_vp.csv", index=False)

train_df_vs.to_csv("Datasets/Training and Testing data/train_df_vs.csv", index=False)
test_df_vs.to_csv("Datasets/Training and Testing data/test_df_vs.csv", index=False)
test_df2_vs.to_csv("Datasets/Training and Testing data/test_df2_vs.csv", index=False)