In [None]:
import pandas as pd

# Load the dataset, assuming the first two rows are header and units
file_path = '/mnt/data/well_1.csv'
data = pd.read_csv(file_path, header=[0, 1])

# Display the first few rows of the dataset to understand its structure
data.head()

# Extracting headers and units
headers = data.columns.get_level_values(0)
units = data.columns.get_level_values(1)

# Creating a data dictionary
data_dictionary = {header: unit for header, unit in zip(headers, units)}

# Modifying the DataFrame to have a single header row
# Concatenating mnemonic and unit to create a unique identifier for each column
new_columns = [f"{header} ({unit})" for header, unit in zip(headers, units)]
data.columns = new_columns

# Displaying the data dictionary and the modified DataFrame
data_dictionary, data.head()

# Example mapping based on provided information and common industry standards
mnemonic_mapping = {
    "DEPMD": "MD",  # Measure Depth
    "ROPA_AVG": "ROP",  # Average Rate of Penetration
    # Add more mappings as needed
}

# Function to apply the mapping to the DataFrame's columns


def standardize_mnemonics(df, mapping):
    new_columns = []
    for col in df.columns:
        mnemonic, unit = col.split(' (')
        # Apply the mapping if the mnemonic is in our mapping dictionary
        standardized_mnemonic = mapping.get(mnemonic, mnemonic)
        new_columns.append(f"{standardized_mnemonic} ({unit}")
    df.columns = new_columns
    return df


# Applying the mapping to the DataFrame
standardized_data = standardize_mnemonics(data.copy(), mnemonic_mapping)
standardized_data.head()


# Expanded mnemonic mapping based on common drilling measurements
expanded_mnemonic_mapping = {
    "DEPMD": "MD", "DEPTH": "MD", "MD_TVD": "MD",  # Measured Depth
    "ROPA_AVG": "ROP", "ROP5MIN": "ROP", "ROP_INST": "ROP",  # Rate of Penetration
    "WOB_AVG": "WOB", "WOB_INST": "WOB",  # Weight on Bit
    "SURFRPM_AVG": "RPM", "BITRPM_AVG": "RPM", "ROT_SPEED": "RPM",  # Rotations Per Minute
    "SPP_AVG": "SPP", "SPPA": "SPP", "STANDPIPE_PRESS": "SPP",  # Standpipe Pressure
    "TORQ_AVG": "TORQUE", "TORQUE_INST": "TORQUE", "DRILL_TORQUE": "TORQUE",  # Torque
    "FLOWIN_AVG": "FLOW", "MUDFLOW": "FLOW", "FLOW_RATE": "FLOW",  # Mud Flow Rate
    "TIN_AVG": "TEMP", "TOUT_AVG": "TEMP", "TEMPERATURE": "TEMP",  # Temperature
    "CHKP_AVG": "PRESSURE", "CEMENTP_AVG": "PRESSURE", "PRESSURE_READING": "PRESSURE",  # Pressure
    "MSE_CALC": "MSE", "SPEC_ENERGY": "MSE",  # Mechanical Specific Energy
    # Existing mappings
    "DEPMD": "MD", "ROPA_AVG": "ROP",
    # Add more mappings as needed
}

# Applying the expanded mapping to the DataFrame
standardized_data_expanded = standardize_mnemonics(
    data.copy(), expanded_mnemonic_mapping)
standardized_data_expanded.head()


# Further expanding the mnemonic mapping with a comprehensive set of common drilling measurements
expanded_mnemonic_mapping.update({
    # Additional mnemonics based on common drilling measurements
    "TOT_DEPTH": "MD", "MEAS_DEPTH": "MD",
    "ROP_RATE": "ROP", "PENETRATE_RATE": "ROP",
    "WT_ON_BIT": "WOB", "BIT_WEIGHT": "WOB",
    "ROTATE_RPM": "RPM", "DRILL_RPM": "RPM",
    "S_PIPE_PRES": "SPP", "SP_PRESSURE": "SPP",
    "ROT_TORQUE": "TORQUE", "TORQ_ON_BIT": "TORQUE",
    "MUD_RATE": "FLOW", "IN_FLOW": "FLOW",
    "MUD_TEMP": "TEMP", "INLET_TEMP": "TEMP",
    "MUD_PRESS": "PRESSURE", "ANNULAR_PRESS": "PRESSURE",
    "MSE_EST": "MSE", "ENERGY_INDEX": "MSE", "DRILL_EFFICIENCY": "MSE",
    "TVD": "TV Depth", "VERT_DEPTH": "TV Depth", "TRUE_VD": "TV Depth", "TV_DEPTH": "TV Depth", "VERTICAL_DEPTH": "TV Depth",
    "GR": "Gamma Ray", "GAMMA": "Gamma Ray", "GAMMA_RAY": "Gamma Ray", "GR_LOG": "Gamma Ray", "GAMMA_LOG": "Gamma Ray",
    "INCL": "Inclination", "WELL_INCL": "Inclination", "BORE_INCL": "Inclination", "INCL_ANGLE": "Inclination",
    "AZIM": "Azimuth", "WELL_AZIM": "Azimuth", "BORE_AZIM": "Azimuth", "AZIM_ANGLE": "Azimuth",
    "PUMP_PRES": "Pump Pressure", "MUD_PUMP_PRESS": "Pump Pressure", "PUMP_PRESSURE": "Pump Pressure", "MP_PRESSURE": "Pump Pressure", "PUMP_PSI": "Pump Pressure",
    "VISC": "Viscosity", "MUD_VISC": "Viscosity", "FLUID_VISC": "Viscosity", "MUD_THICKNESS": "Viscosity",
    # Any other specific mnemonics can be added here
})

# Reapplying the expanded mapping to the DataFrame
standardized_data_further_expanded = standardize_mnemonics(
    data.copy(), expanded_mnemonic_mapping)
standardized_data_further_expanded.head()

In [1]:
import pandas as pd

# Load the dataset, assuming the first two rows are header and units
file_path = 'c:/development/MSE_analysis/data_to_work/well_1.csv'
data = pd.read_csv(file_path, header=[0, 1])

# Display the first few rows of the dataset to understand its structure
data.head()

# Extracting headers and units
headers = data.columns.get_level_values(0)
units = data.columns.get_level_values(1)

# Creating a data dictionary
data_dictionary = {header: unit for header, unit in zip(headers, units)}

# Modifying the DataFrame to have a single header row
# Concatenating mnemonic and unit to create a unique identifier for each column
new_columns = [f"{header} ({unit})" for header, unit in zip(headers, units)]
data.columns = new_columns

# Displaying the data dictionary and the modified DataFrame
data_dictionary, data.head()


({'DEPMD': 'ft',
  'WOB_AVG': 'tf ',
  'DEPTVD': 'm',
  'ROPA_AVG': 'm/h',
  'TORQ_AVG': 'kLbf.ft',
  'SURFRPM_AVG': 'rpm',
  'MOTORRPM_AVG': 'rpm',
  'BITRPM_AVG': 'rpm',
  'SPP_AVG': 'Psi',
  'CHKP_AVG': 'Psi',
  'CEMENTP_AVG': 'Psi',
  'SPM01_AVG': 'Stk/min',
  'SPM02_AVG': 'Stk/min',
  'SPM03_AVG': 'Stk/min',
  'PITACTIVE_AVG': 'bbl',
  'FLOWIN_AVG': 'gal/min',
  'FLOWOUTP_AVG': '%',
  'DIN_AVG': 'ppg',
  'TIN_AVG': 'C',
  'DOUT_AVG': 'ppg',
  'TOUT_AVG': 'C',
  'HP': 'hp',
  'DOC': 'in/rev',
  'MSE': 'ksi',
  'Comments': 'TOC/TOP'},
   DEPMD (ft) WOB_AVG (tf )  DEPTVD (m)  ROPA_AVG (m/h)  TORQ_AVG (kLbf.ft)  \
 0       2667         12.31     2533.49          141.86                4.87   
 1       2668          2.51     2534.33           90.56                4.78   
 2       2669          3.21     2535.19           80.75                4.57   
 3       2670           3.1     2536.04           89.63                4.61   
 4       2671          3.68     2536.87           54.78      

In [7]:
import re
import pandas as pd


def map_mnemonic(mnemonic, unit):
    """
    Map the given mnemonic to a standardized form using regex patterns.
    Case-insensitive matching is used to capture various cases.
    Units are also considered in the mapping process.
    """
    patterns = {
        "MD": r".*DEPMD.*|.*DEPTH.*|.*MD.*",
        "TVD": r".*TVD.*",
        "ROP": r".*ROP.*|.*PENETRATE.*",
        "WOB": r".*WOB.*|.*BITLOAD.*",
        "RPM": r".*RPM.*|.*ROTATE.*|.*BITRPM.*",
        "SPP": r".*SPP.*|.*STANDPIPE.*",
        "TORQUE": r".*TORQ.*|.*TWIST.*",
        "FLOW": r".*FLOW.*|.*FLUIDRATE.*",
        "TEMP": r".*TEMP.*|.*HEAT.*",
        "PRESSURE": r".*PRESS.*|.*PSI.*",
        "VOLUME": r".*VOL.*|.*BARREL.*" if "bbl" in unit or "L" in unit else None,
        # Expanded patterns for more specific cases
        "TORQUE_AVG": r".*TORQ_AVG.*|.*AVG_TORQUE.*",
        "TORQUE_MAX": r".*TORQ_MAX.*|.*MAX_TORQUE.*",
        "TORQUE_MIN": r".*TORQ_MIN.*|.*MIN_TORQUE.*",
        # Add more patterns for other specific cases as needed
    }

    for std_mnemonic, pattern in patterns.items():
        if pattern and re.match(pattern, mnemonic, re.IGNORECASE):
            return f"{std_mnemonic} ({unit})"

    return analyze_context(mnemonic, unit)


def analyze_context(mnemonic, unit):
    """
    Analyze the context of the mnemonic for accurate mapping.
    This function is used for mnemonics that require context-based interpretation.
    """
    # Contextual analysis logic goes here
    # Example:
    if "TEMP" in mnemonic:
        return "TEMPERATURE (C)" if "c" in unit.lower() else "TEMPERATURE (F)"

    # Mark as unidentified if no contextual rule applies
    return f"{mnemonic}__unidentified ({unit})"  # Changed here


def standardize_mnemonics(df):
    """
    Apply mnemonic standardization to all column headers in the DataFrame.
    """
    new_columns = [map_mnemonic(col.split(' (')[0], col.split(' (')[1].rstrip(')')) for col in df.columns]
    df.columns = new_columns
    return df

# Apply the standardization
standardized_data = standardize_mnemonics(data)

standardized_data.head()  # To view the first few rows of the standardized dataset

Unnamed: 0,MD (ft),WOB (tf ),TVD (m),ROP (m/h),TORQUE (kLbf.ft),RPM (rpm),RPM (rpm).1,RPM (rpm).2,SPP (Psi),CHKP_AVG__unidentified__unidentified (Psi),...,FLOW (gal/min),FLOW (%),DIN_AVG__unidentified__unidentified (ppg),TIN_AVG__unidentified__unidentified (C),DOUT_AVG__unidentified__unidentified (ppg),TOUT_AVG__unidentified__unidentified (C),HP__unidentified__unidentified (hp),DOC__unidentified__unidentified (in/rev),MSE__unidentified__unidentified (ksi),Comments__unidentified__unidentified (TOC/TOP)
0,2667,12.31,2533.49,141.86,4.87,48.0,0.0,48.0,689.0,0.0,...,510.0,24.11,11.69,37.954,11.69,34.7,44.508759,1.939249781,3.822969471,TOC
1,2668,2.51,2534.33,90.56,4.78,48.0,0.0,48.0,685.0,0.0,...,511.0,23.78,11.69,37.963,11.69,34.7,43.686215,1.237970254,5.229530215,TOC
2,2669,3.21,2535.19,80.75,4.57,48.0,0.0,48.0,689.0,0.0,...,510.0,24.61,11.69,37.948,11.69,34.8,41.766946,1.103865923,5.627672211,TOC
3,2670,3.1,2536.04,89.63,4.61,48.0,0.0,48.0,685.0,0.0,...,511.0,23.37,11.69,37.923,11.69,34.7,42.132521,1.225256999,5.121710058,TOC
4,2671,3.68,2536.87,54.78,4.54,48.0,0.0,48.0,676.0,0.0,...,514.0,24.95,11.69,38.025,11.69,34.7,41.492765,0.748851706,8.200857921,TOC
