In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [34]:
class WellDataGenerator:
    """
    Synthetic well-performance data generator.

    Each ``generate_*`` / ``calculate_*`` method returns a dict of equal-length
    columns; ``generate_complete_dataset`` chains them and assembles one
    ``pandas.DataFrame``.

    Randomness comes from a per-instance ``numpy.random.RandomState`` seeded
    with ``random_seed``. For a given seed this yields the same legacy
    Mersenne-Twister stream that ``np.random.seed(seed)`` followed by global
    ``np.random.*`` calls would, but it neither reads nor modifies numpy's
    global random state, so unrelated ``np.random`` usage (or a second
    generator instance) cannot change the data produced here.

    Successive calls on the same instance keep consuming the same stream, so a
    second ``generate_complete_dataset`` call yields different data than the
    first; create a new instance to restart the sequence.
    """

    def __init__(self, random_seed=42):
        """
        Initialize the data generator with a reproducible random seed.

        Parameters:
        -----------
        random_seed : int or None
            Seed for the instance's private random stream. ``None`` lets numpy
            pick a non-reproducible seed.
        """
        self.random_seed = random_seed
        # Private stream instead of np.random.seed(): same numbers for a given
        # seed, without mutating process-wide state.
        self._rng = np.random.RandomState(random_seed)

    @staticmethod
    def _factor_by_category(labels, factors, default=1.0):
        """Return a float array holding ``factors[label]`` per element, ``default`` for unlisted labels."""
        labels = np.asarray(labels)
        result = np.full(labels.shape, default, dtype=float)
        for label, factor in factors.items():
            result[labels == label] = factor
        return result

    @staticmethod
    def _scale_by_max(values):
        """Return ``values / values.max()``, or zeros when the maximum is not a positive finite number."""
        values = np.asarray(values, dtype=float)
        peak = values.max() if values.size else 0.0
        if not np.isfinite(peak) or peak <= 0:
            # Dividing by a zero/negative maximum would produce inf/NaN or
            # flip the ordering, so the component contributes nothing instead.
            return np.zeros_like(values)
        return values / peak

    def generate_reservoir_properties(self, n_wells):
        """
        Generate reservoir properties from simple statistical distributions.

        Parameters:
        -----------
        n_wells : int
            Number of wells to generate

        Returns:
        --------
        dict : keys ``permeability_md``, ``porosity_fraction``,
            ``net_to_gross`` and ``thickness_ft``, each an array of length
            ``n_wells``
        """
        print("Generating reservoir properties...")
        rng = self._rng

        # Permeability (mD) - lognormal; median = exp(2) ≈ 7.4 mD
        permeability = rng.lognormal(mean=2, sigma=1, size=n_wells)

        # Porosity (fraction) - normal, clipped to 5-35%
        porosity = np.clip(rng.normal(0.15, 0.05, n_wells), 0.05, 0.35)

        # Net-to-gross ratio - beta(8, 3) is skewed toward higher values
        net_to_gross = np.clip(rng.beta(8, 3, n_wells), 0.3, 1.0)

        # Reservoir thickness (ft) - gamma, clipped to 10-200 ft
        thickness = np.clip(rng.gamma(2, 25, n_wells), 10, 200)

        return {
            'permeability_md': permeability,
            'porosity_fraction': porosity,
            'net_to_gross': net_to_gross,
            'thickness_ft': thickness
        }

    def generate_well_design(self, n_wells):
        """
        Generate well design parameters.

        Parameters:
        -----------
        n_wells : int
            Number of wells to generate

        Returns:
        --------
        dict : keys ``well_depth_ft``, ``tubing_diameter_in``,
            ``choke_size_64th``, ``well_type``, ``completion_type`` and
            ``artificial_lift``, each an array of length ``n_wells``
        """
        print("Generating well design parameters...")
        rng = self._rng

        # Well depth (ft) - normal around 8000 ft, clipped to 4000-15000 ft
        well_depth = np.clip(rng.normal(8000, 1500, n_wells), 4000, 15000)

        # Tubing diameter (inches) - discrete sizes; 3.5" carries the largest weight
        tubing_sizes = [2.375, 2.875, 3.5, 4.5, 5.5]
        tubing_weights = [0.1, 0.3, 0.35, 0.2, 0.05]
        tubing_diameter = rng.choice(tubing_sizes, n_wells, p=tubing_weights)

        # Choke size (64ths of an inch) - production control
        choke_size = rng.uniform(8, 64, n_wells)

        # Well type
        well_types = ['Vertical', 'Horizontal', 'Deviated']
        well_type_weights = [0.4, 0.5, 0.1]
        well_type = rng.choice(well_types, n_wells, p=well_type_weights)

        # Completion type (affects productivity in calculate_production_rates)
        completion_types = ['Perforated', 'Open Hole', 'Slotted Liner']
        completion_weights = [0.7, 0.2, 0.1]
        completion_type = rng.choice(completion_types, n_wells, p=completion_weights)

        # Artificial lift type
        lift_types = ['Natural Flow', 'ESP', 'Rod Pump', 'Gas Lift']
        lift_weights = [0.3, 0.3, 0.25, 0.15]
        artificial_lift = rng.choice(lift_types, n_wells, p=lift_weights)

        return {
            'well_depth_ft': well_depth,
            'tubing_diameter_in': tubing_diameter,
            'choke_size_64th': choke_size,
            'well_type': well_type,
            'completion_type': completion_type,
            'artificial_lift': artificial_lift
        }

    def generate_pressure_temperature(self, well_depth):
        """
        Generate pressures and temperature from depth and randomised gradients.

        Parameters:
        -----------
        well_depth : array
            Well depths in feet

        Returns:
        --------
        dict : keys ``reservoir_pressure_psi``, ``reservoir_temp_f``,
            ``bottomhole_pressure_psi`` and ``wellhead_pressure_psi``, each an
            array with one value per well
        """
        print("Generating pressure and temperature profiles...")
        rng = self._rng
        n_wells = len(well_depth)

        # Reservoir pressure: depth * gradient (~0.45 psi/ft) plus a random offset
        pressure_gradient = rng.normal(0.45, 0.02, n_wells)
        reservoir_pressure = well_depth * pressure_gradient + rng.normal(500, 100, n_wells)
        reservoir_pressure = np.clip(reservoir_pressure, 1500, 8000)

        # Temperature: surface temperature plus geothermal gradient (°F per 100 ft)
        surface_temp = rng.normal(70, 10, n_wells)  # °F
        geothermal_gradient = rng.normal(1.5, 0.2, n_wells)
        reservoir_temp = surface_temp + (well_depth / 100) * geothermal_gradient

        # Flowing bottomhole pressure: reservoir pressure minus a random
        # drawdown, capped at 90% of reservoir pressure
        bottomhole_pressure = reservoir_pressure - rng.uniform(200, 1000, n_wells)
        bottomhole_pressure = np.clip(bottomhole_pressure, 200, reservoir_pressure * 0.9)

        # Wellhead pressure: bottomhole pressure minus a random loss, capped at
        # 80% of bottomhole pressure
        wellhead_pressure = bottomhole_pressure - rng.uniform(100, 600, n_wells)
        wellhead_pressure = np.clip(wellhead_pressure, 50, bottomhole_pressure * 0.8)

        return {
            'reservoir_pressure_psi': reservoir_pressure,
            'reservoir_temp_f': reservoir_temp,
            'bottomhole_pressure_psi': bottomhole_pressure,
            'wellhead_pressure_psi': wellhead_pressure
        }

    def generate_fluid_properties(self, n_wells, reservoir_temp):
        """
        Generate fluid properties, partly driven by reservoir temperature.

        Parameters:
        -----------
        n_wells : int
            Number of wells
        reservoir_temp : array
            Reservoir temperatures (°F), one per well

        Returns:
        --------
        dict : keys ``oil_gravity_api``, ``gas_oil_ratio_scf_bbl``,
            ``water_cut_fraction``, ``fvf_oil`` and ``oil_viscosity_cp``
        """
        print("Generating fluid properties...")
        rng = self._rng

        # Oil gravity (API), clipped to 15-50
        oil_gravity = np.clip(rng.normal(35, 8, n_wells), 15, 50)

        # Gas-oil ratio (scf/bbl): lognormal base scaled up for hotter
        # reservoirs (+30% per 100 °F above 150 °F), clipped to 50-5000
        base_gor = rng.lognormal(5, 0.8, n_wells)
        temp_factor = (reservoir_temp - 150) / 100
        gor = np.clip(base_gor * (1 + temp_factor * 0.3), 50, 5000)

        # Water cut (fraction): beta(2, 8) scaled so it never exceeds 0.9
        water_cut = rng.beta(2, 8, n_wells) * 0.9

        # Oil formation volume factor - simplified linear function of GOR
        fvf_oil = 1.1 + (gor / 10000) * 0.3

        # Oil viscosity (cp) - function of API gravity and temperature,
        # clipped to 0.5-100 cp
        oil_viscosity = 10 ** (3.0324 - 0.02023 * oil_gravity) * (reservoir_temp / 100) ** (-1.163)
        oil_viscosity = np.clip(oil_viscosity, 0.5, 100)

        return {
            'oil_gravity_api': oil_gravity,
            'gas_oil_ratio_scf_bbl': gor,
            'water_cut_fraction': water_cut,
            'fvf_oil': fvf_oil,
            'oil_viscosity_cp': oil_viscosity
        }

    def calculate_production_rates(self, reservoir_props, well_design, pressures, fluids):
        """
        Calculate oil, gas and water rates from a simplified productivity index.

        Parameters:
        -----------
        reservoir_props : dict
            Output of ``generate_reservoir_properties``
        well_design : dict
            Output of ``generate_well_design``
        pressures : dict
            Output of ``generate_pressure_temperature``
        fluids : dict
            Output of ``generate_fluid_properties``

        Returns:
        --------
        dict : keys ``oil_rate_bbl_day``, ``gas_rate_scf_day``,
            ``water_rate_bbl_day`` and ``productivity_index``
        """
        print("Calculating production rates using IPR correlations...")

        n_wells = len(reservoir_props['permeability_md'])

        # Productivity index (bbl/day/psi), Darcy-based:
        #   J = (0.00708 * k * h) / (mu * B * (ln(re/rw) + S))
        # simplified here with a 0.007 constant and ln(re/rw) + S ≈ 7.
        k = reservoir_props['permeability_md']
        h = reservoir_props['thickness_ft'] * reservoir_props['net_to_gross']  # net pay
        mu = fluids['oil_viscosity_cp']
        B = fluids['fvf_oil']
        productivity_index = (0.007 * k * h) / (mu * B * 7)

        # Geometry multiplier: vertical wells are the 1.0 baseline
        well_type_multiplier = self._factor_by_category(
            well_design['well_type'], {'Horizontal': 3.0, 'Deviated': 1.5})

        # Completion efficiency: open hole is the 1.0 baseline
        completion_efficiency = self._factor_by_category(
            well_design['completion_type'],
            {'Perforated': 0.8, 'Open Hole': 1.0, 'Slotted Liner': 0.9})

        pressure_drawdown = pressures['reservoir_pressure_psi'] - pressures['bottomhole_pressure_psi']
        water_cut = fluids['water_cut_fraction']

        # Base oil rate: J * drawdown, scaled by geometry, completion and oil cut
        oil_rate = (productivity_index * pressure_drawdown * well_type_multiplier *
                    completion_efficiency * (1 - water_cut))

        # Choke effect: proportional to choke size, capped at 1.5x
        oil_rate = oil_rate * np.minimum(well_design['choke_size_64th'] / 32, 1.5)

        # Artificial lift uplift: natural flow is the 1.0 baseline
        oil_rate = oil_rate * self._factor_by_category(
            well_design['artificial_lift'],
            {'ESP': 1.3, 'Rod Pump': 1.1, 'Gas Lift': 1.2})

        # ±20% multiplicative noise, then clip to 5-2000 bbl/day
        oil_rate = oil_rate * self._rng.uniform(0.8, 1.2, n_wells)
        oil_rate = np.clip(oil_rate, 5, 2000)

        # Gas rate (scf/day) from GOR
        gas_rate = oil_rate * fluids['gas_oil_ratio_scf_bbl']

        # Water rate from water cut; the epsilon guards against a water cut of
        # exactly 1, and the result is capped at 10x the oil rate
        water_rate = oil_rate * water_cut / (1 - water_cut + 1e-6)
        water_rate = np.clip(water_rate, 0, oil_rate * 10)

        return {
            'oil_rate_bbl_day': oil_rate,
            'gas_rate_scf_day': gas_rate,
            'water_rate_bbl_day': water_rate,
            'productivity_index': productivity_index
        }

    def generate_economic_data(self, n_wells, production_rates, well_design):
        """
        Generate economic parameters: prices, operating cost and capital cost.

        Parameters:
        -----------
        n_wells : int
            Number of wells
        production_rates : dict
            Output of ``calculate_production_rates``
        well_design : dict
            Output of ``generate_well_design``

        Returns:
        --------
        dict : keys ``oil_price_usd_bbl``, ``gas_price_usd_mcf``,
            ``daily_opex_usd``, ``drilling_cost_usd``, ``completion_cost_usd``,
            ``total_capex_usd`` and ``daily_revenue_usd``.

            Note: ``daily_revenue_usd`` is oil + gas revenue MINUS
            ``daily_opex_usd`` (a net figure), despite its name.
        """
        print("Generate economic parameters....")
        rng = self._rng

        # Commodity prices with per-well variation
        oil_price_base = 75  # $/bbl
        oil_price = np.clip(oil_price_base + rng.normal(0, 10, n_wells), 45, 120)

        gas_price_base = 3.5  # $/Mcf
        gas_price = np.clip(gas_price_base + rng.normal(0, 0.8, n_wells), 1.5, 8)

        # Operating cost (opex), $/day: fixed base + depth + lift + variable
        base_opex = np.full(n_wells, 200)
        depth_factor = well_design['well_depth_ft'] * 0.01

        # Artificial lift surcharge; natural flow costs nothing extra
        lift_cost = self._factor_by_category(
            well_design['artificial_lift'],
            {'ESP': 100, 'Rod Pump': 50, 'Gas Lift': 75},
            default=0.0)

        # Variable cost per barrel of oil and of water produced
        variable_cost_oil = production_rates['oil_rate_bbl_day'] * 2.5
        variable_cost_water = production_rates['water_rate_bbl_day'] * 1.0

        daily_opex = base_opex + depth_factor + lift_cost + variable_cost_oil + variable_cost_water

        # Capital cost (one-time): drilling priced per foot, plus completion
        drilling_cost_ft = rng.normal(150, 30, n_wells)
        drilling_cost = well_design['well_depth_ft'] * drilling_cost_ft

        completion_cost = np.clip(rng.normal(500000, 100000, n_wells), 200000, 1500000)

        total_capex = drilling_cost + completion_cost

        # Revenue; gas rate is in scf/day and the price is per Mcf, hence / 1000
        oil_revenue = production_rates['oil_rate_bbl_day'] * oil_price
        gas_revenue = production_rates['gas_rate_scf_day'] * gas_price / 1000

        # Net of operating cost (see the Returns note above)
        daily_revenue = oil_revenue + gas_revenue - daily_opex

        return {
            'oil_price_usd_bbl': oil_price,
            'gas_price_usd_mcf': gas_price,
            'daily_opex_usd': daily_opex,
            'drilling_cost_usd': drilling_cost,
            'completion_cost_usd': completion_cost,
            'total_capex_usd': total_capex,
            'daily_revenue_usd': daily_revenue
        }

    def calculate_performance_metrics(self, production_rates, economic_data, fluids):
        """
        Calculate key performance indicators.

        Parameters:
        -----------
        production_rates : dict
            Production data (reads ``oil_rate_bbl_day``)
        economic_data : dict
            Economic data (reads ``daily_revenue_usd``, ``daily_opex_usd`` and
            ``oil_price_usd_bbl``)
        fluids : dict
            Fluid properties (reads ``water_cut_fraction``)

        Returns:
        --------
        dict : keys ``performance_index``, ``oil_cut``, ``profit_per_barrel``,
            ``production_efficiency``, ``economic_efficiency`` and
            ``ranking_score``
        """
        print("Calculating performance metrics...")

        oil_rate = production_rates['oil_rate_bbl_day']
        daily_revenue = economic_data['daily_revenue_usd']
        daily_opex = economic_data['daily_opex_usd']
        oil_price = economic_data['oil_price_usd_bbl']
        water_cut = fluids['water_cut_fraction']

        # Primary performance index (combines production and economics);
        # the +1 keeps the denominator away from zero
        performance_index = (oil_rate * (1 - water_cut) * oil_price / (daily_opex + 1)) * 100

        # Additional KPIs
        oil_cut = 1 - water_cut  # fraction of oil in total liquid
        profit_per_barrel = daily_revenue / (oil_rate + 0.1)  # +0.1 avoids division by zero

        # Production efficiency: oil rate relative to the rate scaled up by
        # the oil cut (algebraically this is 1 - water_cut + 0.001)
        theoretical_max = oil_rate / (1 - water_cut + 0.001)
        production_efficiency = oil_rate / theoretical_max

        # Economic efficiency: net revenue per dollar of opex, 0 where opex is
        # not positive. The masked divide skips those entries entirely.
        revenue_arr = np.asarray(daily_revenue, dtype=float)
        opex_arr = np.asarray(daily_opex, dtype=float)
        economic_efficiency = np.divide(
            revenue_arr, opex_arr,
            out=np.zeros_like(revenue_arr),
            where=opex_arr > 0)

        # Well ranking score: weighted sum of max-normalised components.
        # A component whose maximum is not positive contributes zero.
        ranking_score = (
            0.4 * self._scale_by_max(oil_rate) +
            0.3 * self._scale_by_max(economic_efficiency) +
            0.2 * self._scale_by_max(oil_cut) +
            0.1 * self._scale_by_max(production_efficiency)
        ) * 100

        return {
            'performance_index': performance_index,
            'oil_cut': oil_cut,
            'profit_per_barrel': profit_per_barrel,
            'production_efficiency': production_efficiency,
            'economic_efficiency': economic_efficiency,
            'ranking_score': ranking_score
        }

    def add_temporal_data(self, n_wells):
        """
        Generate time-based information for each well.

        Parameters:
        -----------
        n_wells : int
            Number of wells

        Returns:
        --------
        dict : keys ``drill_date`` (list of datetime), ``well_age_days`` (list
            of int), ``production_months`` and ``days_since_workover`` (arrays)
        """
        print("Adding temporal data.....")
        rng = self._rng

        # Well vintage: drill date uniform over [2015-01-01, 2025-01-01)
        start_date = datetime(2015, 1, 1)
        end_date = datetime(2025, 1, 1)
        date_range = (end_date - start_date).days

        # One scalar draw per well keeps the random stream identical to the
        # original per-well loop.
        drill_dates = [start_date + timedelta(days=int(rng.randint(0, date_range)))
                       for _ in range(n_wells)]

        # Well age in days, measured at the end of the drilling window
        current_date = datetime(2025, 1, 1)
        well_age_days = [(current_date - drilled).days for drilled in drill_dates]

        # Production months (time on production), 1-48 months.
        # NOTE(review): drawn independently of well age, so a recently drilled
        # well can report more production months than its age allows.
        production_months = rng.uniform(1, 48, n_wells)

        # Days since last workover: exponential with a 180-day mean, clipped
        # to 30-1095 days (1 month to 3 years)
        days_since_workover = np.clip(rng.exponential(180, n_wells), 30, 1095)

        return {
            'drill_date': drill_dates,
            'well_age_days': well_age_days,
            'production_months': production_months,
            'days_since_workover': days_since_workover
        }

    def generate_complete_dataset(self, n_wells=500, filename='well_data.csv'):
        """
        Generate the complete well dataset with all parameters.

        Parameters:
        -----------
        n_wells : int
            Number of wells to generate (must be at least 1)
        filename : str or None
            Output CSV filename. Pass ``None`` to skip writing and only return
            the DataFrame.

        Returns:
        --------
        pandas.DataFrame : Complete well dataset, one row per well

        Raises:
        -------
        ValueError
            If ``n_wells`` is less than 1.
        """
        if n_wells < 1:
            raise ValueError(f"n_wells must be at least 1, got {n_wells}")

        print(f"\n{'='*50}")
        print(f"GENERATING WELL PERFORMANCE DATASET")
        print(f"{'='*50}")
        print(f"Number of wells: {n_wells}")
        print(f"Random seed: {self.random_seed}")
        print(f"Output file: {filename}")
        print()

        # Generate all data components. The call order fixes the order in
        # which the random stream is consumed, so it must not be rearranged.
        reservoir_props = self.generate_reservoir_properties(n_wells)
        well_design = self.generate_well_design(n_wells)
        pressures = self.generate_pressure_temperature(well_design['well_depth_ft'])
        fluids = self.generate_fluid_properties(n_wells, pressures['reservoir_temp_f'])
        production = self.calculate_production_rates(reservoir_props, well_design, pressures, fluids)
        economics = self.generate_economic_data(n_wells, production, well_design)
        performance = self.calculate_performance_metrics(production, economics, fluids)
        temporal = self.add_temporal_data(n_wells)

        # Combine all data into a DataFrame
        data_dict = {
            'well_id': range(1, n_wells + 1),
            **reservoir_props,
            **well_design,
            **pressures,
            **fluids,
            **production,
            **economics,
            **performance,
            **temporal
        }

        df = pd.DataFrame(data_dict)

        # Derived features
        df['pressure_drawdown'] = df['reservoir_pressure_psi'] - df['bottomhole_pressure_psi']
        df['total_liquid_rate'] = df['oil_rate_bbl_day'] + df['water_rate_bbl_day']
        df['productivity_factor'] = df['permeability_md'] * df['porosity_fraction']
        df['depth_category'] = pd.cut(df['well_depth_ft'],
                                      bins=[0, 5000, 8000, 12000, float('inf')],
                                      labels=['Shallow', 'Medium', 'Deep', 'Ultra Deep'])

        if filename is None:
            print("\nDataset generated (not written to disk)")
        else:
            df.to_csv(filename, index=False)
            print(f"\n✅ Dataset successfully saved to {filename}")

        return df
    
    
    




In [35]:
def main():
    """
    Demonstrate the data-preparation module by generating the standard dataset.

    Builds a ``WellDataGenerator`` seeded with 42, generates 500 wells, and
    writes them to ``well_data_standard.csv``.

    Returns:
    --------
    pandas.DataFrame : The generated dataset, so the caller (e.g. a later
        notebook cell) can inspect it without re-reading the CSV
    """
    print("Well Performance Data Preparation Module")
    print("=" * 50)

    # Example 1: Generate standard dataset
    print("\n1. Generating standard dataset...")
    generator = WellDataGenerator(random_seed=42)
    df_standard = generator.generate_complete_dataset(
        n_wells=500,
        filename='well_data_standard.csv'
    )
    return df_standard


if __name__ == "__main__":
    # Keep the result in the notebook namespace instead of discarding it.
    df_standard = main()

Well Performance Data Preparation Module

1. Generating standard dataset...

GENERATING WELL PERFORMANCE DATASET
Number of wells: 500
Random seed: 42
Output file: well_data_standard.csv

Generating reservoir properties...
Generating well design parameters...
Generating pressure and temperature profiles...
Generating fluid properties...
Calculating production rates using IPR correlations...
Generate economic parameters....
Calculating performance metrics...
Adding temporal data.....

✅ Dataset successfully saved to well_data_standard.csv
