In [4]:
# pandas is imported in this cell too: in file order this cell sits above the
# notebook's import cell, so on a fresh kernel (Restart & Run All) `pd` would
# otherwise be undefined here. Re-importing an already-loaded module is a no-op.
import pandas as pd

# Read the well dataset into `df`; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('well_data_standard.csv')

In [3]:
import os
import pandas as pd

In [8]:
def generate_data_summary(df, filename=None):
    """
    Print a human-readable statistical summary of a well dataset to stdout.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to summarise. The following columns are read:
        'oil_rate_bbl_day', 'water_cut_fraction', 'permeability_md',
        'porosity_fraction', 'well_depth_ft', 'daily_revenue_usd',
        'daily_opex_usd', 'well_type', 'artificial_lift',
        'performance_index', 'ranking_score'.
    filename : str or path-like, optional
        Path of the file backing the dataset. When given and it points to an
        existing file, its on-disk size is printed in KB right after the
        dataset shape. When omitted (the default) or not an existing file,
        no file-size line is printed.

    Returns:
    --------
    None
        The summary is only printed.

    Raises:
    -------
    KeyError
        If `df` lacks one of the columns listed above (raised by the column
        lookup; sections printed before the failing lookup stay on stdout).
    """
    n_wells = len(df)

    def pct_of_wells(count):
        # An empty dataset would otherwise divide by zero; report 0.0% instead.
        return count / n_wells * 100 if n_wells else 0.0

    banner = '=' * 30
    print(f"\n{banner}")
    print("DATASET SUMMARY")
    print(banner)

    print(f"Dataset shape: {df.shape}")
    if filename is not None and os.path.isfile(filename):
        print(f"File size: {os.path.getsize(filename) / 1024:.1f} KB")

    # Fetch each column once instead of re-indexing the frame per statistic.
    oil_rate = df['oil_rate_bbl_day']
    print("\nProduction Statistics:")
    print(f"  Average oil rate: {oil_rate.mean():.1f} bbl/day")
    print(f"  Production range: {oil_rate.min():.1f} - {oil_rate.max():.1f} bbl/day")
    print(f"  Average water cut: {df['water_cut_fraction'].mean():.1%}")

    permeability = df['permeability_md']
    depth = df['well_depth_ft']
    print("\nReservoir Statistics:")
    print(f"  Permeability range: {permeability.min():.1f} - {permeability.max():.1f} mD")
    print(f"  Average porosity: {df['porosity_fraction'].mean():.1%}")
    print(f"  Depth range: {depth.min():.0f} - {depth.max():.0f} ft")

    revenue = df['daily_revenue_usd']
    # NOTE(review): "profitable" is tested as daily_revenue_usd > 0, not as
    # revenue minus OPEX; presumably the column already holds net revenue --
    # confirm against the code that generates the dataset.
    profitable_wells = int((revenue > 0).sum())
    print("\nEconomic Statistics:")
    print(f"  Profitable wells: {profitable_wells} ({pct_of_wells(profitable_wells):.1f}%)")
    print(f"  Average daily revenue: ${revenue.mean():.0f}")
    print(f"  Average OPEX: ${df['daily_opex_usd'].mean():.0f}/day")

    # Both categorical breakdowns share one layout, so drive them from a table.
    distributions = (
        ("Well Type Distribution", 'well_type'),
        ("Artificial Lift Distribution", 'artificial_lift'),
    )
    for heading, column in distributions:
        print(f"\n{heading}:")
        for category, count in df[column].value_counts().items():
            print(f"  {category}: {count} wells ({pct_of_wells(count):.1f}%)")

    performance = df['performance_index']
    print("\nPerformance Metrics:")
    print(f"  Average performance index: {performance.mean():.1f}")
    print(f"  Performance range: {performance.min():.1f} - {performance.max():.1f}")
    print(f"  Average ranking score: {df['ranking_score'].mean():.1f}")

In [9]:
# Print the summary report for the DataFrame loaded from the CSV.
generate_data_summary(df=df)


DATASET SUMMARY
Dataset shape: (500, 45)

Production Statistics:
  Average oil rate: 7.4 bbl/day
  Production range: 5.0 - 138.1 bbl/day
  Average water cut: 18.4%

Reservoir Statistics:
  Permeability range: 0.3 - 348.2 mD
  Average porosity: 15.2%
  Depth range: 4000 - 11957 ft

Economic Statistics:
  Profitable wells: 365 (73.0%)
  Average daily revenue: $203
  Average OPEX: $354/day

Well Type Distribution:
  Horizontal: 239 wells (47.8%)
  Vertical: 203 wells (40.6%)
  Deviated: 58 wells (11.6%)

Artificial Lift Distribution:
  ESP: 162 wells (32.4%)
  Natural Flow: 156 wells (31.2%)
  Rod Pump: 120 wells (24.0%)
  Gas Lift: 62 wells (12.4%)

Performance Metrics:
  Average performance index: 119.8
  Performance range: 44.4 - 1161.0
  Average ranking score: 27.9
