## 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import os

## 2. Path configuration

In [2]:
# Path to the raw Volve production Excel workbook. It is a relative path, so it
# is resolved against the working directory the notebook kernel runs in.
RAW_DATA_PATH = '../data/raw/Volve production data.xlsx'

## 3. Selective column loading

In [3]:
# Restrict the read to the columns this notebook works with; every other
# column in the workbook is skipped by `usecols`.
cols_to_use = [
    "NPD_WELL_BORE_NAME",
    "DATEPRD",
    "BORE_OIL_VOL",
    "BORE_GAS_VOL",
    "BORE_WAT_VOL",
    "AVG_DOWNHOLE_PRESSURE",
]

print("Loading data from the Volve Field...")
# No sheet is specified, so pandas reads the first sheet of the workbook.
df_raw = pd.read_excel(io=RAW_DATA_PATH, usecols=cols_to_use)

Loading data from the Volve Field...


## 4. Basic initial cleaning

In [None]:
# Normalize column names to lowercase FIRST, then convert the date column using
# its lowercase name. In this order the cell is idempotent: re-running it no
# longer raises KeyError because the upper-case 'DATEPRD' label has already
# been renamed by a previous run.
df_raw.columns = df_raw.columns.str.lower()
df_raw['dateprd'] = pd.to_datetime(df_raw['dateprd'])

## 5. Identify the 5 wells with the longest production history (most daily records)

In [5]:
# Count the rows recorded for each well bore; `value_counts` returns the counts
# sorted from most to fewest rows, so the first five labels are the five wells
# with the most records.
records_per_well = df_raw['npd_well_bore_name'].value_counts()
top_5_wells = records_per_well.iloc[:5].index.tolist()

print(f"Wells selected for comparative analysis: {top_5_wells}")

Wells selected for comparative analysis: ['15/9-F-4', '15/9-F-5', '15/9-F-12', '15/9-F-14', '15/9-F-11']


## 6. Create a dictionary of DataFrames (one per well)

In [6]:
# Build one chronologically ordered DataFrame per selected well, keyed by the
# well bore name.
wells_dict = {}
for well in top_5_wells:
    belongs_to_well = df_raw['npd_well_bore_name'] == well
    well_history = df_raw[belongs_to_well].copy().sort_values('dateprd')
    # Cleaning: drop days with oil production <= 0 ahead of curve fitting.
    # Rows with a missing oil volume are dropped too, since NaN > 0 is False.
    has_oil = well_history['bore_oil_vol'] > 0
    wells_dict[well] = well_history[has_oil]

print(f"\nData loaded and ready for all {len(wells_dict)} wells.")


Data loaded and ready for all 5 wells.


## 7. Save the data of the selected wells.

In [7]:
# Destination of the exported CSV for the selected wells.
processed_data_path = '../data/processed/well_data_cleaned.csv'

# Create the output folder if it is missing so the export does not fail on a
# fresh checkout where data/processed/ has not been created yet.
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)

# Keep every row of the selected wells from df_raw.
# NOTE(review): this export does NOT apply the `bore_oil_vol > 0` filter used
# when building `wells_dict`, although the file is named "cleaned" - confirm
# that downstream consumers expect the unfiltered rows.
df_top_wells = df_raw[df_raw['npd_well_bore_name'].isin(top_5_wells)]
df_top_wells.to_csv(processed_data_path, index=False)
print("Data processed and saved successfully to data/processed/!")

Data processed and saved successfully to data/processed/!
