# Imports

In [0]:
import geopandas as gp
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import DateType, IntegerType
from datetime import datetime, date
from pyspark.sql.functions import col, desc
import datetime
from dateutil.relativedelta import relativedelta
import shapely.wkt
import functools as ft
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots



# Input Parameters

In [0]:
# basin_of_interest = 'GULF COAST EAST'
# cutoff_date = '2019-04-01'
# flowunit_of_interest = 'HAYNESVILLE'
# scenario_id = "1"
# current_date = "2024-04-29"

In [0]:
# Read the run parameters from the notebook widgets (all values arrive as strings).
read_widget = dbutils.widgets.get

basin_of_interest = read_widget("basin_of_interest")
cutoff_date = read_widget("cutoff_date_for_training_data")
flowunit_of_interest = read_widget("flow_unit_of_interest")
scenario_id = read_widget("scenario_id")
current_date = read_widget("current_date")

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
File <command-1395104717220172>, line 1
----> 1 basin_of_interest = dbutils.widgets.get("basin_of_interest")
      2 cutoff_date = dbutils.widgets.get("cutoff_date_for_training_data")
      3 flowunit_of_interest = dbutils.widgets.get("flow_unit_of_interest")

File /databricks/python_shell/dbruntime/WidgetHandlerImpl.py:43, in WidgetsHandlerImpl.get(self, name)
     37 def get(self, name):


# TypeCurve dataframe

In [0]:
# Recent analog wells (API10, type-curve area, flow unit) for the selected
# flow unit, collected into a pandas DataFrame.
typecurve_query = f"""
    SELECT
        API10,
        typeCurveArea,
        FlowUnit_Analog
    FROM
        produced.analog_well_selection
    WHERE
        recentWell = "true"
        AND flowUnit_Analog = '{flowunit_of_interest}'
"""
typecurve_spark_df = spark.sql(typecurve_query)
typecurve_df = typecurve_spark_df.toPandas()

In [0]:
from datetime import datetime


class Downloader:
    """
    Downloads well/completion data for one flow unit through Spark SQL.

    ``download_well_data`` relies on a ``spark`` session being available in the
    notebook's global scope at call time.
    """

    def __init__(
        self, well_data_table: str, analog_well_table: str, basin_of_interest: str
    ):
        """
        Store the table names and the flow-unit filter used by the query.

        Parameters:
        - well_data_table (str): Table name for well/completion data.
        - analog_well_table (str): Table name for the analog well selection.
        - basin_of_interest (str): Despite its name, this value is used as the
          ``FlowUnit_Analog`` filter in the query. The parameter name is kept
          unchanged for backward compatibility with existing callers.
        """
        self.well_data_table = well_data_table
        self.analog_well_table = analog_well_table
        # Use the constructor argument rather than a notebook-level global, so
        # the instance really filters on the value it was constructed with.
        self.flowunit_of_interest = basin_of_interest

    def download_well_data(self, filter_date, current_date):
        """
        Fetch one row per API10 for recent analog wells of the flow unit.

        The query joins the well data table to the analog well table on API14
        and keeps wells with ``recentWell = 'true'``, the configured flow unit,
        and a ``FirstProdDate`` strictly between ``filter_date`` and
        ``current_date``. A missing ``LateralLength_FT`` is filled with the
        median of the well's type-curve area and, failing that, with the
        median over all selected rows.

        Parameters:
        - filter_date: Lower bound (exclusive) for ``FirstProdDate``; it is
          interpolated into the SQL text as a quoted literal.
        - current_date: Upper bound (exclusive) for ``FirstProdDate``; it is
          interpolated the same way.

        Returns:
        - pd.DataFrame: For every API10, the row with the largest
          ``LateralLength_FT``.
        """
        query = f"""
        SELECT
        *
        EXCEPT(LateralLength_FT, fu_median_ll, tca_median_ll),
        COALESCE(LateralLength_FT, tca_median_ll, fu_median_ll) AS LateralLength_FT
        FROM (
            SELECT
            ana.API10, ana.API14, ana.LateralLength_FT, ana.typeCurveArea, ana.BasinQuantum, ana.FlowUnit_Analog, com.CompletionDate, ana.OperatorGold, ana.FirstProdDate, com.EnvPermitSubmittedDate, com.PermitApprovedDate, ana.SpudDate, com.RigReleaseDate,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ana.LateralLength_FT) OVER () AS fu_median_ll,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ana.LateralLength_FT) OVER (PARTITION BY typeCurveArea) AS tca_median_ll
            FROM {self.well_data_table} com
            INNER JOIN {self.analog_well_table} ana
            ON ana.API14 = com.API14
            AND ana.recentWell = 'true'
            AND ana.FlowUnit_Analog = '{self.flowunit_of_interest}'
            AND ana.FirstProdDate < '{current_date}'
            AND ana.FirstProdDate > '{filter_date}'
        ) AS subquery
        """

        df = spark.sql(query).toPandas()

        # An API10 can appear once per API14; keep the row with the longest lateral.
        df = df.loc[df.groupby("API10")["LateralLength_FT"].idxmax()]
        return df

In [0]:
# Source tables and the training window for the well download.
well_completion_table = "produced.vw_well_completions_merged"
analog_well_table = "produced.analog_well_selection"
filter_date_for_training = cutoff_date

# The third constructor argument is named ``basin_of_interest`` but receives
# the flow unit here.
download = Downloader(
    well_data_table=well_completion_table,
    analog_well_table=analog_well_table,
    basin_of_interest=flowunit_of_interest,
)
final_df = download.download_well_data(
    filter_date=filter_date_for_training, current_date=current_date
)

# Spud to rig release (historical data)

In [0]:
class RigsHistorical:
    """
    Downloads historical rig records for recent analog wells of one flow unit.

    ``download_historical_rig_data`` relies on a ``spark`` session being
    available in the notebook's global scope at call time.
    """

    def __init__(
        self, rig_historical_table: str, rig_historical_col: str, flow_unit_of_interest
    ):
        """
        Store the table names and the flow-unit filter used by the query.

        Parameters:
        - rig_historical_table (str): Table name holding the rig history.
        - rig_historical_col (str): Despite its name, this is the analog well
          table name that the rig history is joined to (this is what the call
          site in this notebook passes). The parameter name is kept unchanged
          for backward compatibility.
        - flow_unit_of_interest: Value matched against ``FlowUnit_Analog``.
        """
        self.rig_historical_table = rig_historical_table
        # Use the constructor argument rather than a notebook-level global, so
        # the join targets the table this instance was constructed with.
        self.analog_well_table = rig_historical_col
        self.flow_unit_of_interest = flow_unit_of_interest

    def download_historical_rig_data(self, cutoff_date, current_date) -> pd.DataFrame:
        """
        Fetch the rig-history rows for the recent analog wells of the flow unit.

        The rig history table is joined to the analog well table on API10 and
        restricted to wells with ``recentWell = 'true'``, the configured flow
        unit, and a ``FirstProdDate`` strictly between ``cutoff_date`` and
        ``current_date``.

        Parameters:
        - cutoff_date: Lower bound (exclusive) for ``FirstProdDate``; it is
          interpolated into the SQL text as a quoted literal.
        - current_date: Upper bound (exclusive) for ``FirstProdDate``; it is
          interpolated the same way.

        Returns:
        - pd.DataFrame: The query result with ``operator`` renamed to
          ``OperatorGold`` and ``date`` renamed to
          ``time_taken_spud_to_rigrelease``. At this point that column still
          holds the raw ``date`` values, not a duration.
        """

        query = f"""
            SELECT
            date, com.API10, com.operator, com.reservoir_gold_consolidated, ana.typeCurveArea, com.BasinQuantum, ana.FlowUnit_Analog, rig_id
            FROM
            {self.rig_historical_table} com
            INNER JOIN
            {self.analog_well_table} ana
            ON
            ana.api10 = com.api10
            AND
            ana.recentWell = 'true'
            AND ana.FlowUnit_Analog = '{self.flow_unit_of_interest}'
            AND ana.FirstProdDate < '{current_date}'
            AND ana.FirstProdDate > '{cutoff_date}'

        """
        df = spark.sql(query).toPandas()
        return df.rename(
            columns={
                "operator": "OperatorGold",
                "date": "time_taken_spud_to_rigrelease",
            }
        )

In [0]:
# Pull the rig history for the same flow unit and training window.
rigs_historical_data_table = "produced.private_rigs_history"
api = tuple(final_df["API10"].unique())

righistorical_download = RigsHistorical(
    rigs_historical_data_table,
    analog_well_table,
    flowunit_of_interest,
)
rig_history_df = righistorical_download.download_historical_rig_data(
    cutoff_date=cutoff_date, current_date=current_date
)

In [0]:
# Count the non-null rig-history entries recorded for each well (API10).
rig_rows_by_well = rig_history_df.groupby(["API10"], as_index=False)
rig_time_df = rig_rows_by_well["time_taken_spud_to_rigrelease"].count()

In [0]:
# Interpret the per-well row count as a number of days.
rig_day_counts = rig_time_df["time_taken_spud_to_rigrelease"]
rig_time_df["time_taken_spud_to_rigrelease"] = pd.to_timedelta(
    rig_day_counts, unit="D"
)

In [0]:
def preprocessing(df):
    """
    Reduce the well data to one row per API10 and derive the phase durations.

    The input frame is left untouched; all work happens on a copy.

    Steps:
    1. Null ``LateralLength_FT`` values are replaced by the sentinel -100 so
       that every API10 group has a comparable value.
    2. For each API10 the row with the largest ``LateralLength_FT`` is kept.
    3. ``LateralLength_FT`` values equal to 0 are turned into NaN.
       NOTE: the -100 sentinel from step 1 is not reverted and stays in the
       returned frame.
    4. Four duration columns are added as differences of the date columns.

    Parameters:
    - df (pd.DataFrame): Must contain ``API10``, ``LateralLength_FT``,
      ``EnvPermitSubmittedDate``, ``PermitApprovedDate``, ``SpudDate``,
      ``CompletionDate`` and ``FirstProdDate``. The date columns must support
      subtraction with each other.

    Returns:
    - pd.DataFrame: One row per API10 with the added columns
      ``time_taken_premit_submit_to_appr``, ``time_taken_premit_appr_to_spud``,
      ``time_taken_spud_to_completion`` and
      ``time_taken_completion_to_firstprod``.
    """
    # Copy first so the caller's DataFrame is never modified.
    df = df.copy()

    # Fill null lateral lengths with -100 so idxmax has a value for every group.
    df["LateralLength_FT"] = df["LateralLength_FT"].fillna(-100)

    # Keep, per API10, the row with the greatest lateral length (a well can
    # have more than one API14). The explicit copy keeps the assignments below
    # from writing into a slice.
    longest_lateral_idx = df.groupby("API10")["LateralLength_FT"].idxmax()
    df = df.loc[longest_lateral_idx].copy()

    # A length of 0 is treated as missing; NaN keeps the column numeric.
    df["LateralLength_FT"] = df["LateralLength_FT"].replace(0, np.nan)

    # Durations of the individual phases of the well life cycle.
    df["time_taken_premit_submit_to_appr"] = (
        df["PermitApprovedDate"] - df["EnvPermitSubmittedDate"]
    )
    df["time_taken_premit_appr_to_spud"] = df["SpudDate"] - df["PermitApprovedDate"]
    df["time_taken_spud_to_completion"] = df["CompletionDate"] - df["SpudDate"]
    df["time_taken_completion_to_firstprod"] = (
        df["FirstProdDate"] - df["CompletionDate"]
    )

    return df

In [0]:
# Make sure both date columns are pandas datetimes before durations are derived.
for date_column in ("FirstProdDate", "SpudDate"):
    final_df[date_column] = pd.to_datetime(final_df[date_column])

In [0]:
# One row per API10 plus the derived phase-duration columns.
final_df = final_df.pipe(preprocessing)

In [0]:
# Attach the rig time per well; wells without rig history keep a null value.
final_df = final_df.merge(rig_time_df, on="API10", how="left")

In [0]:
# Keep only wells for which both of these durations are known.
required_durations = [
    "time_taken_spud_to_rigrelease",
    "time_taken_spud_to_completion",
]
final_df = final_df.dropna(subset=required_durations)

# Cycle time calculation at operator and type-curve-area (TCA) level

In [0]:
def check_threshold_limit(df, operator, typecurve, threshold=9):
    """
    Pick the narrowest well sample that still has at least ``threshold`` rows.

    The candidates are tried from most to least specific:
    1. rows of the operator within the type-curve area,
    2. all rows of the operator,
    3. the whole input frame.

    Parameters:
    - df (pd.DataFrame): Wells with ``OperatorGold`` and ``typeCurveArea`` columns.
    - operator (str): Operator name.
    - typecurve (str): Type-curve area.
    - threshold (int): Minimum number of rows a candidate needs (default 9).

    Returns:
    - pd.DataFrame: The first candidate reaching the threshold, or ``df``
      itself when neither filtered candidate does.
    """
    operator_mask = df["OperatorGold"] == operator

    operator_tca_wells = df[operator_mask & (df["typeCurveArea"] == typecurve)]
    if len(operator_tca_wells) >= threshold:
        return operator_tca_wells

    operator_wells = df[operator_mask]
    if len(operator_wells) >= threshold:
        return operator_wells

    return df


def get_time_taken_premit_submit_to_appr(df, whole_basin_df):
    """
    Median time from permit submission to approval, with a fallback sample.

    Parameters:
    - df (pd.DataFrame): Preferred sample; must contain
      ``time_taken_premit_submit_to_appr``.
    - whole_basin_df (pd.DataFrame): Fallback sample with the same column,
      used when the preferred sample has no usable value.

    Returns:
    - The median of the column in ``df``; if that median is missing (NaT/NaN),
      the median of the column in ``whole_basin_df``.
    """
    median = df["time_taken_premit_submit_to_appr"].median()
    # Test for missingness explicitly: a truthiness check would treat a valid
    # zero-length median as absent.
    if pd.notna(median):
        return median
    return whole_basin_df["time_taken_premit_submit_to_appr"].median()


def get_time_taken_premit_appr_to_spud(df, whole_basin_df):
    """
    Median time from permit approval to spud, with a fallback sample.

    Parameters:
    - df (pd.DataFrame): Preferred sample; must contain
      ``time_taken_premit_appr_to_spud``.
    - whole_basin_df (pd.DataFrame): Fallback sample with the same column,
      used when the preferred sample has no usable value.

    Returns:
    - The median of the column in ``df``; if that median is missing (NaT/NaN),
      the median of the column in ``whole_basin_df``.
    """
    median = df["time_taken_premit_appr_to_spud"].median()
    # Test for missingness explicitly: a truthiness check would treat a valid
    # zero-length median as absent.
    if pd.notna(median):
        return median
    return whole_basin_df["time_taken_premit_appr_to_spud"].median()


def get_time_taken_spud_to_completion(df, whole_basin_df):
    """
    Median time from spud to completion, with a fallback sample.

    Parameters:
    - df (pd.DataFrame): Preferred sample; must contain
      ``time_taken_spud_to_completion``.
    - whole_basin_df (pd.DataFrame): Fallback sample with the same column,
      used when the preferred sample has no usable value.

    Returns:
    - The median of the column in ``df``; if that median is missing (NaT/NaN),
      the median of the column in ``whole_basin_df``.
    """
    median = df["time_taken_spud_to_completion"].median()
    # Test for missingness explicitly: a truthiness check would treat a valid
    # zero-length median as absent.
    if pd.notna(median):
        return median
    return whole_basin_df["time_taken_spud_to_completion"].median()


def get_time_taken_completion_to_firstprod(df, whole_basin_df):
    """
    Median time from completion to first production, with a fallback sample.

    Parameters:
    - df (pd.DataFrame): Preferred sample; must contain
      ``time_taken_completion_to_firstprod``.
    - whole_basin_df (pd.DataFrame): Fallback sample with the same column,
      used when the preferred sample has no usable value.

    Returns:
    - The median of the column in ``df``; if that median is missing (NaT/NaN),
      the median of the column in ``whole_basin_df``.
    """
    median = df["time_taken_completion_to_firstprod"].median()
    # Test for missingness explicitly: a truthiness check would treat a valid
    # zero-length median as absent.
    if pd.notna(median):
        return median
    return whole_basin_df["time_taken_completion_to_firstprod"].median()


def get_time_taken_spud_to_rigrelease(df, whole_basin_df):
    """
    Median time from spud to rig release, with a fallback sample.

    Parameters:
    - df (pd.DataFrame): Preferred sample; must contain
      ``time_taken_spud_to_rigrelease``.
    - whole_basin_df (pd.DataFrame): Fallback sample with the same column,
      used when the preferred sample has no usable value.

    Returns:
    - The median of the column in ``df``; if that median is missing (NaT/NaN),
      the median of the column in ``whole_basin_df``.
    """
    median = df["time_taken_spud_to_rigrelease"].median()
    # ``Series.median`` signals "no data" with NaT/NaN, never with None, so the
    # fallback has to be selected with a missing-value test.
    if pd.notna(median):
        return median
    return whole_basin_df["time_taken_spud_to_rigrelease"].median()


def get_cycle_times(operator, typecurve, df, whole_basin_df):
    """
    Median durations of the five well-cycle phases for one operator / area.

    The sample is chosen by ``check_threshold_limit`` (operator within the
    type-curve area, then the operator alone, then all of ``df``). Each phase
    getter receives ``whole_basin_df`` as its fallback sample.

    Parameters:
    - operator (str): Operator name.
    - typecurve (str): Type-curve area.
    - df (pd.DataFrame): Wells from which the sample is selected.
    - whole_basin_df (pd.DataFrame): Fallback sample handed to every getter.

    Returns:
    - list: Medians in this order: permit submission to approval, permit
      approval to spud, spud to rig release, spud to completion, completion
      to first production.
    """
    new_df = check_threshold_limit(df, operator, typecurve)

    # Pass the dedicated fallback frame on; it was previously accepted but
    # ignored in favour of ``df``.
    return [
        get_time_taken_premit_submit_to_appr(new_df, whole_basin_df),
        get_time_taken_premit_appr_to_spud(new_df, whole_basin_df),
        get_time_taken_spud_to_rigrelease(new_df, whole_basin_df),
        get_time_taken_spud_to_completion(new_df, whole_basin_df),
        get_time_taken_completion_to_firstprod(new_df, whole_basin_df),
    ]


def cycle_time_calculation(df, df2):
    """
    Build the operator / type-curve-area table of median cycle times.

    Parameters:
    - df (pd.DataFrame): Wells to aggregate; grouped by ``OperatorGold`` and
      ``typeCurveArea``.
    - df2 (pd.DataFrame): Frame forwarded to ``get_cycle_times`` as its
      ``whole_basin_df`` argument.

    Returns:
    - pd.DataFrame: One row per operator / type-curve area with
      ``num_of_wells`` and the five median duration columns. Missing values in
      the spud-to-rig-release, completion-to-first-production and
      spud-to-completion columns are replaced by that column's median.
    """
    cycle_time_columns = [
        "time_taken_premit_submit_to_appr",
        "time_taken_premit_appr_to_spud",
        "time_taken_spud_to_rigrelease",
        "time_taken_spud_to_completion",
        "time_taken_completion_to_firstprod",
    ]

    # Number of wells in every operator / type-curve-area combination.
    well_counts = df.groupby(["OperatorGold", "typeCurveArea"], as_index=False)[
        "API10"
    ].count()
    opr_tca_df = well_counts.rename(columns={"API10": "num_of_wells"})

    def _medians_for(row):
        # One Series of five medians per operator / type-curve-area row.
        return pd.Series(
            get_cycle_times(row["OperatorGold"], row["typeCurveArea"], df, df2)
        )

    opr_tca_df[cycle_time_columns] = opr_tca_df.apply(_medians_for, axis=1)

    # Only these three phases are back-filled with their column median.
    for column_name in (
        "time_taken_spud_to_rigrelease",
        "time_taken_completion_to_firstprod",
        "time_taken_spud_to_completion",
    ):
        column_median = opr_tca_df[column_name].median()
        opr_tca_df[column_name] = opr_tca_df[column_name].fillna(column_median)

    return opr_tca_df

In [0]:
# The deep copy serves as the fallback sample for the median getters.
final_df_copy = final_df.copy()
opr_tca_df = cycle_time_calculation(df=final_df, df2=final_df_copy)

In [0]:
# Tag every operator-level row with the basin selected for this run.
opr_tca_df = opr_tca_df.assign(BasinQuantum=basin_of_interest)

In [0]:
# Express every operator-level duration as a whole number of days.
for duration_column in (
    "time_taken_premit_submit_to_appr",
    "time_taken_premit_appr_to_spud",
    "time_taken_spud_to_rigrelease",
    "time_taken_spud_to_completion",
    "time_taken_completion_to_firstprod",
):
    opr_tca_df[duration_column] = opr_tca_df[duration_column].dt.days

opr_tca_df["scenario_id"] = scenario_id

In [0]:
# Keep only the columns written to the well-level output table. The explicit
# copy makes the result an independent frame, so the column assignments in the
# following cell do not write into a slice (SettingWithCopyWarning).
final_df = final_df[
    [
        "API10",
        "OperatorGold",
        "typeCurveArea",
        "time_taken_premit_submit_to_appr",
        "time_taken_premit_appr_to_spud",
        "time_taken_spud_to_rigrelease",
        "time_taken_spud_to_completion",
        "time_taken_completion_to_firstprod",
        "BasinQuantum",
    ]
].copy()

In [0]:
# Express every well-level duration as a whole number of days.
for duration_column in (
    "time_taken_premit_submit_to_appr",
    "time_taken_premit_appr_to_spud",
    "time_taken_spud_to_rigrelease",
    "time_taken_spud_to_completion",
    "time_taken_completion_to_firstprod",
):
    final_df[duration_column] = final_df[duration_column].dt.days

final_df["scenario_id"] = scenario_id

# Creating Tables

In [0]:
# Remove rows from earlier runs of this scenario so the appends below do not
# duplicate them.
delete_api_level_sql = f"""
          delete from produced.api_level_cycle_times where scenario_id = "{scenario_id}"
          """
delete_operator_level_sql = f"""
          delete from produced.operator_cycle_times where scenario_id = "{scenario_id}"
          """

for stale_rows_sql in (delete_api_level_sql, delete_operator_level_sql):
    spark.sql(stale_rows_sql)

In [0]:
# Append the well-level cycle times to the Delta table, with every duration
# column cast to an integer.
api_level_sdf = spark.createDataFrame(final_df)
for day_count_column in (
    "time_taken_premit_submit_to_appr",
    "time_taken_premit_appr_to_spud",
    "time_taken_spud_to_rigrelease",
    "time_taken_spud_to_completion",
    "time_taken_completion_to_firstprod",
):
    api_level_sdf = api_level_sdf.withColumn(
        day_count_column, col(day_count_column).cast(IntegerType())
    )

api_level_writer = (
    api_level_sdf.write.format("delta").option("mergeSchema", "true").mode("append")
)
api_level_writer.saveAsTable("produced.api_level_cycle_times")

In [0]:
# Append the operator-level cycle times to the Delta table, with the well count
# and every duration column cast to an integer.
operator_level_sdf = spark.createDataFrame(opr_tca_df)
for integer_column in (
    "num_of_wells",
    "time_taken_premit_submit_to_appr",
    "time_taken_premit_appr_to_spud",
    "time_taken_spud_to_rigrelease",
    "time_taken_spud_to_completion",
    "time_taken_completion_to_firstprod",
):
    operator_level_sdf = operator_level_sdf.withColumn(
        integer_column, col(integer_column).cast(IntegerType())
    )

operator_level_writer = (
    operator_level_sdf.write.format("delta")
    .option("mergeSchema", "true")
    .mode("append")
)
operator_level_writer.saveAsTable("produced.operator_cycle_times")