# Imports

In [0]:
import geopandas as gp
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.types import DateType
from pyspark.sql.functions import desc
from pyspark.sql.functions import col
from datetime import date
import datetime
from dateutil.relativedelta import relativedelta
import shapely.wkt
import functools as ft



# Input Params

In [0]:
# user_input_num_of_rigs = 35
# basin_of_interest = 'GULF COAST EAST'
# flowunit_of_interest = 'HAYNESVILLE'
# desired_active_rig_date = "2023-10-01"
# training_cutoff_date = "2022-04-01"
# cutoff_misc_perc = 5
# scenario_id = "1"
# current_date = "2024-04-29"

In [0]:
# Read the run parameters from the Databricks notebook widgets.
# Widget values arrive as strings; only cutoff_misc_perc is cast here.
# user_input_num_of_rigs stays a string because a later cell compares it
# against the literal "null" before converting it to int.
user_input_num_of_rigs = dbutils.widgets.get("user_input_for_num_of_rigs")
basin_of_interest = dbutils.widgets.get("basin_of_interest")
flowunit_of_interest = dbutils.widgets.get("flow_unit_of_interest")
desired_active_rig_date = dbutils.widgets.get("desired_active_rig_date")
training_cutoff_date = dbutils.widgets.get("cutoff_date_to_train_rig_model")
cutoff_misc_perc = int(dbutils.widgets.get("cutoff_perc_for_misc_operators"))
current_date = dbutils.widgets.get("current_date")
scenario_id = dbutils.widgets.get("scenario_id")

# Rig Data Downloader

In [0]:
class RigsHistorical:
    """Fetch the rigs that were active on a single date for one basin / flow unit."""

    def __init__(
        self,
        rig_historical_table: str,
        rig_historical_col: str,
        flow_unit_of_interest,
        basin_of_interest,
    ):
        """
        Parameters:
        - rig_historical_table: name of the rig history table (aliased ``com``).
        - rig_historical_col: name of the analog well table joined as ``ana``.
          The parameter name is kept for backward compatibility; callers in
          this notebook pass the analog well table name in this position.
        - flow_unit_of_interest: value matched against ``ana.FlowUnit_Analog``.
        - basin_of_interest: value matched against ``com.BasinQuantum``.
        """
        self.rig_historical_table = rig_historical_table
        # Use the argument instead of a same-named notebook global, so the
        # class works regardless of which cells have already been executed.
        self.analog_well_table = rig_historical_col
        self.flow_unit_of_interest = flow_unit_of_interest
        self.basin_of_interest = basin_of_interest

    def download_historical_rig_data(self, desired_active_rig_date) -> pd.DataFrame:
        """
        Run the join for one date and return the result as a pandas DataFrame.

        Parameters:
        - desired_active_rig_date: value compared for equality with ``date``.

        Returns:
        - pd.DataFrame: one row per joined rig-history record on that date.

        Relies on the notebook-provided global ``spark`` session.
        """
        # NOTE(review): values are interpolated directly into the SQL text;
        # they come from notebook widgets, so confirm they are trusted.
        query = f"""
        SELECT
            date, com.api10, SpudDate, com.operator, com.reservoir_gold_consolidated, 
            ana.typeCurveArea, com.BasinQuantum, ana.FlowUnit_Analog, ana.LateralLength_FT, rig_id
        FROM
            {self.rig_historical_table} AS com
        INNER JOIN
            {self.analog_well_table} AS ana
        ON
            ana.api10 = com.api10
            AND ana.recentWell = 'true'
            AND com.BasinQuantum = '{self.basin_of_interest}'
            AND ana.FlowUnit_Analog = '{self.flow_unit_of_interest}'
            AND date = '{desired_active_rig_date}'
        """

        df = spark.sql(query).toPandas()
        return df

In [0]:
# Download the rigs active on desired_active_rig_date for the selected
# basin / flow unit and drop exact duplicate rows.
rigs_historical_data_table = "produced.private_rigs_history"
analog_well_table = "produced.analog_well_selection"
# Positional order follows RigsHistorical.__init__:
# (rig table, analog well table, flow unit, basin).
righistorical_download = RigsHistorical(
    rigs_historical_data_table,
    analog_well_table,
    flowunit_of_interest,
    basin_of_interest,
)
rig_data = righistorical_download.download_historical_rig_data(desired_active_rig_date)
rig_data = rig_data.drop_duplicates()



In [0]:
class RighistoricalDownloader:
    """Fetch rig history over a date window for one basin / flow unit."""

    def __init__(
        self,
        rig_historical_table: str,
        analog_well_table: str,
        basin_of_interest: str,
        flow_unit_of_interest: str,
    ):
        """
        Initializes the downloader with the tables and filters it queries.

        Parameters:
        - rig_historical_table: name of the rig history table (aliased ``com``).
        - analog_well_table: name of the analog well table (aliased ``ana``).
        - basin_of_interest: value matched against ``com.BasinQuantum``.
        - flow_unit_of_interest: value matched against ``ana.FlowUnit_Analog``.
        """
        self.rig_historical_table = rig_historical_table
        self.analog_well_table = analog_well_table
        self.flow_unit_of_interest = flow_unit_of_interest
        self.basin_of_interest = basin_of_interest

    def download_rig_data(self, cutoff_date, current_date) -> pd.DataFrame:
        """
        Downloads rig data through the notebook's global ``spark`` session and
        returns one row per ``api10`` as a Pandas DataFrame.

        Parameters:
        - cutoff_date: inclusive lower bound on ``date``.
        - current_date: exclusive upper bound on ``date``.

        Returns:
        - pd.DataFrame: for every ``api10`` the row with the largest
          ``LateralLength_FT``, ordered by ``api10``.
        """
        # NOTE(review): values are interpolated directly into the SQL text;
        # they come from notebook widgets, so confirm they are trusted.
        query = f"""
        SELECT
            date, com.api10,  com.operator, com.reservoir_gold_consolidated, 
            ana.typeCurveArea, com.BasinQuantum, ana.FlowUnit_Analog, ana.LateralLength_FT, rig_id
        FROM
            {self.rig_historical_table} AS com
        INNER JOIN
            {self.analog_well_table} AS ana
        ON
            ana.api10 = com.api10
            AND ana.recentWell = 'true'
            AND com.BasinQuantum = '{self.basin_of_interest}'
            AND ana.FlowUnit_Analog = '{self.flow_unit_of_interest}'
            AND date >= '{cutoff_date}'
            AND date < '{current_date}'
        """

        rig_data = spark.sql(query).toPandas()
        rig_data["LateralLength_FT"] = pd.to_numeric(rig_data["LateralLength_FT"])
        # NOTE(review): keeping one row per api10 also keeps a single date per
        # well; confirm that is intended for the per-day counts built downstream.
        return self._longest_lateral_per_well(rig_data)

    @staticmethod
    def _longest_lateral_per_well(rig_data: pd.DataFrame) -> pd.DataFrame:
        """Keep, per api10, the first row holding the largest LateralLength_FT."""
        # A stable descending sort keeps the earliest row among ties and puts
        # missing lengths last, so a well whose lengths are all missing still
        # contributes a row instead of breaking an idxmax-based lookup.
        ranked = rig_data.sort_values(
            "LateralLength_FT", ascending=False, kind="stable", na_position="last"
        )
        one_per_well = ranked.drop_duplicates(subset="api10", keep="first")
        # Same ordering a groupby on api10 would produce.
        return one_per_well.sort_values("api10", kind="stable")

In [0]:
def rigs_distribution_historical_data():
    """
    Download the rig history used to derive the operator rig distribution.

    Reads the notebook globals basin_of_interest, flowunit_of_interest,
    training_cutoff_date and current_date.

    Returns:
    - pd.DataFrame: output of RighistoricalDownloader.download_rig_data for
      the window [training_cutoff_date, current_date).
    """
    downloader = RighistoricalDownloader(
        rig_historical_table="produced.private_rigs_history",
        analog_well_table="produced.analog_well_selection",
        basin_of_interest=basin_of_interest,
        flow_unit_of_interest=flowunit_of_interest,
    )
    return downloader.download_rig_data(training_cutoff_date, current_date)

# Get Rig Release for current rigs

In [0]:
# Load the cycle-time tables for this scenario into pandas.
# final_df: rows of produced.api_level_cycle_times.
# opr_tca_df: rows of produced.operator_cycle_times.
# Both frames serve as the default lookup tables of the cycle-time helpers below.
final_df = spark.sql(
    f"select * from produced.api_level_cycle_times where scenario_id = '{scenario_id}'"
).toPandas()
opr_tca_df = spark.sql(
    f"SELECT * FROM produced.operator_cycle_times where scenario_id = '{scenario_id}'"
).toPandas()

In [0]:
# An operator needs at least this many rows in the basin-level frame before
# its own median is used (the original condition was ``len(...) > 9``).
_MIN_OPERATOR_ROWS = 10


def _median_cycle_time(opr, tca, column, cycle_time_df, basin_df):
    """Median of `column`, resolved from the most to the least specific level.

    Lookup order:
    1. rows of `cycle_time_df` matching both operator and type curve area;
    2. rows of `basin_df` for the operator, if there are at least
       `_MIN_OPERATOR_ROWS` of them;
    3. every row of `basin_df`.

    A level whose median is NaN (no usable values) falls through to the next
    one instead of returning NaN.
    """
    pair_rows = cycle_time_df[
        (cycle_time_df.OperatorGold == opr) & (cycle_time_df.typeCurveArea == tca)
    ]
    if len(pair_rows) > 0:
        pair_median = pair_rows[column].median()
        if not pd.isna(pair_median):
            return pair_median

    operator_rows = basin_df[basin_df.OperatorGold == opr]
    if len(operator_rows) >= _MIN_OPERATOR_ROWS:
        operator_median = operator_rows[column].median()
        if not pd.isna(operator_median):
            return operator_median

    return basin_df[column].median()


def get_rig_release_time(opr, tca, cycle_time_df=None, basin_df=None):
    """Median ``time_taken_spud_to_rigrelease`` for an operator / type curve area.

    Parameters:
    - opr: operator name matched against ``OperatorGold``.
    - tca: type curve area matched against ``typeCurveArea``.
    - cycle_time_df: operator-level frame; defaults to the global ``opr_tca_df``.
    - basin_df: basin-level frame; defaults to the global ``final_df``.

    Returns:
    - The median from the most specific level that has data (NaN if none has).
    """
    # Resolve the defaults at call time so re-running the load cell is picked
    # up without having to re-run this definition cell.
    if cycle_time_df is None:
        cycle_time_df = opr_tca_df
    if basin_df is None:
        basin_df = final_df
    return _median_cycle_time(
        opr, tca, "time_taken_spud_to_rigrelease", cycle_time_df, basin_df
    )


def get_spud_to_complete_time(opr, tca, cycle_time_df=None, basin_df=None):
    """Median ``time_taken_spud_to_completion`` for an operator / type curve area.

    Parameters and fallback order are the same as for get_rig_release_time.
    """
    if cycle_time_df is None:
        cycle_time_df = opr_tca_df
    if basin_df is None:
        basin_df = final_df
    return _median_cycle_time(
        opr, tca, "time_taken_spud_to_completion", cycle_time_df, basin_df
    )

In [0]:
# Placeholder column, filled row by row in the rig release loop below.
rig_data["rig_release_date"] = None
# Parse SpudDate and keep only the calendar date (datetime.date objects).
rig_data["SpudDate"] = pd.to_datetime(rig_data["SpudDate"]).dt.date

In [0]:
# Make the index 0..n-1 so the label-based .at[i] / .loc[i] accesses in the
# loop below address the same rows as the positional .iloc[i] accesses.
rig_data.reset_index(drop=True, inplace=True)

In [0]:
# Estimate a release date for every active rig: spud date plus the median
# spud-to-rig-release time of its operator / type curve area, but never
# earlier than the desired active-rig date.
active_rig_ts = pd.Timestamp(desired_active_rig_date)

for i in range(len(rig_data)):

    operator = rig_data["operator"].iloc[i]
    type_curve_area = rig_data["typeCurveArea"].iloc[i]

    spud_to_rig_release = int(get_rig_release_time(operator, type_curve_area))

    # Work in Timestamps throughout: SpudDate holds datetime.date objects, and
    # comparing a date with a Timestamp is deprecated / rejected by pandas.
    # A missing spud date stays NaT and is stored as NaT.
    rig_release_date = pd.Timestamp(rig_data.at[i, "SpudDate"]) + pd.Timedelta(
        days=spud_to_rig_release
    )

    if rig_release_date < active_rig_ts:
        rig_release_date = active_rig_ts

    rig_data.loc[i, "rig_release_date"] = rig_release_date

# Rigs Distribution based on User Input

In [0]:
def get_rigs_distribution_while_allocating(rigs_to_allocate, area_of_each_opr):
    """
    Randomly assign `rigs_to_allocate` rigs to operators according to their
    cumulative rig percentages.

    Parameters:
    - rigs_to_allocate: number of rigs to draw.
    - area_of_each_opr: DataFrame with columns ``operator`` and
      ``cumulative_fraction_rigs`` (cumulative percentage on a 0-100 scale).

    Returns:
    - The same DataFrame object, with an ``allocated_rigs`` column added in
      place. Operators that received no rig hold NaN, not 0.

    Each draw is a uniform number in [0, 100) taken from NumPy's global random
    state; it goes to the first operator whose cumulative percentage is >= the
    draw.
    """
    random_numbers = np.random.uniform(low=0, high=100, size=rigs_to_allocate)

    operators = area_of_each_opr["operator"].to_numpy()
    cumulative = area_of_each_opr["cumulative_fraction_rigs"].to_numpy(dtype=float)

    if len(operators) == 0:
        rigs_distribution = []
    else:
        # hits[d, k] is True when draw d fits under operator k's cumulative share.
        hits = np.asarray(random_numbers)[:, None] <= cumulative[None, :]
        positions = hits.argmax(axis=1)
        # The percentages are rounded, so the last cumulative value can end up
        # slightly below 100. A draw above it used to match no operator and was
        # silently dropped; give it to the last operator instead.
        positions[~hits.any(axis=1)] = len(operators) - 1
        rigs_distribution = operators[positions]

    # Count rigs for each operator, aligned to the input row order
    rigs_count = pd.Series(rigs_distribution, dtype=object).value_counts()
    rigs_count = rigs_count.reindex(area_of_each_opr["operator"])
    area_of_each_opr["allocated_rigs"] = rigs_count.values
    return area_of_each_opr

In [0]:
def get_historical_rig_distribution_among_opr(miscell_perc):
    """
    Derive each operator's share of historical rig activity.

    Parameters:
    - miscell_perc: operators whose share (in percent) is below this value
      are pooled under the name "Miscellaneous".

    Returns:
    - (DataFrame, list): the per-operator frame with columns ``operator``,
      ``rig_id`` (summed daily distinct-rig counts), ``fraction_rigs`` and
      ``cumulative_fraction_rigs``, sorted by ascending ``fraction_rigs``;
      and the list of operator names that were pooled.
    """
    history = rigs_distribution_historical_data()

    # TODO: rename the column rig_id (after aggregation it holds counts)
    daily_rigs = history.groupby(["date", "operator"], as_index=False)[
        "rig_id"
    ].nunique()
    per_operator = daily_rigs.groupby("operator", as_index=False)["rig_id"].sum()

    grand_total = per_operator["rig_id"].sum()
    per_operator["fraction_rigs"] = (per_operator["rig_id"] * 100 / grand_total).round(2)

    # Pool every operator below the threshold into one bucket
    is_minor = per_operator.fraction_rigs < miscell_perc
    converted_opr_to_misc = per_operator.loc[is_minor, "operator"].unique().tolist()
    per_operator.loc[is_minor, "operator"] = "Miscellaneous"

    pooled = per_operator.groupby("operator", as_index=False)[
        ["rig_id", "fraction_rigs"]
    ].sum()
    pooled = pooled.sort_values(by="fraction_rigs")
    pooled["cumulative_fraction_rigs"] = pooled["fraction_rigs"].cumsum()
    return pooled, converted_opr_to_misc

In [0]:
def allocate_rigs(area_of_each_opr, today_rigs_df, date):
    """
    Append placeholder rigs for every operator that was allocated new rigs.

    Parameters:
    - area_of_each_opr: DataFrame with columns ``operator`` and
      ``allocated_rigs``; rows whose count is NaN or not positive are skipped.
    - today_rigs_df: DataFrame of the currently active rigs (not mutated).
    - date: allocation date as a ``YYYY-MM-DD`` string; used as the
      ``rig_release_date`` of every new rig.

    Returns:
    - today_rigs_df followed by one row per new rig, named
      ``allocated_rig_1``, ``allocated_rig_2``, ... Columns not set for a new
      rig are left missing. When nothing is allocated the input frame is
      returned unchanged.
    """
    date_of_allocation = datetime.datetime.strptime(date, "%Y-%m-%d").date()

    new_rigs = []
    for _, row in area_of_each_opr.iterrows():
        if row["allocated_rigs"] > 0:
            curr_opr = row["operator"]
            for _ in range(int(row["allocated_rigs"])):
                new_rigs.append(
                    {
                        "rig_id": f"allocated_rig_{len(new_rigs) + 1}",
                        "api10": None,
                        "operator": curr_opr,
                        "typeCurveArea": None,
                        "SpudDate": None,
                        "rig_release_date": date_of_allocation,
                    }
                )

    if not new_rigs:
        return today_rigs_df

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call; build all rows first and concatenate once.
    return pd.concat([today_rigs_df, pd.DataFrame(new_rigs)], ignore_index=True)

In [0]:
# def get_rig_distribution_while_deallocating(area_of_each_opr1, today_rigs_df, total_rigs):

#     contains_negative = True
#     def distribute_rigs(random_number):
#         for i, row in area_of_each_opr1.iterrows():
#             if random_number <= row['cumulative_fraction_rigs']:
#                 return row['operator']
#     while contains_negative==True:
#         random_numbers = np.random.uniform(low=0, high=100, size=total_rigs)

#         # Apply rig distribution function to each random number
#         rigs_distribution = [distribute_rigs(random_number) for random_number in random_numbers]

#         # Count rigs for each operator
#         rigs_count = pd.Series(rigs_distribution).value_counts()
#         rigs_count = rigs_count.reindex(area_of_each_opr1['operator'])
#         area_of_each_opr1['allocate_rigs'] = rigs_count.values
#         area_of_each_opr1 = area_of_each_opr1[area_of_each_opr1.allocate_rigs>0]
#         rigs_on_each_opr = today_rigs_df.groupby("operator", as_index=False)['rig_id'].nunique()
#         new_df = pd.merge(rigs_on_each_opr, area_of_each_opr1[['operator', 'allocate_rigs']], on='operator', how='outer')
#         new_df["rigs_to_remove"] = new_df['rig_id']-new_df['allocate_rigs']
#         contains_negative = any(new_df['rigs_to_remove'] < 0)
#     return new_df

In [0]:
def deallocate_and_reallocate_rigs(required_rigs_distribution_df, today_rigs_df, date):
    """
    Move the active rig set towards the required per-operator rig counts.

    Parameters:
    - required_rigs_distribution_df: DataFrame with columns ``operator`` and
      ``allocated_rigs`` (target count; rows with NaN are ignored).
    - today_rigs_df: DataFrame of the currently active rigs (not mutated).
    - date: allocation date as a ``YYYY-MM-DD`` string; used as the
      ``rig_release_date`` of every rig that is added.

    Returns:
    - DataFrame of rigs after the adjustment. For an operator with more rigs
      than its target, the rigs with the earliest ``rig_release_date`` are
      removed; for one with fewer, placeholder rigs named ``allocated_rig_<n>``
      are appended.

    NOTE(review): operators without a target (``allocated_rigs`` is NaN) keep
    all of their current rigs, so the final total can exceed the requested
    number — confirm this is intended.
    """
    date_of_allocation = datetime.datetime.strptime(date, "%Y-%m-%d").date()

    required = required_rigs_distribution_df.loc[
        required_rigs_distribution_df["allocated_rigs"].notnull(),
        ["operator", "allocated_rigs"],
    ]

    total_current_rigs = (
        today_rigs_df.groupby("operator", as_index=False)["rig_id"]
        .nunique()
        .rename(columns={"rig_id": "current_num_of_rigs"})
    )

    plan = pd.merge(required, total_current_rigs, on="operator", how="outer")
    # The outer merge brings in operators that only exist today; drop them
    # again, and copy so the column assignments below act on an owned frame.
    plan = plan[plan["allocated_rigs"].notnull()].copy()
    plan["current_num_of_rigs"] = plan["current_num_of_rigs"].fillna(0)
    plan["rigs_to_remove"] = plan["current_num_of_rigs"] - plan["allocated_rigs"]

    new_rigs = []
    for _, row in plan.iterrows():
        opr = row["operator"]
        rigs_to_remove = row["rigs_to_remove"]

        if rigs_to_remove > 0:
            # Current rigs are counted by distinct rig_id, so pick distinct
            # rig ids here too, earliest release date first.
            rig_ids_to_remove = (
                today_rigs_df[today_rigs_df.operator == opr]
                .sort_values(by="rig_release_date")
                .drop_duplicates(subset="rig_id")
                .head(int(rigs_to_remove))["rig_id"]
                .tolist()
            )
            today_rigs_df = today_rigs_df[
                ~today_rigs_df.rig_id.isin(rig_ids_to_remove)
            ]
        elif rigs_to_remove < 0:
            for _ in range(int(abs(rigs_to_remove))):
                new_rigs.append(
                    {
                        "rig_id": f"allocated_rig_{len(new_rigs) + 1}",
                        "api10": None,
                        "operator": opr,
                        "typeCurveArea": None,
                        "SpudDate": None,
                        "rig_release_date": date_of_allocation,
                    }
                )

    if not new_rigs:
        return today_rigs_df

    # DataFrame.append was removed in pandas 2.0; add all new rows in one concat.
    return pd.concat([today_rigs_df, pd.DataFrame(new_rigs)], ignore_index=True)

In [0]:
# Adjust the active rig set to the number of rigs requested by the user.
current_rigs = len(rig_data)
area_of_each_opr, miss_oprs = get_historical_rig_distribution_among_opr(
    cutoff_misc_perc
)
miscellaneous_opr = pd.DataFrame({"real_operator_name": miss_oprs})
miscellaneous_opr["new_opr_name"] = "Miscellaneous"

# Keep one placeholder row when no operator was pooled, so the frame written
# out later is never empty.
if len(miscellaneous_opr) == 0:
    miscellaneous_opr = pd.DataFrame({"real_operator_name": ["dummy_operator"]})
    miscellaneous_opr["new_opr_name"] = None

# modifying rig data for miscellaneous category
rig_data.loc[
    rig_data.operator.isin(miscellaneous_opr.real_operator_name.unique()), "operator"
] = "Miscellaneous"

# The widget delivers the literal string "null" when no rig count was given;
# in that case the active rigs are left as they are.
if user_input_num_of_rigs != "null":

    user_input_num_of_rigs = int(user_input_num_of_rigs)

    if user_input_num_of_rigs > current_rigs:
        # More rigs requested than active: draw operators for the extra rigs.
        rigs_to_allocate = int(user_input_num_of_rigs - current_rigs)
        area_of_each_opr = get_rigs_distribution_while_allocating(
            rigs_to_allocate, area_of_each_opr
        )

        rig_data = allocate_rigs(area_of_each_opr, rig_data, desired_active_rig_date)

    elif user_input_num_of_rigs < current_rigs:
        # Fewer rigs requested: draw a full target distribution and move the
        # current rigs towards it.
        rigs_to_deallocate_df = get_rigs_distribution_while_allocating(
            user_input_num_of_rigs, area_of_each_opr
        )
        rig_data = deallocate_and_reallocate_rigs(
            rigs_to_deallocate_df, rig_data, desired_active_rig_date
        )

In [0]:
# Rename SpudDate to spud_date, the column name selected and written below.
rig_data.rename({"SpudDate": "spud_date"}, inplace=True, axis=1)

In [0]:
# Keep only the output columns, in the order they are written.
rig_data = rig_data[
    [
        "date",
        "rig_id",
        "api10",
        "reservoir_gold_consolidated",
        "BasinQuantum",
        "operator",
        "typeCurveArea",
        "FlowUnit_Analog",
        "spud_date",
        "rig_release_date",
    ]
]

In [0]:
# Convert rig_release_date to pandas datetimes; the column is filled from
# both the release loop and the allocation helpers.
rig_data["rig_release_date"] = pd.to_datetime(rig_data["rig_release_date"])

In [0]:
# Rows added for newly allocated rigs have no basin / flow unit; fill them
# with the values this run was filtered on. Assign the result back instead of
# calling fillna(inplace=True) on a column selection, which pandas does not
# guarantee to write through to the DataFrame.
rig_data["BasinQuantum"] = rig_data["BasinQuantum"].fillna(basin_of_interest)
rig_data["FlowUnit_Analog"] = rig_data["FlowUnit_Analog"].fillna(flowunit_of_interest)

In [0]:
# Tag both output frames with the scenario; the target tables are cleared
# and filtered by scenario_id.
rig_data["scenario_id"] = scenario_id
miscellaneous_opr["scenario_id"] = scenario_id

# Creating Tables

In [0]:
# Remove rows previously written for this scenario from both output tables,
# so the appends below do not add a second copy on a re-run.
# NOTE(review): scenario_id is interpolated into the SQL text and comes from
# a notebook widget — confirm it is trusted.
spark.sql(
    f"""
          delete from produced.rig_model_miscellaneous_opr where scenario_id = "{scenario_id}"
          """
)
spark.sql(
    f"""
          delete from produced.rig_model_table where scenario_id = "{scenario_id}"
          """
)

DataFrame[num_affected_rows: bigint]

In [0]:
# Append the pooled-operator mapping to its Delta table (schema merge enabled).
spark.createDataFrame(miscellaneous_opr).write.format("delta").option(
    "mergeSchema", "true"
).mode("append").saveAsTable(f"produced.rig_model_miscellaneous_opr")

In [0]:
# Parse the remaining date columns to pandas datetimes before writing.
rig_data["date"] = pd.to_datetime(rig_data["date"])
rig_data["spud_date"] = pd.to_datetime(rig_data["spud_date"])

In [0]:
# Reduce "date" to calendar dates (datetime.date objects); spud_date and
# rig_release_date stay as datetimes.
rig_data["date"] = rig_data["date"].dt.date

In [0]:
# Append the final rig table for this scenario to its Delta table (schema merge enabled).
spark.createDataFrame(rig_data).write.format("delta").option(
    "mergeSchema", "true"
).mode("append").saveAsTable(f"produced.rig_model_table")