In [None]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


import warnings

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.model_selection import GridSearchCV

In [3]:
import pandas as pd

# Load the well dataset; the relative path is resolved against the current
# working directory of the kernel.
data = pd.read_csv("well_data_standard.csv")

In [7]:
# Fixed seed so the partition -- and every score derived from it further
# down -- is identical on each Restart & Run All. Without random_state the
# split is redrawn on every execution.
train, test = train_test_split(data, random_state=42)
train.shape

(375, 45)

In [9]:
# Columns routed to the StandardScaler branch of the preprocessor.
# The identifier keeps its original spelling ("fetures") because the
# ColumnTransformer cell refers to it by this exact name.
# NOTE(review): several names (e.g. ranking_score, economic_efficiency) look
# like derived metrics -- confirm none of them is computed from the target.
numerical_fetures = [
    'permeability_md',
    'porosity_fraction',
    'net_to_gross',
    'thickness_ft',
    'well_depth_ft',
    'tubing_diameter_in',
    'choke_size_64th',
    'reservoir_pressure_psi',
    'reservoir_temp_f',
    'bottomhole_pressure_psi',
    'wellhead_pressure_psi',
    'oil_gravity_api',
    'gas_oil_ratio_scf_bbl',
    'water_cut_fraction',
    'fvf_oil',
    'oil_viscosity_cp',
    'oil_rate_bbl_day',
    'gas_rate_scf_day',
    'water_rate_bbl_day',
    'productivity_index',
    'oil_price_usd_bbl',
    'gas_price_usd_mcf',
    'daily_opex_usd',
    'drilling_cost_usd',
    'completion_cost_usd',
    'total_capex_usd',
    'daily_revenue_usd',
    'oil_cut',
    'profit_per_barrel',
    'production_efficiency',
    'economic_efficiency',
    'ranking_score',
    'well_age_days',
    'production_months',
    'days_since_workover',
    'pressure_drawdown',
    'total_liquid_rate',
    'productivity_factor',
]

# Columns routed to the OneHotEncoder branch of the preprocessor.
cat_features = [
    'well_type',
    'completion_type',
    'artificial_lift',
    'depth_category',
]

In [17]:
# Numeric columns are standardised; categorical columns are one-hot encoded.
num_transformer = StandardScaler()
# handle_unknown="ignore": a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising during transform().
# With a random split, a rare category can end up only in the test rows.
oh_transformer = OneHotEncoder(handle_unknown="ignore")

# Only the columns named in the two lists are passed through; the transformer
# names below are labels for the steps, not class references.
preprocessor = ColumnTransformer([
    ("OneHotEncoder", oh_transformer, cat_features),
    ("StandardScaler", num_transformer, numerical_fetures),
])

# Name of the column the regressors are trained to predict.
target_column_name = 'performance_index'

In [18]:
# Separate predictors from the target, independently for each split.
input_feature_train_df = train.drop(columns=target_column_name)
target_feature_train_df = train[target_column_name]

input_feature_test_df = test.drop(columns=target_column_name)
target_feature_test_df = test[target_column_name]

In [20]:
# Training split: fit the preprocessor here (and only here), then append the
# target as the last column of the transformed matrix.
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]

# Test split: reuse the already-fitted preprocessor, same column layout.
input_feature_test_arr = preprocessor.transform(input_feature_test_df)
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

In [21]:
# The target was appended as the last column, so everything before it is the
# feature matrix and the final column is the label vector.
X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

In [25]:
# Candidate regressors, keyed by the display name used in the score report.
models = {
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Linear Regression": LinearRegression(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}

# Hyper-parameter grids handed to GridSearchCV. evaluate_models looks a grid up
# by the model's display name, so every key of `models` needs an entry here
# (an empty dict means "fit with the estimator's current settings").
params = {
    "Decision Tree": {
        "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        # Left out of the search: splitter, max_features.
    },
    "Random Forest": {
        # Left out of the search: criterion, max_features.
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
    "Gradient Boosting": {
        # Left out of the search: loss, criterion, max_features.
        "learning_rate": [0.1, 0.01, 0.05, 0.001],
        "subsample": [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
    "Linear Regression": {},
    "XGBRegressor": {
        "learning_rate": [0.1, 0.01, 0.05, 0.001],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
    "CatBoosting Regressor": {
        "depth": [6, 8, 10],
        "learning_rate": [0.01, 0.05, 0.1],
        "iterations": [30, 50, 100],
    },
    "AdaBoost Regressor": {
        # NOTE(review): 0.5 here, whereas the other learning-rate grids use
        # 0.05 -- confirm this value is intended.
        "learning_rate": [0.1, 0.01, 0.5, 0.001],
        # Left out of the search: loss.
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
}

In [35]:
def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """Tune, refit and score every candidate estimator.

    For each entry of ``models`` a 3-fold ``GridSearchCV`` is run over the
    matching grid in ``param``. The estimator is then updated in place with
    the winning hyper-parameters, fitted on the full training data and scored
    on the test data with ``r2_score``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Test feature matrix.
        y_test: Test target values.
        models: Mapping of display name -> estimator. The estimators are
            mutated: on return each one carries its best parameters and has
            been fitted on ``X_train`` / ``y_train``.
        param: Mapping of display name -> parameter grid for ``GridSearchCV``.
            Must contain a key for every name in ``models``.

    Returns:
        dict: display name -> ``r2_score`` on the test data, in the iteration
        order of ``models``.

    Raises:
        KeyError: If ``param`` has no entry for one of the model names.
    """
    report = {}

    for name, model in models.items():
        # refit=False: the search is only used for best_params_; the model is
        # fitted explicitly below, so letting GridSearchCV refit its own clone
        # would train the winning configuration twice.
        gs = GridSearchCV(model, param[name], cv=3, refit=False)
        gs.fit(X_train, y_train)

        # Fit the caller's estimator object itself so `models` ends up holding
        # tuned, trained estimators.
        model.set_params(**gs.best_params_)
        model.fit(X_train, y_train)

        report[name] = r2_score(y_test, model.predict(X_test))

    return report
    
    


In [36]:
# Tune and score every candidate; the result maps model name -> test-set R^2.
model_report: dict = evaluate_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
    param=params,
)

In [37]:
# Display the test-set R^2 recorded for each tuned model.
model_report

{'Random Forest': 0.8964156691662394,
 'Decision Tree': 0.9417430998579491,
 'Gradient Boosting': 0.9552364939521492,
 'Linear Regression': 0.9976697039874478,
 'XGBRegressor': 0.9314832752373531,
 'CatBoosting Regressor': 0.7744449963033646,
 'AdaBoost Regressor': 0.9572836415932815}

In [38]:
# Highest test-set score among all evaluated models.
best_model_score = max(model_report.values())

In [39]:
# Show the best score found above.
best_model_score

0.9976697039874478

In [45]:
# Inspect the learning-rate grid searched for Gradient Boosting.
params['Gradient Boosting']['learning_rate']


[0.1, 0.01, 0.05, 0.001]

In [46]:
# Inspect the target values of the training split.
target_feature_train_df

495    424.188367
62      93.646059
414     94.527022
487     84.340429
230    116.133556
          ...    
19      98.265632
439    104.031577
77     108.585683
419    175.928941
102    130.342489
Name: performance_index, Length: 375, dtype: float64