In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression





**Reading the dataset**

In [3]:
# Load the Volve dataset from the Excel workbook (path is relative to the working directory).
DATA_FILE = "Volve_dataframe.xlsx"
df = pd.read_excel(DATA_FILE)

## Check the null values

In [4]:
# List every column that has at least one missing value.
# The null count is computed once for the whole frame; the threshold is > 0 so
# that a column with exactly one missing value is reported as well.
null_counts = df.isnull().sum()
features_na = null_counts[null_counts > 0].index.tolist()
features_na

['ON_STREAM_HRS',
 'AVG_DOWNHOLE_PRESSURE',
 'AVG_DOWNHOLE_TEMPERATURE',
 'AVG_DP_TUBING',
 'AVG_ANNULUS_PRESS',
 'AVG_CHOKE_SIZE_P',
 'AVG_CHOKE_UOM',
 'AVG_WHP_P',
 'AVG_WHT_P',
 'DP_CHOKE_SIZE',
 'BORE_OIL_VOL',
 'BORE_GAS_VOL',
 'BORE_WAT_VOL',
 'BORE_WI_VOL']

In [5]:
# Count missing values per column in the raw frame.
raw_null_counts = df.isnull().sum()
raw_null_counts

DATEPRD                        0
WELL_BORE_CODE                 0
NPD_WELL_BORE_CODE             0
NPD_WELL_BORE_NAME             0
NPD_FIELD_CODE                 0
NPD_FIELD_NAME                 0
NPD_FACILITY_CODE              0
NPD_FACILITY_NAME              0
ON_STREAM_HRS                285
AVG_DOWNHOLE_PRESSURE       6649
AVG_DOWNHOLE_TEMPERATURE    6649
AVG_DP_TUBING               6649
AVG_ANNULUS_PRESS           7704
AVG_CHOKE_SIZE_P            6715
AVG_CHOKE_UOM               6473
AVG_WHP_P                   6479
AVG_WHT_P                   6488
DP_CHOKE_SIZE                294
BORE_OIL_VOL                6473
BORE_GAS_VOL                6473
BORE_WAT_VOL                6473
BORE_WI_VOL                 9858
FLOW_KIND                      0
WELL_TYPE                      0
dtype: int64

In [6]:
# Drop the columns that are not carried into the analysis: the well / field /
# facility identifier columns plus several measurement, volume and categorical columns.
to_drop = [
    "WELL_BORE_CODE",
    "NPD_WELL_BORE_NAME",
    "NPD_FIELD_NAME",
    "NPD_FIELD_CODE",
    "NPD_FACILITY_CODE",
    "NPD_FACILITY_NAME",
    "AVG_DOWNHOLE_PRESSURE",
    "AVG_DP_TUBING",
    "BORE_WI_VOL",
    "AVG_CHOKE_UOM",
    "FLOW_KIND",
    "WELL_TYPE",
    "BORE_GAS_VOL",
]
newdf = df.drop(columns=to_drop)

In [7]:
# Data is highly skewed and depends on the well, hence missing values are
# forward-filled (each gap takes the last valid value above it) rather than
# replaced with a global statistic.
# DataFrame.ffill() replaces the deprecated fillna(method='pad') and fills all
# columns in one call.
# NOTE(review): the fill is not grouped by well, so a value can be carried from
# the last row of one well into the first rows of the next — confirm this is acceptable.
newdf = newdf.ffill()


In [8]:
# Re-count missing values per column after the fill step.
filled_null_counts = newdf.isnull().sum()
filled_null_counts

DATEPRD                     0
NPD_WELL_BORE_CODE          0
ON_STREAM_HRS               0
AVG_DOWNHOLE_TEMPERATURE    0
AVG_ANNULUS_PRESS           0
AVG_CHOKE_SIZE_P            0
AVG_WHP_P                   0
AVG_WHT_P                   0
DP_CHOKE_SIZE               0
BORE_OIL_VOL                0
BORE_WAT_VOL                0
dtype: int64

In [9]:
# Min-max scale the selected feature columns so they share a common range.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed later in the notebook, so test rows influence the scaling — confirm
# this is intended.

from sklearn.preprocessing import MinMaxScaler

scaled_columns = [
    'ON_STREAM_HRS',
    'AVG_DOWNHOLE_TEMPERATURE',
    'AVG_ANNULUS_PRESS',
    'AVG_CHOKE_SIZE_P',
    'AVG_WHP_P',
    'AVG_WHT_P',
]

scaler = MinMaxScaler()
newdf[scaled_columns] = scaler.fit_transform(newdf[scaled_columns])

In [10]:
# Keep only the rows for wells 2 to 4, since they were the ones with the least
# number of missing values. .loc slices by index label and includes both end points.
FIRST_ROW_LABEL = 746
LAST_ROW_LABEL = 8022
fdf = newdf.loc[FIRST_ROW_LABEL:LAST_ROW_LABEL]

In [11]:
# Preview the selected rows; the rendered output elides the middle of the frame.
fdf

Unnamed: 0,DATEPRD,NPD_WELL_BORE_CODE,ON_STREAM_HRS,AVG_DOWNHOLE_TEMPERATURE,AVG_ANNULUS_PRESS,AVG_CHOKE_SIZE_P,AVG_WHP_P,AVG_WHT_P,DP_CHOKE_SIZE,BORE_OIL_VOL,BORE_WAT_VOL
746,2013-07-21,7078,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.0
747,2013-07-22,7078,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.0
748,2013-07-23,7078,0.000000,0.000000,0.000000,0.011133,0.000000,0.000000,0.000000,0.00,0.0
749,2013-07-24,7078,0.280000,0.003246,0.096115,0.032565,0.204276,0.194575,18.851470,17.85,0.0
750,2013-07-25,7078,0.960000,0.555894,0.648388,0.085491,0.743722,0.395333,72.955190,725.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...
8018,2014-03-20,7289,0.960000,0.981121,0.645976,0.021706,0.325649,0.305422,15.093765,225.45,0.0
8019,2014-03-21,7289,0.960000,0.981321,0.657475,0.022446,0.323856,0.307348,14.764761,229.31,0.0
8020,2014-03-22,7289,0.536667,0.980281,0.634368,0.021768,0.319928,0.313633,14.232282,192.31,0.0
8021,2014-03-23,7289,0.960000,0.979535,0.548404,0.029806,0.318499,0.291931,13.878730,267.64,0.0


In [16]:
# Linear-regression baseline.
# Predictors: every column except the oil volume. The target frame keeps the
# well code next to the oil volume; the oil-volume column alone is extracted in
# a later cell.
reg_all = LinearRegression()

X = fdf.drop(columns=['BORE_OIL_VOL'])
y = fdf[['BORE_OIL_VOL', 'NPD_WELL_BORE_CODE']]

# 70/30 random split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [17]:
# Remove the date and well-code columns so that only the predictor columns are
# passed to the models.
non_feature_columns = ['DATEPRD', 'NPD_WELL_BORE_CODE']
x_train_final = X_train.drop(columns=non_feature_columns)
x_test_final = X_test.drop(columns=non_feature_columns)

In [18]:
# Extract the oil-volume column as the regression target for each split.
y_train_final, y_test_final = y_train['BORE_OIL_VOL'], y_test['BORE_OIL_VOL']

In [19]:
# y_test_final / y_train_final are already assigned in the previous cell; the
# identical re-assignments that used to sit here were redundant and are removed.
#
# Full-sample (unsplit) predictor array and target series.
# NOTE(review): X here still contains NPD_WELL_BORE_CODE, unlike x_train_final,
# and is a NumPy array rather than a DataFrame; neither X nor y is used by the
# cells visible below — confirm whether this rebinding is still needed.
X = fdf.drop(['DATEPRD','BORE_OIL_VOL'],axis=1).values
y= fdf['BORE_OIL_VOL']

In [20]:
# Inspect the training feature matrix after the date and well-code columns were dropped.
x_train_final

Unnamed: 0,ON_STREAM_HRS,AVG_DOWNHOLE_TEMPERATURE,AVG_ANNULUS_PRESS,AVG_CHOKE_SIZE_P,AVG_WHP_P,AVG_WHT_P,DP_CHOKE_SIZE,BORE_WAT_VOL
3374,0.96,0.000000,0.395510,0.977334,0.244223,0.974734,4.592777,4657.30
3760,0.96,0.000000,0.493936,1.000000,0.245396,0.983080,4.911603,4791.85
2900,0.96,0.000000,0.788277,0.520850,0.378589,0.946336,19.411079,3603.63
2090,0.96,0.976702,0.250367,0.349779,0.698305,0.759181,63.378641,3.96
7782,0.96,0.918301,0.743114,1.000000,0.203130,0.948506,1.848460,3203.30
...,...,...,...,...,...,...,...,...
5937,0.96,0.974822,0.000000,1.000000,0.299113,0.959814,8.772110,3578.90
5972,0.96,0.971136,0.000000,1.000000,0.286100,0.973663,9.420210,3669.41
6136,0.00,0.922988,0.000000,0.600923,0.765762,0.167259,95.059600,0.00
1606,0.96,0.977394,0.359498,1.000000,0.232684,0.859368,4.445737,1917.21


In [21]:
# Fit the baseline linear regression on the training split.
reg_all.fit(X=x_train_final, y=y_train_final)

In [22]:
# Report the baseline's R2 on the held-out test split.
baseline_test_r2 = reg_all.score(x_test_final, y_test_final)
print("The R2 value for linear regression for oil volume production is", baseline_test_r2)

The R2 value for linear regression for oil volume production is 0.5525268920997132


In [23]:
## Basic Import 
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [24]:
def evaluvate_model(true,predicted):
    """Compute MAE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [26]:
# Candidate regressors, keyed by display name.
# The scikit-learn tree-based / ensemble estimators are seeded so that repeated
# runs of this cell report the same scores; the remaining estimators are left
# exactly as originally configured.
models={
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

# Parallel lists: model display name and its test-set R2 (consumed by the results table).
model_list=[]
r2_list=[]

for model_name, model in models.items():
    model.fit(x_train_final,y_train_final)

    # Predict on both splits so a train/test gap is visible in the report.
    y_train_pred=model.predict(x_train_final)
    y_test_pred=model.predict(x_test_final)

    model_train_mae,model_train_rmse,model_train_r2=evaluvate_model(y_train_final,y_train_pred)
    model_test_mae,model_test_rmse,model_test_r2=evaluvate_model(y_test_final,y_test_pred)

    print(model)
    model_list.append(model_name)
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')
    

LinearRegression()
Model performance for Training set
- Root Mean Squared Error: 912.4416
- Mean Absolute Error: 699.1350
- R2 Score: 0.5462
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 911.2052
- Mean Absolute Error: 690.4637
- R2 Score: 0.5525


Lasso()
Model performance for Training set
- Root Mean Squared Error: 912.4794
- Mean Absolute Error: 699.0652
- R2 Score: 0.5462
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 911.3773
- Mean Absolute Error: 690.3463
- R2 Score: 0.5524


Ridge()
Model performance for Training set
- Root Mean Squared Error: 912.4541
- Mean Absolute Error: 699.3393
- R2 Score: 0.5462
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 911.4564
- Mean Absolute Error: 690.8664
- R2 Score: 0.5523


KNeighborsRegressor()
Model performance for Training set
- Root Mean Squared Error: 647.5740
- Mean Absolute Error: 355.4423
- R2 Scor

In [27]:
# Rank the models by test-set R2, best first.
model_scores = list(zip(model_list, r2_list))
results_unsorted = pd.DataFrame(model_scores, columns=['Model Name', 'R2_Score'])
df_results = results_unsorted.sort_values(by=["R2_Score"], ascending=False)
df_results

Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.990374
6,XGBRegressor,0.986195
5,Random Forest Regressor,0.985036
4,Decision Tree,0.969726
8,AdaBoost Regressor,0.908295
3,K-Neighbors Regressor,0.650974
0,Linear Regression,0.552527
1,Lasso,0.552358
2,Ridge,0.55228


### Tuning CatBoostRegressor

In [28]:
# Initialize the CatBoost regressor to be tuned
cbr = CatBoostRegressor(verbose=False)

# Hyper-parameter search space
param_dist = {'depth'          : [4,5,6,7,8,9, 10],
              'learning_rate' : [0.01,0.02,0.03,0.04],
              'iterations'    : [300,400,500,600]}

# Randomized search with 5-fold cross-validation, scored by R2.
# random_state fixes which parameter combinations are sampled, so re-running
# the cell evaluates the same candidates.
rscv = RandomizedSearchCV(cbr , param_dist, scoring='r2', cv =5, n_jobs=-1, random_state=42)

# Fit the search on the training split only
rscv.fit(x_train_final,y_train_final)

# Print the tuned parameters and the best mean cross-validated score
print(rscv.best_params_)
print(rscv.best_score_)

{'learning_rate': 0.04, 'iterations': 500, 'depth': 7}
0.9850837261650748


In [30]:
## Helper that prints train / test metrics for an already-fitted model
def print_evaluated_model_result(model,xtrain,ytrain,xtest,ytest):
    """Print RMSE, MAE and R2 of ``model`` on the training and test data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    xtrain, ytrain : training features and targets
    xtest, ytest : test features and targets

    Returns
    -------
    None
        The metrics are written to standard output only.
    """
    # Predict on both splits first, then score them.
    train_predictions = model.predict(xtrain)
    test_predictions = model.predict(xtest)

    train_scores = evaluvate_model(ytrain, train_predictions)
    test_scores = evaluvate_model(ytest, test_predictions)

    def _report(header, scores):
        # evaluvate_model returns (mae, rmse, r2); RMSE is printed first.
        mae, rmse, r2 = scores
        print(header)
        print("- Root Mean Squared Error: {:.4f}".format(rmse))
        print("- Mean Absolute Error: {:.4f}".format(mae))
        print("- R2 Score: {:.4f}".format(r2))

    _report('Model performance for Training set', train_scores)
    print('----------------------------------')
    _report('Model performance for Test set', test_scores)

In [31]:
# Take the best estimator found by the randomized search
best_cbr = rscv.best_estimator_

# Report its metrics on the training and test splits
print_evaluated_model_result(
    model=best_cbr,
    xtrain=x_train_final,
    ytrain=y_train_final,
    xtest=x_test_final,
    ytest=y_test_final,
)

Model performance for Training set
- Root Mean Squared Error: 109.7917
- Mean Absolute Error: 70.3641
- R2 Score: 0.9934
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 147.8571
- Mean Absolute Error: 85.7525
- R2 Score: 0.9882
