<a href="https://colab.research.google.com/github/singhritu7116/ongc/blob/main/EUR_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMPORTING LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,StandardScaler,power_transform
from scipy.stats import spearmanr
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

In [None]:
# Colab-only step: expose Google Drive under /content/drive so the CSV below can be read.
from google.colab import drive
drive.mount('/content/drive')

# **LOADING DATASET**

In [None]:
# Load the shale-gas well table from the mounted Drive into a DataFrame.
# NOTE(review): hard-coded absolute Colab path — the notebook only runs where this Drive layout exists.
data= pd.read_csv("/content/drive/MyDrive/Chapter6_Shale Gas Wells.csv")

Data Analysis

In [None]:
data.shape

In [None]:
# Features: the first 13 columns by position; target: the last column.
# NOTE(review): presumably the 13 well parameters followed by EUR — confirm the CSV column order.
X = data.iloc[:,0:13]
EUR = data.iloc[:,-1]
# Cast the target to integers (any fractional part is discarded by the cast).
EUR=EUR.astype('int')

In [None]:
data.head()

# **FEATURE SELECTION**

### 1. **USING SELECT KBEST**

In [None]:
# Univariate scoring of every feature against EUR.
# chi2 is a test for categorical class labels (and requires non-negative features);
# EUR is a continuous quantity, so the regression scorer f_regression is used instead.
from sklearn.feature_selection import f_regression

# k=13 keeps all 13 features: the goal here is the ranking in `scores_`, not filtering.
BestFeatures = SelectKBest(score_func=f_regression, k=13)
fit = BestFeatures.fit(X,EUR)


In [None]:
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)

In [None]:
# Feature scores: names and scores side by side, with the two columns relabelled.
f_Scores = pd.concat([df_columns, df_scores], axis=1).set_axis(['Specs', 'Score'], axis=1)

In [None]:
f_Scores

In [None]:
print(f_Scores.nlargest(13,'Score'))

### **2. USING CORRELATION HEATMAP**

In [None]:
import seaborn as sns

# Pairwise correlation matrix of the dataset columns (pandas default method).
corrmat = data.corr()
top_corr_features = corrmat.index

# Annotated heat map on a 13x13 inch canvas; red = negative, green = positive.
plt.figure(figsize=(13, 13))
g = sns.heatmap(
    data[top_corr_features].corr(),
    annot=True,
    cmap="RdYlGn",
)

### **3. USING SPEARMAN RANKING CORRELATION**

*   +1: perfect positive (monotonic) correlation
*   0: no correlation
*   -1: perfect negative (monotonic) correlation








3.1 Correlation Ranking for Stage Spacing

In [None]:
newdf1=data[['Stage Spacing','EUR']]
newdf1.corr(method='spearman')

In [None]:
spearmanr(newdf1)

3.2 Correlation Ranking for all parameters

In [None]:
# Names of the 13 feature columns compared against EUR below.
Col1 = [
    'Stage Spacing',
    'bbl/ft',
    'Well Spacing',
    'Dip',
    'Thickness',
    'Lateral Length',
    'Injection Rate',
    'Porosity',
    'ISIP',
    'Water Saturation',
    'Percentage of LG',
    'Pressure Gradient',
    'Proppant Loading',
]

In [None]:
# For each feature, report its Spearman rank correlation with EUR twice:
# once as a pandas correlation matrix and once as scipy's spearmanr result.
for x in Col1:
  df = data.loc[:, [x, 'EUR']]
  rank_corr = df.corr(method='spearman')
  print("--------correlation between",x,"and EUR---------")
  print('\n')
  print(rank_corr)
  print(spearmanr(df))
  print('----------------------------------------------------')
  print('\n')

# **EVALUATING PERFORMANCE OF DIFFERENT MODELS OVER THE GIVEN DATASET**

## **1. USING UN-NORMALIZED DATA**

In [None]:
# Feature matrix: every column except the target; target: the EUR column.
X = data.drop(columns='EUR')
eur = data.loc[:, 'EUR']

### SPLITTING DATA

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# 60/20/20 split: hold out 20% of the rows as the test set, then take 25% of the
# remaining 80% as a validation set (x_cv / y_cv).
# random_state is fixed so the split — and every metric computed from it — is
# reproducible across kernel restarts.
X_train, x_test, y_train, y_test = train_test_split(X,eur,test_size=0.2,train_size=0.8,random_state=42)
x_train, x_cv, y_train, y_cv = train_test_split(X_train,y_train,test_size = 0.25,train_size =0.75,random_state=42)

In [None]:
x_test.shape

In [None]:
x_train.shape

In [None]:
x_cv.shape

## **MODEL 1 : RANDOM FOREST REGRESSOR**

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
regressor2 = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [None]:
regressor2.fit(x_train, y_train)

In [None]:
# Pair each column label of X with the forest's importance for it, ordered from
# least to most important so the largest bar ends up at the top of the chart.
f_i = sorted(zip(X, regressor2.feature_importances_), key=lambda pair: pair[1])
plt.barh([name for name, _ in f_i], [score for _, score in f_i])

plt.show()

In [None]:
y_pred2 = regressor2.predict(x_test)

In [None]:
y_test

In [None]:
x_test

In [None]:
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred2})
df

### **Testing MODEL-1 Over Random Data**

In [None]:
x_cv.head()

In [None]:
y_cv.head()

In [None]:
# One hand-written well used to try the fitted model on a single sample.
# Keys must match the training column names exactly ('bbl/ft', not 'bbi/ft'),
# otherwise the feature names seen at predict time differ from those seen at fit time.
data_pred={'Stage Spacing': 140,
      'bbl/ft': 36,
      'Well Spacing': 950,
      'Dip':0,
      'Thickness': 166,
      'Lateral Length': 7783,
      'Injection Rate': 59,
      'Porosity': 7.7,
      'ISIP': 7643,
      'Water Saturation': 17.4,
      'Percentage of LG': 37.1,
      'Pressure Gradient': 0.94,
      'Proppant Loading': 2531,
      }



In [None]:
import pandas as pd

In [None]:
df=pd.DataFrame(data_pred,index=[0])
df


In [None]:
new_pred=regressor2.predict(df)
print(new_pred[0])

### **Plotting Predicted Data Against Actual Data**

In [None]:
import seaborn as sns
plt.figure(figsize=(5, 7))

# Overlay the kernel-density estimates of the true and predicted EUR values.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
ax = sns.kdeplot(y_test, color="r", label="Actual Value")
sns.kdeplot(y_pred2, color="b", label="Fitted Values", ax=ax)

plt.title('Actual vs Fitted Values for EUR')

plt.legend()
plt.show()
plt.close()

### **Calculating MAE and Accuracy and R2 Score of the Model**

In [None]:
# Absolute error of each test prediction, in the same units as EUR.
errors = abs(y_pred2 - y_test)
# Mean absolute error (MAE); the former 'degrees.' unit did not apply to EUR.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE), relative to the true EUR of each well.
mape = 100 * (errors / y_test)
# "Accuracy" is reported as 100 - mean MAPE.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred2)

## **MODEL 2 : Support Vector Machine(SVM) Model**

In [None]:
from sklearn.svm import SVC
from sklearn import svm

In [None]:
# Support-vector regressor with a 4th-degree polynomial kernel, fitted on the
# training split and then used to predict the held-out test rows.
svc_model = svm.SVR(degree=4, kernel='poly')
svc_model.fit(x_train, y_train)

prediction = svc_model.predict(x_test)

In [None]:
print(svc_model.score(x_train, y_train))
print(svc_model.score(x_test, y_test))

### **Plotting Predicted Data Against Actual Data**

In [None]:
plt.figure(figsize=(5, 7))

# Overlay the kernel-density estimates of the true and predicted EUR values.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
ax = sns.kdeplot(y_test, color="r", label="Actual Value")
sns.kdeplot(prediction, color="b", label="Fitted Values", ax=ax)

plt.title('Actual vs Fitted Values for EUR')

plt.legend()
plt.show()
plt.close()

### **Calculating MAE and Accuracy and R2 Score of the Model**

In [None]:
# Absolute error of each test prediction, in the same units as EUR.
errors = abs(prediction - y_test)
# Mean absolute error (MAE); the former 'degrees.' unit did not apply to EUR.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE), relative to the true EUR of each well.
mape = 100 * (errors / y_test)
# "Accuracy" is reported as 100 - mean MAPE, rounded like the other model reports.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, prediction)

In [None]:
x_cv

In [None]:
y_cv

In [None]:
# One hand-written well used to try the fitted SVR on a single sample.
# Keys must match the training column names exactly ('bbl/ft', not 'bbi/ft').
data_pred={'Stage Spacing': 140,
      'bbl/ft': 36,
      'Well Spacing': 950,
      'Dip':0,
      'Thickness': 166,
      'Lateral Length': 7783,
      'Injection Rate': 59,
      'Porosity': 7.7,
      'ISIP': 7643,
      'Water Saturation': 17.4,
      'Percentage of LG': 37.1,
      'Pressure Gradient': 0.94,
      'Proppant Loading': 2531,
      }

In [None]:
df=pd.DataFrame(data_pred,index=[0])
df

In [None]:
new_pred=svc_model.predict(df)
print(new_pred[0])

# **NORMALIZATION OF DATASET**

### USING LAMBDA FUNCTION

In [None]:
col_norm=['Stage Spacing', 'bbl/ft', 'Well Spacing', 'Dip', 'Thickness','Lateral Length', 'Injection Rate', 'Porosity', 'ISIP','Water Saturation', 'Percentage of LG', 'Pressure Gradient',
       'Proppant Loading']
data_norm=data[col_norm]
data_norm.head()

In [None]:
# Min-max scale each feature column independently: (x - min) / (max - min).
# NOTE(review): min and max are taken over ALL rows, i.e. before the train/test split
# further down, so test rows influence the scaling — confirm this is acceptable.
data_norm1=data[col_norm].apply(lambda x:((x-x.min())/(x.max()-x.min())))

In [None]:
data_norm1.head()

## **2. USING NORMALIZED DATA**

In [None]:
# Feature matrix: the min-max scaled columns without 'Pressure Gradient';
# the target stays the unscaled EUR column.
X=data_norm1.drop(['Pressure Gradient'],axis=1)
eur=data['EUR']

# Same 60/20/20 split as for the un-normalized run; random_state is fixed so the
# split and the metrics derived from it are reproducible.
X_train, x_test, y_train, y_test = train_test_split(X,eur,test_size=0.2,train_size=0.8,random_state=42)
x_train, x_cv, y_train, y_cv = train_test_split(X_train,y_train,test_size = 0.25,train_size =0.75,random_state=42)


### **Using Random Forest Regressor**




In [None]:
regressor2 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
regressor2.fit(x_train, y_train)
y_pred2 = regressor2.predict(x_test)
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred2})
df

### **Plotting Predicted Data Against Actual Data**

In [None]:
import seaborn as sns
plt.figure(figsize=(5, 7))

# Overlay the kernel-density estimates of the true and predicted EUR values.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
ax = sns.kdeplot(y_test, color="r", label="Actual Value")
sns.kdeplot(y_pred2, color="b", label="Fitted Values", ax=ax)

plt.title('Actual vs Fitted Values for EUR')

plt.legend()
plt.show()
plt.close()

### **Calculating MAE and Accuracy and R2 Score of the Model**

In [None]:
# Absolute error of each test prediction, in the same units as EUR.
errors = abs(y_pred2 - y_test)
# Mean absolute error (MAE); the former 'degrees.' unit did not apply to EUR.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE), relative to the true EUR of each well.
mape = 100 * (errors / y_test)
# "Accuracy" is reported as 100 - mean MAPE.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred2)

### **Using Support Vector Machine**

In [None]:
svc_model = svm.SVR(kernel='poly',degree=4)
svc_model.fit(x_train, y_train)

prediction = svc_model .predict(x_test)
print(svc_model.score(x_train, y_train))
print(svc_model.score(x_test, y_test))

### **Plotting Predicted Data Against Actual Data**

In [None]:
plt.figure(figsize=(5, 7))

# Overlay the kernel-density estimates of the true and predicted EUR values.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
ax = sns.kdeplot(y_test, color="r", label="Actual Value")
sns.kdeplot(prediction, color="b", label="Fitted Values", ax=ax)

plt.title('Actual vs Fitted Values for EUR')

plt.legend()
plt.show()
plt.close()

### **Calculating MAE and Accuracy and R2 Score of the Model**

In [None]:
# Absolute error of each test prediction, in the same units as EUR.
errors = abs(prediction - y_test)
# Mean absolute error (MAE); the former 'degrees.' unit did not apply to EUR.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE), relative to the true EUR of each well.
mape = 100 * (errors / y_test)
# "Accuracy" is reported as 100 - mean MAPE.
accuracy = 100 - np.mean(mape)
# round(...) was missing here, so a tuple (accuracy, 2) used to be printed.
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
from sklearn.metrics import r2_score

r2_score(y_test, prediction)

In [None]:
print(y_test)

In [None]:
# A second hand-written well (raw, unscaled values) for the 12-feature models.
# Keys must match the dataset column names exactly ('bbl/ft', not 'bbi/ft').
data_pred3={'Stage Spacing': 140,
      'bbl/ft': 30,
      'Well Spacing': 950,
      'Dip':0,
      'Thickness': 178,
      'Lateral Length': 7942,
      'Injection Rate': 76,
      'Porosity': 7,
      'ISIP': 7783,
      'Water Saturation': 16.9,
      'Percentage of LG': 43.9,
      #'Pressure Gradient': 0.95,
      'Proppant Loading': 2803,
      }

In [None]:
df2=pd.DataFrame(data_pred3,index=[0])
df2

## **MODEL 3 : DECISION TREE REGRESSOR**

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
regressor= DecisionTreeRegressor(criterion='squared_error',splitter='best', random_state=142)
regressor.fit(x_train, y_train)

In [None]:
y_pred3= regressor.predict(x_test)
df3=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred3})
df3

### **Plotting Predicted Data Against Actual Data**

In [None]:
import seaborn as sns
plt.figure(figsize=(5, 7))

# Overlay the kernel-density estimates of the true and predicted EUR values.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
ax = sns.kdeplot(y_test, color="r", label="Actual Value")
sns.kdeplot(y_pred3, color="b", label="Fitted Values", ax=ax)

plt.title('Actual vs Fitted Values for EUR')

plt.legend()
plt.show()
plt.close()

In [None]:
from sklearn.tree import plot_tree

# Shallow tree (at most 3 levels) fitted only so that it can be drawn legibly.
# random_state matches the full-depth regressor above so the drawing is reproducible.
reg = DecisionTreeRegressor(max_depth=3, random_state=142)
reg.fit(x_train, y_train)

plt.figure(figsize=(25,10))
# Feature names are taken from the training frame instead of a hand-copied list,
# so they cannot drift from the columns the tree was fitted on.
# class_names is omitted: it only applies to classifiers.
a = plot_tree(reg,
              feature_names=list(x_train.columns),
              filled=True,
              rounded=True,
              fontsize=14)

### **Calculating MAE and Accuracy and R2 Score of the Model**

In [None]:
# Absolute error of each test prediction, in the same units as EUR.
errors = abs(y_pred3 - y_test)
# Mean absolute error (MAE); the former 'degrees.' unit did not apply to EUR.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE), relative to the true EUR of each well.
mape = 100 * (errors / y_test)
# "Accuracy" is reported as 100 - mean MAPE.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
r2_score(y_test, y_pred3)

In [None]:
x_test

In [None]:
print(y_test)

In [None]:
# Raw (unscaled) inputs for one hand-written well.
# Keys must match the dataset column names exactly ('bbl/ft', not 'bbi/ft').
raw_well={'Stage Spacing': 140,
      'bbl/ft': 36,
      'Well Spacing': 950,
      'Dip':0,
      'Thickness': 166,
      'Lateral Length': 7783,
      'Injection Rate': 59,
      'Porosity': 7.7,
      'ISIP': 7643,
      'Water Saturation': 17.4,
      'Percentage of LG': 37.1,
      #'Pressure Gradient': 0.94,
      'Proppant Loading': 2531,
      }

# The decision tree was fitted on min-max scaled features, so the sample has to be
# scaled with the same per-column min/max of the full dataset before predicting;
# feeding raw values would put every feature far outside the range seen in training.
sample_columns = list(raw_well)
column_min = data[sample_columns].min()
column_max = data[sample_columns].max()
data_pred = ((pd.Series(raw_well) - column_min) / (column_max - column_min)).to_dict()

In [None]:
df=pd.DataFrame(data_pred,index=[0])
df

In [None]:
new_pred=regressor.predict(df)
print(new_pred[0])

## **MODEL 4 : NEURAL NETWORK**

### IMPORTING LIBRARIES

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd
import matplotlib.pyplot as plt

### SPLITTING DATA

In [None]:
# Drop 'Pressure Gradient' for the neural-network experiments. errors='ignore' keeps
# the cell idempotent: re-running it after the column is already gone no longer raises.
data=data.drop(columns=['Pressure Gradient'], errors='ignore')
# 70% of the rows (fixed seed) for training; the remaining rows form the test set.
train_dataset = data.sample(frac=0.7, random_state=0)
test_dataset = data.drop(train_dataset.index)

In [None]:
train_dataset.describe().transpose()

### SEPARATING THE LABELS (EUR) FROM THE FEATURES IN THE TRAIN AND TEST DATA

In [None]:
# Split each subset into its feature columns and its EUR label column,
# leaving train_dataset / test_dataset themselves untouched.
train_labels = train_dataset['EUR']
test_labels = test_dataset['EUR']
train_features = train_dataset.drop(columns='EUR')
test_features = test_dataset.drop(columns='EUR')

### NORMALIZATION OF DATA

In [None]:
# Per-column mean/std summary; not the last expression of the cell, so it is not displayed.
train_dataset.describe().transpose()[['mean', 'std']]
# Keras preprocessing layer that standardises inputs along axis 1 (the feature axis of a 2-D batch).
normalizer = tf.keras.layers.Normalization(axis=1)
# Learn the normalisation statistics from the training features only.
normalizer.adapt(np.array(train_features))

### CREATING LAYERS

In [None]:
from tensorflow.keras import layers

# Normalisation layer followed by two 12-unit ReLU layers and a single linear output.
# The former input_shape=train_features.shape on the first Dense layer was wrong on two
# counts: it included the number of samples, and it sat on a non-first layer. The input
# width is already fixed by the adapted normalizer, so the argument is simply removed.
model = keras.Sequential([
      normalizer,
      layers.Dense(12, activation='relu'),
      layers.Dense(12, activation='relu'),
      layers.Dense(1)
  ])

### COMPILING MODEL

In [None]:
# Train with Adam (learning rate 0.001) on mean squared error, and also track MSE as a metric.
adam = tf.keras.optimizers.Adam(0.001)
model.compile(
    optimizer=adam,
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)


### TRAINING MODEL

In [None]:
history = model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=1, epochs=100)
# set verbose = 1 or 2 for visualisation of epochs

### LOSS VISUALIZATION OVER EPOCHS

In [None]:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.ylim([0, 10])
plt.xlabel('Epoch')
plt.ylabel('Error [EUR]')
plt.legend()
plt.grid(True)

### MODEL EVALUATION OVER TEST DATA

In [None]:
test_results = model.evaluate(test_features, test_labels, verbose=0)
test_results

In [None]:
test_predictions = model.predict(test_features).flatten()

### SCATTER PLOT : PREDICTED V/S ACTUAL DATA POINTS

In [None]:
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [EUR]')
plt.ylabel('Predictions [EUR]')
lims = [0, 40]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

### R2 SCORE FOR EVALUATING PERFORMANCE OF MODEL

In [None]:
from sklearn.metrics import r2_score
print("R2 score : %.2f" % r2_score(test_labels,test_predictions))

In [None]:
# Absolute error of each neural-network prediction, in the same units as EUR.
errors = abs(test_predictions - test_labels)
# Mean absolute error (MAE); the former 'degrees.' unit did not apply to EUR.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE). The denominator must be test_labels — the
# labels these predictions belong to — not y_test from the earlier scikit-learn split,
# whose rows (and index) are different.
mape = 100 * (errors / test_labels)
# "Accuracy" is reported as 100 - mean MAPE.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

### SAVING TRAINED MODEL

In [None]:
!mkdir -p saved_model
model.save('saved_model/EURmodel')

In [None]:
!ls saved_model
!ls saved_model/EURmodel

### LOADING TRAINED MODEL FOR PREDICTION

In [None]:
Loaded_model = tf.keras.models.load_model('saved_model/EURmodel')

In [None]:
test_features

In [None]:
test_labels

In [None]:
# One hand-written well (raw values; the network normalises its inputs itself).
# Keys follow the dataset column names exactly ('bbl/ft', not 'bbi/ft').
data_pred={'Stage Spacing': 140,
      'bbl/ft': 36,
      'Well Spacing': 950,
      'Dip':0,
      'Thickness': 166,
      'Lateral Length': 7783,
      'Injection Rate': 59,
      'Porosity': 7.7,
      'ISIP': 7643,
      'Water Saturation': 17.4,
      'Percentage of LG': 37.1,
      #'Pressure Gradient': 0.94,
      'Proppant Loading': 2531,
      }



In [None]:
df=pd.DataFrame(data_pred,index=[0])
df

In [None]:
# Predict with the model that was just re-loaded from disk (the point of this section),
# rather than with the in-memory `model` it was saved from.
new_pred=Loaded_model.predict(df)
print(new_pred[0])

Hyperparameter tuning using Keras Tuner

In [None]:
# from sklearn.model_selection import GridSearchCV


In [None]:
# from numpy.ma.core import size
# layers=[[20],[40,20],[45,30,15]]
# activations=['sigmoid','relu']
# param_grid=dict(layers=layers,activation=activations,batch_size=[128,256],epochs=[30])
# grid = GridSearchCV(estimator=model, param_grid=param_grid,scoring='f1_macro',cv=5,refit = True)

In [None]:
# grid_result = grid.fit(train_dataset, train_features)

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
!pip install -q -U keras-tuner

In [None]:
import keras_tuner as kt

In [None]:
def model_builder(hp):
  '''Build and compile a regression network whose hyperparameters are tunable.

  Tuned hyperparameters:
    units         - width of the first Dense layer, 32..512 in steps of 32
    learning_rate - Adam learning rate, one of 1e-2, 1e-3, 1e-4

  Args:
    hp - Keras tuner object used to declare and sample the hyperparameters

  Returns:
    A compiled keras.Sequential model with a single linear output unit,
    trained on (and reporting) mean squared error.
  '''
  model = keras.Sequential()

  # First layer: tunable width; the network takes 12 input features.
  hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
  model.add(keras.layers.Dense(units=hp_units, activation='relu',input_shape=(12,)))

  # Fixed remaining layers. keras.layers is referenced explicitly (instead of the bare
  # `layers` alias) because a later cell rebinds the name `layers` to a plain list.
  model.add(keras.layers.Dense(12, activation='relu'))
  model.add(keras.layers.Dense(1))

  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(loss='mean_squared_error',
                optimizer=tf.keras.optimizers.Adam(learning_rate = hp_learning_rate),
                metrics=['mean_squared_error'])

  return model


**BAYESIAN OPTIMISATION**

In [None]:
# Bayesian-optimisation tuner. The objective is the *validation* MSE (the search below
# passes validation_split), so trials are ranked on held-out data rather than on how
# well they fit the rows they were trained on.
tuner_bo = kt.BayesianOptimization(
            model_builder,
            objective='val_mean_squared_error',
            max_trials=10,                    # the number of hyperparameter combinations that will be tested by the tuner
            executions_per_trial=2            # the number of models that should be built and fit for each trial
        )

In [None]:
tuner_bo.search(train_features,train_labels, epochs=10, validation_split=0.2, verbose=0)
best_model = tuner_bo.get_best_models(num_models=1)[0]
best_model.evaluate(test_features, test_labels)

**HYPERBAND**

In [None]:
# Instantiate the tuner
tuner = kt.Hyperband(model_builder, # the hypermodel
                     objective='mean_squared_error', # objective to optimize
max_epochs=50,
factor=3, # factor which you have seen above
directory='dir', # directory to save logs
project_name='khyperband')

In [None]:
tuner.search_space_summary()

In [None]:
# Stop a trial once the validation MSE has not improved for 5 epochs. Monitoring the
# training metric instead would rarely trigger, since it keeps falling while over-fitting.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_mean_squared_error', patience=5)
# Perform hypertuning
tuner.search(train_features,train_labels, epochs=10, validation_split=0.2, callbacks=[stop_early])

In [None]:
best_hp=tuner.get_best_hyperparameters()[0]

h_model = tuner.hypermodel.build(best_hp)
h_model.fit(train_features, train_labels, epochs=10, validation_split=0.2)
h_model.summary()

In [None]:
h_eval_dict = h_model.evaluate(test_features, test_labels, return_dict=True)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
# Fix the random seeds for reproducibility. NumPy's seed alone does not cover
# TensorFlow's weight initialisation and dropout, so TensorFlow is seeded as well.
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# First 12 columns by position as features, last column as target.
# NOTE(review): presumably the 12 remaining features and EUR after 'Pressure Gradient'
# was dropped above — confirm; the grid search below is fitted on train_features instead.
X = data.iloc[:,0:12]
Y = data.iloc[:,-1]

In [None]:
# Wrap the already-built Keras model in a scikit-learn style estimator and list the
# parameter names it exposes.
# NOTE(review): this is a *classifier* wrapper around a model with a single linear output
# trained on MSE — a regressor wrapper looks like the intended choice; confirm.
model2 = KerasClassifier(model=model, verbose=0)
print(model2.get_params().keys())

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.layers import Dense, Activation, Embedding, Flatten, LeakyReLU, BatchNormalization, Dropout

In [None]:
from scikeras.wrappers import KerasRegressor


def create_model(layers, activation):
    """Build and compile a feed-forward regression network.

    Args:
        layers: iterable of ints, the number of units of each hidden layer in order.
        activation: activation name applied after every hidden layer.

    Returns:
        A compiled Sequential model with one linear output unit, trained on
        mean squared error.
    """
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i==0:
            # Only the first layer needs to know the number of input features.
            model.add(Dense(nodes,input_dim=train_features.shape[1]))
        else:
            model.add(Dense(nodes))
        model.add(Activation(activation))
        model.add(Dropout(0.3))

    # Linear output: EUR is an unbounded continuous target, so the former sigmoid
    # (which squashes predictions into (0, 1)) could never reach the true values.
    model.add(Dense(units = 1, kernel_initializer= 'glorot_uniform'))

    # 'accuracy' is a classification metric; report MSE for this regression instead.
    model.compile(optimizer='adam', loss='mean_squared_error',metrics=['mean_squared_error'])
    return model

# Regression wrapper (not a classifier) so that GridSearchCV scores with R^2.
# `layers` and `activation` get defaults here so the wrapper exposes them as tunable
# parameters and forwards them to create_model.
# NOTE: this rebinds `model`, which until now referred to the trained Keras network.
model = KerasRegressor(model=create_model, layers=[10], activation='relu', verbose=0)

In [None]:
model

In [None]:
# Candidate hidden-layer layouts and activations for the grid search.
# The list is deliberately NOT called `layers`: that name is the tensorflow.keras.layers
# module alias used by earlier cells, and rebinding it would break them on re-run.
layer_options = [[10], [20, 10], [35, 20, 15]]
activations = ['sigmoid', 'relu']
param_grid = dict(layers=layer_options, activation=activations, batch_size = [28, 56], epochs=[50,80])
# Exhaustive search over the grid with 5-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=param_grid,cv=5)

In [None]:
grid_result = grid.fit(train_features, train_labels)

In [None]:
[grid_result.best_score_,grid_result.best_params_]

In [None]:
pred_y = grid.predict(test_features).flatten()

In [None]:
from sklearn.metrics import r2_score
print("R2 score : %.2f" % r2_score(test_labels,pred_y))