# DUC Datathon 2020
## Predicting True Vertical Depth

In this section of the competition we are tasked with building a regression model (machine learning or otherwise)
that can predict TVD (True Vertical Depth).

In [None]:
# import general libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# import prediction libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
# for polynomial feature extraction
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin


### Data import

In [None]:
# Load the well header table from the shared data directory.
WELL_HEADER_CSV = "../data/WellHeader_Datathon.csv"
well_header = pd.read_csv(WELL_HEADER_CSV)

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
well_header.shape

In [None]:
# Print a per-column summary (names, non-null counts, dtypes) of the loaded table.
well_header.info()

Take a random sample with selected columns

In [None]:
# Preview 20 randomly chosen rows, restricted to a handful of selected columns.
# A fixed random_state keeps the preview identical across Restart & Run All
# (324 is the seed the notebook already uses for its train/test split).
preview_columns = ['EPAssetsId', 'TVD', 'TotalDepth', 'BH_Location',
                   'Formation', 'Field', 'Pool', 'WellProfile']
well_header[preview_columns].sample(20, random_state=324)

### Total counts for some of the variables:

In [None]:
# Number of wells per Formation value.
well_header["Formation"].value_counts()

In [None]:
# Number of wells per Pool value.
well_header["Pool"].value_counts()

In [None]:
# Number of wells per Field value.
well_header["Field"].value_counts()

There are only 6 vertical wells and one is missing the TVD.

In [None]:
# Number of wells per WellProfile value.
well_header["WellProfile"].value_counts()

In [None]:
# Show the ID and depth columns for the wells whose profile is 'Vertical'.
is_vertical = well_header['WellProfile'] == 'Vertical'
well_header.loc[is_vertical, ['EPAssetsId', 'TVD', 'TotalDepth']]

### Data Exploration

In [None]:
# Keep only the columns used below and drop every row with a missing value in any of them.
kept_columns = ['EPAssetsId', 'TVD', 'TotalDepth', 'Formation',
                'BH_Location', 'Field', 'WellProfile']
complete_rows = well_header.loc[:, kept_columns].dropna()

# Exclude the vertical wells from the cleaned table.
not_vertical = complete_rows['WellProfile'] != "Vertical"
well_header_clean = complete_rows[not_vertical]
well_header_clean.head()

In [None]:
# Scatter of TVD against TotalDepth for the cleaned wells, with a fitted regression line.
sns.set()
sns.lmplot(
    data=well_header_clean,
    x="TotalDepth",
    y="TVD",
    fit_reg=True,
    height=5,
    legend=True,
);

In [None]:
# One panel per Formation (two panels per row), each with a third-order
# polynomial fit of TVD against TotalDepth and semi-transparent points.
sns.set()
sns.lmplot(
    data=well_header_clean,
    x="TotalDepth",
    y="TVD",
    hue='Formation',
    col="Formation",
    col_wrap=2,
    fit_reg=True,
    order=3,
    height=5,
    legend=False,
    scatter_kws={'alpha': 0.5},
);

In [None]:
# One panel per Formation (two panels per row), points coloured by Field,
# with no regression line drawn.
sns.set()
sns.lmplot(
    data=well_header_clean,
    x="TotalDepth",
    y="TVD",
    hue='Field',
    col="Formation",
    col_wrap=2,
    fit_reg=False,
    height=5,
    legend=False,
);

Distribution of TVD for horizontal vs directional wells

In [None]:
# Overlay the TVD histograms of horizontal and directional wells on a single Axes.
fig, ax = plt.subplots(figsize=(10, 7))
for profile, colour in (("Horizontal", "blue"), ("Directional", "magenta")):
    profile_tvd = well_header.loc[well_header['WellProfile'] == profile, 'TVD']
    profile_tvd.plot(kind='hist', bins=40, color=colour, alpha=0.5, ax=ax)

# Legend entries are matched to the histograms in the order they were drawn.
ax.legend(labels=['Horizontal', 'Directional'])
ax.set_title('Distribution of TVD', size=24)
ax.set_xlabel('TVD', size=18)
ax.set_ylabel('Frequency', size=18);

## Explore different prediction models

Declare the columns to use as features and as the target.

In [None]:
# Input columns fed to the models.
features = ['TotalDepth', 'Formation']
# Column to predict, kept as a one-element list.
target = ['TVD']

Extract the feature and target ('TVD') values into separate DataFrames.

In [None]:
# Split the cleaned table into a feature frame and a one-column target frame.
X = well_header_clean.loc[:, features]
y = well_header_clean.loc[:, target]

In [None]:
# Preview 10 randomly chosen feature rows. The fixed random_state keeps the
# displayed rows identical across Restart & Run All (324 is the seed the
# notebook already uses for its train/test split).
X.sample(10, random_state=324)

In [None]:
# taken from : 
#  https://stackoverflow.com/questions/47664061/how-to-apply-polynomial-transformation-to-subset-of-features-in-scikitlearn?rq=1


# estimators need to inherit from these classes to play nicely with others
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects a subset of columns from ``X``.

    Deriving from ``BaseEstimator`` and ``TransformerMixin`` lets it be used as
    a step inside scikit-learn ``Pipeline`` / ``FeatureUnion`` objects.

    Parameters
    ----------
    columns : list of str or None, default=None
        Column labels to keep. ``None`` keeps every column, so the default
        configuration is usable instead of attempting ``X[None]``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; present only for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns`` (or ``X`` itself when it is ``None``)."""
        if self.columns is None:
            return X
        return X[self.columns]

# Build the design matrix in two parallel branches joined by a FeatureUnion:
#   * 'num'     -> TotalDepth expanded into polynomial terms (degree set below)
#   * 'cat_var' -> the one-hot Formation columns, passed through unchanged
# pandas.get_dummies does the one-hot encoding up front, so the pipeline needs
# no separate label/one-hot encoder step.
# NOTE(review): the Formation dummy names are hard-coded; a formation value not
# listed here would be left out of the matrix — confirm against the data.
pipe2nvars = Pipeline([
    ('features', FeatureUnion([
        ('num', Pipeline([
            ('extract', ColumnExtractor(columns=['TotalDepth'])),
            ('poly', PolynomialFeatures()),
        ])),
        ('cat_var', ColumnExtractor(columns=[
            'Formation_Montney', 'Formation_Duvernay',
            'Formation_Cardium', 'Formation_Viking',
        ])),
    ])),
])

pipe2nvars.set_params(features__num__poly__degree=3)

# Encode straight from well_header_clean[features] instead of from X: the old
# code read X and then overwrote it, so re-running this cell fed the already
# transformed matrix back into get_dummies. Sourcing from the cleaned table
# makes the cell idempotent while producing the same X on a first run.
X = pipe2nvars.fit_transform(
    pd.get_dummies(well_header_clean[features], drop_first=False)
)

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

### (1) Linear Regression: Fit a model to the training set

In [None]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so the trailing expression shows the same object as the cell output.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Predict the target for the held-out test rows with the fitted linear model.
y_prediction = regressor.predict(X_test)

In [None]:
# Root-mean-squared error between the test targets and the current predictions.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_prediction)
RMSE = sqrt(test_mse)
print(RMSE)

### (3) Ridge regression: Fit a new regression model to the training set

In [None]:
# Fit a ridge regression (alpha=10) on the training split. fit() returns the
# estimator itself, so the trailing expression shows it as the cell output.
ridge_regressor = Ridge(alpha=10).fit(X_train, y_train)
ridge_regressor

In [None]:
# Predict the target for the held-out test rows with the fitted ridge model
# (this replaces the previous contents of y_prediction), then display the result.
y_prediction = ridge_regressor.predict(X_test)
y_prediction

In [None]:
# Root-mean-squared error between the test targets and the current predictions.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_prediction)
RMSE = sqrt(test_mse)
print(RMSE)

### (2) Decision Tree Regressor: Fit a new regression model to the training set

In [None]:
# Rebuild the model inputs from the cleaned table: the selected feature columns
# with pandas one-hot encoding applied (no polynomial expansion), then repeat
# the 33% hold-out split with the same seed.
X = pd.get_dummies(well_header_clean.loc[:, features], drop_first=False)
y = well_header_clean.loc[:, target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [None]:
# Fit a depth-limited regression tree on the training split.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits could otherwise be broken
# differently from run to run (324 matches the seed used for the data split).
tree_regressor = DecisionTreeRegressor(max_depth=6, random_state=324)
tree_regressor.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the fitted tree
# (this replaces the previous contents of y_prediction), then display the result.
y_prediction = tree_regressor.predict(X_test)
y_prediction

In [None]:
# Root-mean-squared error between the test targets and the current predictions.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_prediction)
RMSE = sqrt(test_mse)
print(RMSE)