# Facies classification using machine learning
#### Joshua Poirier, [NEOS](https://www.neosgeo.com/)
2016 SEG Machine Learning Competition  

## Introduction

This notebook demonstrates a novel way of training a machine learning algorithm to predict facies from well log data. The data set we use comes from a class exercise from The University of Kansas on Neural Networks and Fuzzy Systems. This exercise is based on a consortium project to use machine learning techniques to create a reservoir model of the largest gas fields in North America, the Hugoton and Panoma Fields.  

The data set we will use contains log data from nine wells that have been labeled with a facies type based on core examination. I use this log data to train support vector machine (SVM) classifiers to predict the facies type. SVMs are a type of supervised learning algorithm that can be trained on data to perform classification and regression. The SVM algorithm uses the training data to fit an optimal hyperplane between the different classes (in our case, facies). 

My approach is to build multiple models and allow them to **vote** - this process is called **blended modeling**. The novelty in my approach is that I build a model for each well in the training data and weight it based on its cross-correlation with the well of interest. This is intended to re-introduce some spatial geometry to the problem. Wells closer to the well of interest are more likely to exhibit a similar log character. Cross-correlation between close wells should show smaller lag and higher maximum correlation. Furthermore, the lag may be used to pre-process the **RELPOS** (relative position) channel - enhancing the predictive nature of that feature.  

To get started, let's load the libraries and supporting files!

In [11]:
# libraries and supporting files
library(e1071)
#library(caret)

"package 'e1071' was built under R version 3.2.5"

## Loading and splitting the data  

Blahblahblah

In [4]:
# Load the facies well-log data set from a CSV file.
#
# Args:
#   fname: path of the CSV file to read. Defaults to the location used by
#          this notebook, so calling loadData() with no arguments behaves
#          exactly as before.
#
# Returns:
#   A data frame with 11 columns: columns 1-3 and 10 are read as factors,
#   columns 4-9 and 11 as numeric.
loadData <- function(fname = "../facies_vectors.csv") {
    # column types, in file order: 3 factors, 6 numerics, 1 factor, 1 numeric
    columnClasses <- c(rep("factor", 3), rep("numeric", 6), "factor", "numeric")

    read.csv(fname, colClasses = columnClasses)
}

In [1]:
# Pre-process the raw facies data frame.
#
# Two changes are made:
#   * the NM_M column is turned into a logical column (TRUE where NM_M is
#     "2") and renamed "isMarine";
#   * the Facies codes are replaced by descriptive abbreviations.
#
# Args:
#   data: data frame containing at least the columns "NM_M" and "Facies".
#
# Returns:
#   The same data frame with NM_M replaced by isMarine and Facies relabelled
#   as a factor with the nine levels listed below.
preProcessData <- function(data) {
    stopifnot(all(c("NM_M", "Facies") %in% names(data)))

    # convert NM_M channel into a binary channel "isMarine"; the column is
    # located by name, so a different column order cannot rename the wrong one
    data$NM_M <- data$NM_M == "2"
    names(data)[names(data) == "NM_M"] <- "isMarine"

    # make the Facies channel more descriptive. Codes are mapped explicitly
    # (code k -> k-th label) rather than by overwriting levels(), so a data
    # set in which some facies code is absent is still labelled correctly.
    # NOTE(review): assumes the raw codes are "1".."9" in this order, which
    # is what the positional relabelling relied on -- confirm against the CSV.
    faciesLabels <- c("SS", "CSiS", "FSiS", "SiSh", "MS", "WS", "D", "PS", "BS")
    data$Facies <- factor(as.character(data$Facies),
                          levels = as.character(seq_along(faciesLabels)),
                          labels = faciesLabels)

    data
}

In [2]:
# Partition a data frame into training and testing rows by well name.
#
# Args:
#   data:     data frame with a Well.Name column.
#   testWell: name of the well whose rows form the test set.
#
# Returns:
#   An unnamed two-element list: [[1]] holds the rows of every other well
#   (training), [[2]] holds the rows of testWell (testing).
splitData <- function(data, testWell) {
    isTestRow <- data$Well.Name == testWell

    # same row filter applied twice: first negated (train), then as-is (test)
    lapply(list(!isTestRow, isTestRow), function(rows) data[rows,])
}

In [5]:
# read the raw data and run it through the pre-processing step
data <- preProcessData(loadData())

# hold out the SHANKLE well for testing; every other well is training data
split <- splitData(data, "SHANKLE")
test <- split[[2]]
train <- split[[1]]

# the intermediate objects are not needed beyond this point
rm(data, split)

"Training Data"
head(train)

"Testing Data"
head(test)

Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS
FSiS,A1 SH,SHRIMPLIN,2793.0,77.45,0.664,9.9,11.915,4.6,False,1.0
FSiS,A1 SH,SHRIMPLIN,2793.5,78.26,0.661,14.2,12.565,4.1,False,0.979
FSiS,A1 SH,SHRIMPLIN,2794.0,79.05,0.658,14.8,13.05,3.6,False,0.957
FSiS,A1 SH,SHRIMPLIN,2794.5,86.1,0.655,13.9,13.115,3.5,False,0.936
FSiS,A1 SH,SHRIMPLIN,2795.0,74.58,0.647,13.5,13.3,3.4,False,0.915
FSiS,A1 SH,SHRIMPLIN,2795.5,73.97,0.636,14.0,13.385,3.6,False,0.894


Unnamed: 0,Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS
938,CSiS,A1 SH,SHANKLE,2774.5,98.36,0.642,-0.1,18.685,2.9,False,1.0
939,CSiS,A1 SH,SHANKLE,2775.0,97.57,0.631,7.9,16.745,3.2,False,0.984
940,CSiS,A1 SH,SHANKLE,2775.5,98.41,0.615,12.8,14.105,3.2,False,0.968
941,CSiS,A1 SH,SHANKLE,2776.0,85.92,0.597,13.0,13.385,3.4,False,0.952
942,CSiS,A1 SH,SHANKLE,2776.5,83.16,0.592,12.3,13.345,3.4,False,0.935
943,CSiS,A1 SH,SHANKLE,2777.0,82.56,0.599,12.9,13.105,3.5,False,0.919


## Exploratory data analysis  

Blahblahblah

In [27]:
# Center and scale the numeric columns of a data frame.
#
# Every column whose class is exactly "numeric" is replaced by its
# standardized version as computed by scale() (mean removed, divided by the
# standard deviation). All other columns (factors, logicals, integers, ...)
# are returned untouched.
#
# Args:
#   data: a data frame; may have zero columns.
#
# Returns:
#   The data frame with its plain numeric columns standardized.
centerScaleData <- function(data) {

    # we only want to center/scale data which is numeric. vapply() keeps the
    # result a logical vector even when a column carries several classes
    # (e.g. an ordered factor), and identical() is FALSE for such columns.
    toScale <- vapply(data, function(column) identical(class(column), "numeric"), logical(1))

    # which() yields an empty index for a zero-column frame, so the loop is
    # simply skipped instead of running over 1:0
    for (i in which(toScale)) {
        # scale() returns a one-column matrix; store it back as a plain vector
        data[,i] <- as.numeric(scale(data[,i]))
    }

    data
}

In [74]:
# Find the strongest cross-correlation between two series.
#
# Runs ccf() on a and b for lags up to 400 and picks the lag at which the
# correlation is largest (the largest signed value, not the largest
# magnitude).
#
# Args:
#   a, b: the two series to compare, in any form accepted by ccf().
#
# Returns:
#   A list with two length-one elements:
#     Correlation - the maximum cross-correlation found
#     Lag         - the lag at which that maximum occurs
#   Both are NA (with a warning) when no correlation could be computed,
#   so callers can always place them in a one-row data frame.
crossCorrelate <- function(a, b) {

    # calculate cross-correlation between vectors a and b
    ccor <- ccf(a, b, lag.max = 400, plot = FALSE)

    # drop the two trailing array dimensions to get plain vectors
    correlations <- ccor[["acf"]][,,1]
    lags <- ccor[["lag"]][,,1]

    # which.max() skips NA/NaN entries and returns integer(0) if none is left
    best <- which.max(correlations)
    if (length(best) == 0) {
        warning("crossCorrelate: no finite cross-correlation could be computed; returning NA",
                call. = FALSE)
        return(list(Correlation = NA_real_, Lag = NA_real_))
    }

    # return maximum correlation and associated lag
    list(Correlation = correlations[best], Lag = lags[best])
}

In [67]:
# Apply a pairwise function "FUN" to each shared log feature of data frames
# "a" and "b".
#
# Both data frames are first passed through centerScaleData(). The columns
# Facies, Formation, Well.Name, Depth, RELPOS and PE are skipped; FUN is
# called once for every remaining column.
#
# Args:
#   a, b: data frames holding the same feature columns (Facies is ignored
#         when comparing the two sets of column names).
#   FUN:  function(av, bv) receiving the matching column of a and of b, each
#         as a one-column data frame, and returning a list with elements
#         "Correlation" and "Lag".
#
# Returns:
#   A data frame with one row per processed feature and the columns
#   feature, correlation and lag.
#
# Raises:
#   An error if a and b do not contain the same feature columns.
loopFeatures <- function(a, b, FUN) {

    # preprocess
    a <- centerScaleData(a)
    b <- centerScaleData(b)

    # get list of columns for a and b dataframes
    features_a <- names(a)[names(a) != "Facies"]
    features_b <- names(b)[names(b) != "Facies"]

    # ensure a and b data frames have the same features. Columns are looked
    # up by name below, so only the set of names matters, not their order.
    # (This check used to sit inside try(), which swallowed the error.)
    if (!setequal(features_a, features_b)) {
        stop("Error! Data frames do not have the same features.")
    }

    features <- features_a[!features_a %in% c("Facies", "Formation", "Well.Name", "Depth", "RELPOS", "PE")]

    # nothing to compare: return an empty, correctly shaped result
    if (length(features) == 0) {
        return(data.frame(feature=factor(), correlation=numeric(), lag=numeric()))
    }

    # one single-row data frame per feature, bound together once at the end
    rows <- lapply(features, function(feature) {
        av <- a[, feature, drop = FALSE]
        bv <- b[, feature, drop = FALSE]

        temp <- FUN(av, bv)

        data.frame(feature=feature, correlation=temp[["Correlation"]], lag=temp[["Lag"]])
    })

    do.call(rbind, rows)
}

In [None]:
# Fit one radial-kernel SVM per well.
#
# Args:
#   data:  data frame with a Well.Name column and a Facies response; every
#          other column is used as a predictor (formula "Facies ~ .").
#   cost:  cost parameter passed to svm().
#   gamma: gamma parameter passed to svm().
#
# Returns:
#   A list of fitted models with one entry per distinct well, indexed by the
#   well name.
trainBlendedModel <- function(data, cost, gamma) {

    # fit a single model on the rows belonging to one well
    fitOneWell <- function(wellName) {
        wellRows <- data[data$Well.Name == wellName,]
        svm(Facies ~ ., data=wellRows, kernel='radial', cost=cost, gamma=gamma)
    }

    # collect the per-well models, keyed by well
    modelsByWell <- list()
    for (wellName in unique(data$Well.Name)) {
        modelsByWell[[wellName]] <- fitOneWell(wellName)
    }

    modelsByWell
}

In [52]:
# Cross-correlate every training well against the test well.
#
# For each distinct well in "train", loopFeatures() is run with
# crossCorrelate() on that well's rows and the rows of "test". The results
# are stacked, tagged with the training-well and test-well names, and
# printed.
#
# Args:
#   train: data frame of training rows with a Well.Name column.
#   test:  data frame of rows from the well of interest; its first
#          Well.Name value is used as the test-well label.
#
# Returns:
#   (invisibly) a data frame with the columns feature, correlation, lag,
#   trainWell and testWell. No weights are derived from the correlations yet.
weightBlendedModel <- function(train, test) {

    # retrieve list of wells in training set
    trainWells <- unique(train$Well.Name)
    testWell <- test$Well.Name[1]

    # one result table per training well, bound together once after the loop
    perWell <- vector("list", length(trainWells))
    i <- 0

    # loop through training wells
    for (well_i in trainWells) {
        i <- i + 1

        # subset the data
        train_i <- train[train$Well.Name == well_i,]

        # calculate cross-correlation for each feature between current training well and testing well
        temp <- loopFeatures(train_i, test, crossCorrelate)
        temp$trainWell <- well_i
        temp$testWell <- testWell
        perWell[[i]] <- temp
    }

    if (length(perWell) > 0) {
        crossCorrs <- do.call(rbind, perWell)
    } else {
        # no training wells: keep the empty, typed result
        crossCorrs <- data.frame(trainWell=factor(), testWell=factor(), feature=factor(), correlation=numeric(), lag=numeric())
    }

    print(crossCorrs)
    invisible(crossCorrs)
}

In [None]:
# Build the blended model: per-well SVM fits plus the cross-correlation table
# between the training wells and the test well.
#
# Args:
#   train:    data frame of training rows.
#   test:     data frame of rows from the well of interest.
#   features: kept for interface compatibility; currently unused because
#             neither trainBlendedModel() nor weightBlendedModel() accepts a
#             feature list. NOTE(review): presumably meant to restrict the
#             predictors -- confirm the intended use.
#   cost:     cost parameter forwarded to trainBlendedModel().
#   gamma:    gamma parameter forwarded to trainBlendedModel().
#
# Returns:
#   A list with two elements:
#     fits    - result of trainBlendedModel(train, cost, gamma)
#     weights - result of weightBlendedModel(train, test)
buildBlendedModel <- function(train, test, features, cost, gamma) {
    # train and weight the models. weightBlendedModel() takes exactly two
    # arguments; passing "features" as a third made every call fail with an
    # "unused argument" error.
    blendedModel <- list(
        fits = trainBlendedModel(train, cost, gamma),
        weights = weightBlendedModel(train, test)
    )

    # return the blended model
    blendedModel
}

In [None]:
# Build a blended model for every leave-one-well-out split of the data.
#
# Each distinct well is held out in turn as the test well, and a blended
# model is built from the remaining wells with buildBlendedModel().
#
# Args:
#   data:     data frame with a Well.Name column.
#   features: forwarded unchanged to buildBlendedModel().
#   cost:     cost parameter forwarded to buildBlendedModel(); defaults to
#             the value that used to be hard-coded (10).
#   gamma:    gamma parameter forwarded to buildBlendedModel(); defaults to
#             the value that used to be hard-coded (1).
#
# Returns:
#   (invisibly) a list with one blended model per held-out well, named by
#   that well. The models used to be discarded; no parameter search is
#   performed yet.
tuneBlendedModel <- function(data, features, cost = 10, gamma = 1) {

    # retrieve list of wells in data set
    wells <- unique(data$Well.Name)

    # one slot per held-out well, filled in loop order
    models <- vector("list", length(wells))
    names(models) <- as.character(wells)
    i <- 0

    # loop through wells
    for (well_i in wells) {
        i <- i + 1

        # split data into training and test (well_i is test)
        split <- splitData(data, well_i)

        # build blended model using training data; list() wrapping keeps the
        # slot even if the builder returns NULL
        models[i] <- list(buildBlendedModel(split[[1]], split[[2]], features, cost, gamma))
    }

    invisible(models)
}

In [76]:
# exploratory run: hold NEWBY out of the training data and cross-correlate
# the remaining wells against it
split <- splitData(train, "NEWBY")
b <- split[[2]]

# training side, without the rows of the well named "Recruit F9"
a <- split[[1]][split[[1]]$Well.Name != "Recruit F9",]

weightBlendedModel(a,b)

# disabled earlier experiments, kept as-is
#f <- c("GR", "ILD_log10", "DeltaPHI", "PHIND", "isMarine", "RELPOS")
#aPrime <- centerScaleData(a, f)
#bPrime <- centerScaleData(b, f)

#loopWellPairs(train, f, loopFeatures)

     feature correlation  lag       trainWell testWell
1         GR   0.4134401   67       SHRIMPLIN    NEWBY
2  ILD_log10   0.5817869  -20       SHRIMPLIN    NEWBY
3   DeltaPHI   0.3562930  -38       SHRIMPLIN    NEWBY
4      PHIND   0.3248451  -22       SHRIMPLIN    NEWBY
5   isMarine   0.5271789  -28       SHRIMPLIN    NEWBY
6         GR   0.3692871  -39     ALEXANDER D    NEWBY
7  ILD_log10   0.3987207  -24     ALEXANDER D    NEWBY
8   DeltaPHI   0.3024102  -39     ALEXANDER D    NEWBY
9      PHIND   0.3279468  -35     ALEXANDER D    NEWBY
10  isMarine   0.3934702  -33     ALEXANDER D    NEWBY
11        GR   0.3274616  -11        LUKE G U    NEWBY
12 ILD_log10   0.5478564  -12        LUKE G U    NEWBY
13  DeltaPHI   0.3249954    5        LUKE G U    NEWBY
14     PHIND   0.3450218   -2        LUKE G U    NEWBY
15  isMarine   0.6787544   -3        LUKE G U    NEWBY
16        GR   0.5379095   -9        KIMZEY A    NEWBY
17 ILD_log10   0.4161453   -7        KIMZEY A    NEWBY
18  DeltaP

## Model tuning  

Blahblahblah

In [4]:
# function to tune model

In [5]:
# execute model tuning, print the model tuning parameters

## Training  

Blahblahblah

In [6]:
# function to train model

In [7]:
# execute model training

## Testing  

Blahblahblah

In [8]:
# function to test model (on SHANKLE well)

In [9]:
# execute model testing

## Conclusions  

Blahblahblah