### Getting and cleaning data

In [1]:
# Pull in the project helper script; loadData() and cleanData() used below
# are presumably defined there (not visible in this notebook -- confirm).
source("loadData.R")

# load and clean the data
raw <- loadData()

# A bare string at top level is auto-printed, so it serves as a caption for
# the preview table that follows.
"Raw data:"
format(head(raw,3), digits=3)

# cleanData() returns the data frame that every later cell builds on.
clean <- cleanData(raw)

"Cleaned data:"
format(head(clean,3), digits=3)

# Center and scale the five wireline logs separately within each well, so
# that every log has zero mean and unit standard deviation per well.
logCols <- c("GR", "ILD_log10", "DeltaPHI", "PHIND", "PE")
wells <- unique(clean$Well.Name)

# z-score of a numeric vector; missing samples are ignored when estimating
# the mean and standard deviation and stay NA in the result.
zScore <- function(x) {
    (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# One standardized data frame per well, in order of first appearance.
scaledByWell <- lapply(as.character(wells), function(well_i) {
    data_i <- clean[clean$Well.Name == well_i,]

    for (col in logCols) {
        data_i[[col]] <- zScore(data_i[[col]])
    }

    data_i
})

# Bind all wells in a single call instead of growing a data frame inside the
# loop. The leading empty data.frame() keeps the result a data frame even
# when there are no wells at all.
cs <- do.call(rbind, c(list(data.frame()), scaledByWell))
rm(scaledByWell)

"Centered and scaled data:"
format(head(cs,3), digits=3)

Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
3,A1 SH,SHRIMPLIN,2793,77.5,0.664,9.9,11.9,4.6,1,1.0
3,A1 SH,SHRIMPLIN,2794,78.3,0.661,14.2,12.6,4.1,1,0.979
3,A1 SH,SHRIMPLIN,2794,79.0,0.658,14.8,13.1,3.6,1,0.957


Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS
FSiS,A1 SH,SHRIMPLIN,2793,77.5,0.664,9.9,11.9,4.6,False,1.0
FSiS,A1 SH,SHRIMPLIN,2794,78.3,0.661,14.2,12.6,4.1,False,0.979
FSiS,A1 SH,SHRIMPLIN,2794,79.0,0.658,14.8,13.1,3.6,False,0.957


Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS
FSiS,A1 SH,SHRIMPLIN,2793,0.216,0.01855,0.512,-0.0487,0.421,False,1.0
FSiS,A1 SH,SHRIMPLIN,2794,0.237,0.00567,1.517,0.0736,-0.133,False,0.979
FSiS,A1 SH,SHRIMPLIN,2794,0.258,-0.00721,1.657,0.1648,-0.687,False,0.957


### Conditioning the data

In [2]:
# No further conditioning is applied in this cell: df is simply the centered
# and scaled data under the name used by the remaining cells.
df <- cs


# Preview the first three rows.
format(head(df, 3), digits=3)

Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS
FSiS,A1 SH,SHRIMPLIN,2793,0.216,0.01855,0.512,-0.0487,0.421,False,1.0
FSiS,A1 SH,SHRIMPLIN,2794,0.237,0.00567,1.517,0.0736,-0.133,False,0.979
FSiS,A1 SH,SHRIMPLIN,2794,0.258,-0.00721,1.657,0.1648,-0.687,False,0.957


### Exploratory data analysis

In [3]:
# Silence warnings for the rest of the session (a negative `warn` makes R
# ignore all warnings). NOTE(review): this also hides warnings from every
# later cell -- confirm that is intended.
options(warn=-1)

# Build the one-row summary ("meta" row) describing a single facies interval.
#
# Args:
#   formation, well: values copied into the Formation / Well.Name columns.
#   fmThickness:     thickness of the enclosing formation in this well.
#   facies:          facies label of the interval.
#   top, base:       depth bounds of the interval.
#   df:              samples belonging to the interval; must provide the
#                    GR, ILD_log10, DeltaPHI, PHIND and PE columns.
#
# Returns:
#   A one-row data.frame with the identifiers, Top, Base, FmThickness, Facies,
#   FmRelThickness (interval thickness as a fraction of fmThickness) and the
#   mean of each log over `df`.
buildMetaRow <- function(formation, well, fmThickness, facies, top, base, df) {
    # All five logs are averaged with missing samples dropped, so a single NA
    # reading cannot turn an interval mean (and every distance computed from
    # it) into NA. Previously only PE was protected this way.
    logMean <- function(x) mean(x, na.rm = TRUE)

    data.frame(Formation = formation,
               Well.Name = well,
               Top = top,
               Base = base,
               FmThickness = fmThickness,
               Facies = facies,
               FmRelThickness = (base - top) / fmThickness,
               GR_mean = logMean(df$GR),
               ILD_log10_mean = logMean(df$ILD_log10),
               dPhi_mean = logMean(df$DeltaPHI),
               PHI_mean = logMean(df$PHIND),
               PE_mean = logMean(df$PE)
              )
}

# Split every (formation, well) pair into runs of consecutive samples sharing
# the same facies and summarise each run with buildMetaRow().
#
# Args:
#   df:             data frame with Formation, Well.Name, Depth and Facies
#                   columns plus the log columns buildMetaRow() averages.
#                   Rows are processed in the order given.
#   sampleInterval: depth sampling interval; defaults to the .5 that was
#                   previously hard-coded.
#
# Returns:
#   A data frame with one row per facies run (an empty data.frame when `df`
#   has no rows), ordered by formation, then well, then position in the well.
faciesSequencing <- function(df, sampleInterval = .5) {
    halfStep <- sampleInterval / 2
    # Collect the rows in a list and bind once at the end; calling rbind()
    # inside the loop re-copies the accumulated frame on every append.
    intervals <- list()

    for (f in unique(df$Formation)) {
        df_f <- df[df$Formation == f,]

        for (w in unique(df_f$Well.Name)) {
            df_fw <- df_f[df_f$Well.Name == w,]
            depth <- df_fw$Depth
            nSamples <- nrow(df_fw)
            fmThickness <- max(depth) - min(depth) + sampleInterval

            # the first interval starts half a sampling step above the top sample
            top <- depth[1] - halfStep
            facies_prev <- df_fw$Facies[1]

            for (i in seq_len(nSamples)) {
                facies_cur <- df_fw$Facies[i]

                # facies change: close the running interval midway between
                # the previous and the current sample
                if (facies_cur != facies_prev) {
                    halfGap <- (depth[i] - depth[i-1]) / 2
                    base <- depth[i-1] + halfGap

                    temp <- df_fw[depth >= top & depth < base,]
                    intervals[[length(intervals) + 1]] <-
                        buildMetaRow(f, w, fmThickness, facies_prev, top, base, temp)

                    # the next interval starts at that same midpoint
                    facies_prev <- facies_cur
                    top <- depth[i] - halfGap
                }

                # last sample: close the final interval half a step below it
                if (i == nSamples) {
                    base <- depth[i] + halfStep

                    temp <- df_fw[depth >= top & depth < base,]
                    intervals[[length(intervals) + 1]] <-
                        buildMetaRow(f, w, fmThickness, facies_prev, top, base, temp)
                }
            }
        }
    }

    # The leading empty data.frame() keeps the zero-interval case returning an
    # empty data frame rather than NULL.
    do.call(rbind, c(list(data.frame()), intervals))
}

### Feature engineering

In [4]:
library(dplyr)

# formation depth
wells <- unique(df$Well.Name)

# FmThickness: depth extent of each formation within each well, padded by one
# .5 sampling interval. The grouping is dropped again right away so that later
# cells work with a plain (ungrouped) table.
df <- df %>%
    group_by(Formation, Well.Name) %>%
    mutate(FmThickness = max(Depth) - min(Depth) + .5) %>%
    ungroup()

# FmRelDepth is the complement of RELPOS: 0 where RELPOS is 1, growing as
# RELPOS falls.
df$FmRelDepth <- 1 - df$RELPOS

format(head(df), digits=3)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS,FmThickness,FmRelDepth
FSiS,A1 SH,SHRIMPLIN,2793,0.216,0.01855,0.512,-0.0487,0.421,False,1.0,21.5,0.0
FSiS,A1 SH,SHRIMPLIN,2794,0.237,0.00567,1.517,0.0736,-0.133,False,0.979,21.5,0.021
FSiS,A1 SH,SHRIMPLIN,2794,0.258,-0.00721,1.657,0.1648,-0.687,False,0.957,21.5,0.043
FSiS,A1 SH,SHRIMPLIN,2794,0.447,-0.02009,1.447,0.177,-0.797,False,0.936,21.5,0.064
FSiS,A1 SH,SHRIMPLIN,2795,0.139,-0.05444,1.353,0.2118,-0.908,False,0.915,21.5,0.085
FSiS,A1 SH,SHRIMPLIN,2796,0.122,-0.10167,1.47,0.2278,-0.687,False,0.894,21.5,0.106


### Trial training

In [29]:
library(dplyr)

# use SHRIMPLIN & CHURCHMAN BIBLE as test wells
testWellNames <- c("SHRIMPLIN", "CHURCHMAN BIBLE")
train <- df[!(df$Well.Name %in% testWellNames),]
test <- df[df$Well.Name %in% testWellNames,]

# Facies intervals of the training wells, plus the cumulative relative
# thickness of the intervals inside each (formation, well).
meta <- faciesSequencing(train)
meta <- ungroup(mutate(group_by(meta, Formation, Well.Name),
                       FmCumThickness = cumsum(FmRelThickness)))

metawells <- unique(meta$Well.Name)
metawells <- metawells[!metawells %in% c("Recruit F9")]
testwells <- unique(test$Well.Name)
testPrime <- data.frame()

for (w in testwells) {
    test_w <- test[test$Well.Name == w,]

    for (mw in metawells) {
        # the intervals of this training well do not depend on the test row,
        # so filter them once per well instead of once per sample
        meta_mw <- meta[meta$Well.Name == mw,]

        # For every test sample: the first interval of the same formation
        # whose cumulative relative thickness lies beyond the sample's
        # relative depth (NA when there is none).
        hit <- vapply(seq_len(nrow(test_w)), function(i) {
            which(meta_mw$Formation == test_w$Formation[i] &
                  test_w$FmRelDepth[i] < meta_mw$FmCumThickness)[1]
        }, integer(1))

        test_w[[paste("Facies", mw)]] <- meta_mw$Facies[hit]

        # calculate distance (Euclidean, between the sample's logs and the
        # interval means); PE only contributes where its difference is known
        sq4 <- (test_w$GR - meta_mw$GR_mean[hit])^2 +
               (test_w$ILD_log10 - meta_mw$ILD_log10_mean[hit])^2 +
               (test_w$DeltaPHI - meta_mw$dPhi_mean[hit])^2 +
               (test_w$PHIND - meta_mw$PHI_mean[hit])^2
        sqPE <- (test_w$PE - meta_mw$PE_mean[hit])^2

        test_w[[paste("Dist", mw)]] <- ifelse(is.na(sqPE), sq4, sq4 + sqPE)^0.5
    }

    testPrime <- rbind(testPrime, test_w)
}

format(head(testPrime), digits=3)

Facies,Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,...,Facies LUKE G U,Dist LUKE G U,Facies KIMZEY A,Dist KIMZEY A,Facies CROSS H CATTLE,Dist CROSS H CATTLE,Facies NOLAN,Dist NOLAN,Facies NEWBY,Dist NEWBY
FSiS,A1 SH,SHRIMPLIN,2793,0.216,0.01855,0.512,-0.0487,0.421,False,...,FSiS,1.31,SS,1.17,CSiS,1.184,CSiS,2.51,FSiS,1.255
FSiS,A1 SH,SHRIMPLIN,2794,0.237,0.00567,1.517,0.0736,-0.133,False,...,FSiS,0.559,SS,1.86,CSiS,0.606,CSiS,1.548,FSiS,0.816
FSiS,A1 SH,SHRIMPLIN,2794,0.258,-0.00721,1.657,0.1648,-0.687,False,...,FSiS,0.803,SS,1.97,CSiS,0.811,FSiS,0.746,FSiS,0.719
FSiS,A1 SH,SHRIMPLIN,2794,0.447,-0.02009,1.447,0.177,-0.797,False,...,FSiS,0.846,SS,1.72,CSiS,0.737,FSiS,0.527,FSiS,0.543
FSiS,A1 SH,SHRIMPLIN,2795,0.139,-0.05444,1.353,0.2118,-0.908,False,...,FSiS,1.061,CSiS,2.82,CSiS,0.984,FSiS,0.528,FSiS,0.565
FSiS,A1 SH,SHRIMPLIN,2796,0.122,-0.10167,1.47,0.2278,-0.687,False,...,FSiS,0.936,CSiS,2.92,CSiS,0.892,FSiS,0.613,FSiS,0.645


### Trial prediction

In [113]:
faciesLabels <- c('SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS')
p <- 1.75
nTest <- nrow(testPrime)

# one row per test sample, one column per facies
voteMat <- matrix(0, nrow=nTest, ncol=length(faciesLabels),
                  dimnames=list(NULL, faciesLabels))

# tally votes weighted by their distance (in data-space)
for (mw in metawells) {
    candidate <- match(as.character(testPrime[[paste("Facies", mw)]]), faciesLabels)
    weight <- 1 / testPrime[[paste("Dist", mw)]]^p

    # A well only votes where it proposes a known facies AND has a usable
    # distance; an NA weight is skipped rather than turning the facies' whole
    # tally into NA.
    counted <- which(!is.na(candidate) & !is.na(weight))
    cell <- cbind(counted, candidate[counted])
    voteMat[cell] <- voteMat[cell] + weight[counted]
}

# keep the tallies available as a data frame, as before
votes <- as.data.frame(voteMat)

# elect facies (the first facies wins a tie; a sample with no votes at all
# falls back to the first label)
winner <- vapply(seq_len(nTest), function(i) which.max(voteMat[i, ]), integer(1))
testPrime$Predicted <- faciesLabels[winner]

testFinal <- subset(testPrime, select=c(Formation, Well.Name, Depth,
                                       GR, ILD_log10, DeltaPHI, PHIND, PE, isMarine, RELPOS,
                                       Facies, Predicted))

format(head(testFinal,3), digits=3)

Formation,Well.Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,isMarine,RELPOS,Facies,Predicted
A1 SH,SHRIMPLIN,2793,0.216,0.01855,0.512,-0.0487,0.421,False,1.0,FSiS,FSiS
A1 SH,SHRIMPLIN,2794,0.237,0.00567,1.517,0.0736,-0.133,False,0.979,FSiS,CSiS
A1 SH,SHRIMPLIN,2794,0.258,-0.00721,1.657,0.1648,-0.687,False,0.957,FSiS,FSiS


In [112]:
# Load the scoring helpers; myF1Metric() is presumably defined in
# accuracyMetrics.R (not visible here -- confirm).
source("accuracyMetrics.R")

# Score the predicted facies against the facies recorded for the test wells.
myF1Metric(testFinal$Predicted, testFinal$Facies)