In [3]:
%matplotlib inline
# to install watermark magic command: pip install ipyext
#%load_ext watermark 
#%watermark -v -p numpy,scipy,pandas,matplotlib,seaborn,sklearn

## Exploring the dataset
First, we will examine the data set we will use to train the classifier.

In [4]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics

from pandas import set_option
# Notebook-wide pandas settings: keep rendered frames short and turn off the
# chained-assignment (SettingWithCopy) check.
pd.set_option("display.max_rows", 10)
pd.set_option("mode.chained_assignment", None)

# Read the facies log table and display summary statistics of its numeric columns.
filename = '../facies_vectors.csv'
training_data = pd.read_csv(filename)
training_data.describe()



Unnamed: 0,Facies,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
count,4149.0,4149.0,4149.0,4149.0,4149.0,4149.0,3232.0,4149.0,4149.0
mean,4.503254,2906.867438,64.933985,0.659566,4.402484,13.201066,3.725014,1.518438,0.521852
std,2.474324,133.300164,30.30253,0.252703,5.274947,7.132846,0.896152,0.49972,0.286644
min,1.0,2573.5,10.149,-0.025949,-21.832,0.55,0.2,1.0,0.0
25%,2.0,2821.5,44.73,0.498,1.6,8.5,,1.0,0.277
50%,4.0,2932.5,64.99,0.639,4.3,12.02,,2.0,0.528
75%,6.0,3007.0,79.438,0.822,7.5,16.05,,2.0,0.769
max,9.0,3138.0,361.15,1.8,19.312,84.4,8.094,2.0,1.0


In [5]:
# Store the two text columns as pandas categoricals.
for column in ('Well Name', 'Formation'):
    training_data[column] = training_data[column].astype('category')

# Cell output: the distinct well names found in the data.
training_data['Well Name'].unique()

[SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, KIMZEY A, CROSS H CATTLE, NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]
Categories (10, object): [SHRIMPLIN, ALEXANDER D, SHANKLE, LUKE G U, ..., NOLAN, Recruit F9, NEWBY, CHURCHMAN BIBLE]

In [6]:
# Boolean row mask: True where a PE value is present.
PE_mask = pd.notnull(training_data['PE']).values
# Keep only the rows that have a PE measurement.
training_data = training_data.loc[PE_mask]

## K fold cross-validation 
Adapted from [@LukasMosser's code](https://gist.github.com/LukasMosser/cd645bad2bdbbb419098ac3ea363f2b3) to run on Python 3.5.
We run a cross-validation in which each well in turn is held out, in order to see how each well performs as a blind test.

In [19]:
# Ordered list of the wells present in training_data. A list (rather than a
# set) keeps the iteration order identical from one kernel session to the next
# and allows positional lookups such as names[i], which a set does not support.
names = list(training_data["Well Name"].unique())

# Per-well feature matrix (X) and facies labels (y), both keyed by well name.
X, y = {}, {}
for name in names:
    well = training_data[training_data["Well Name"] == name]
    y[name] = well['Facies'].values.astype(np.uint8)
    # Features are every column left once the identifiers, the depth and the
    # target column have been dropped.
    X[name] = well.drop(['Formation', 'Well Name', 'Depth', 'Facies'], axis=1).values

In [None]:
# Leave-one-well-out splits: each well in turn is held out as the blind test
# well and all the other wells are pooled into the training set.
# Every entry of training_sets is [X_train, y_train, X_test, y_test].
training_sets = []
test_sets = []  # never filled in this cell; name kept in case later cells refer to it

# list(names) walks the wells in the same order as iterating `names` directly,
# whether `names` is a set or already a list.
for blind_well in list(names):
    # Compare well names by value (!=), not by object identity.
    train_wells = [well for well in X if well != blind_well]

    # The same well list drives both concatenations, so feature rows and
    # labels stay aligned.
    X_train = np.concatenate([X[well] for well in train_wells]).astype(np.float32)
    y_train = np.concatenate([y[well] for well in train_wells]).astype(np.int64)

    # Fit the scaler on the training wells only, then apply that same scaling
    # to the blind well so no test statistics leak into training.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    X_test = scaler.transform(np.asarray(X[blind_well], dtype=np.float32))
    y_test = np.asarray(y[blind_well], dtype=np.int32)

    training_sets.append([X_train, y_train, X_test, y_test])
    
# Use as follows: fit one classifier per split and score it on the held-out well.
# training_sets holds one split per well, appended while iterating `names`;
# list(names) reproduces that order and, unlike a set, can be paired by position.
blind_wells = list(names)
scores = []
for blind_well, (X_train, y_train, X_test, y_test) in zip(blind_wells, training_sets):
    clf = svm.LinearSVC(class_weight='balanced', tol=1e-03, random_state=42, C=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Scoring: weighted F1 of the predictions on the blind well.
    score = metrics.f1_score(y_test, y_pred, average='weighted')
    scores.append(score)
    print('********')
    print('Blind well is {0}, F1 score : {1:.4%}\n'.format(blind_well, score))
print("="*30)
print('*********** RESULT ***********')
print("="*30)
print('\nAverage  F1-score is {:.4%}'.format(np.mean(scores)))

K-fold cross-validation shows that the F1-score is highly variable from well to well. For example, the model fits SHANKLE well but SHRIMPLIN less so. This is why @LukasMosser and I suggest using the average F1-score as the metric to evaluate the performance of a submission.

In [None]:
# K-fold split over wells.
# X and y are dicts keyed by well name, so they cannot be indexed with the
# integer index arrays that KFold produces. The folds are therefore drawn over
# the list of wells, and each fold's arrays are pooled from the selected wells.
# NOTE(review): assumes the intent was to fold over wells rather than over
# individual samples — confirm.
# sklearn.cross_validation is gone from current scikit-learn; KFold now lives in
# sklearn.model_selection and takes n_splits.
from sklearn.model_selection import KFold

fold_wells = list(X)
kf = KFold(n_splits=2)
for train_index, test_index in kf.split(fold_wells):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = np.concatenate([X[fold_wells[k]] for k in train_index])
    X_test = np.concatenate([X[fold_wells[k]] for k in test_index])
    y_train = np.concatenate([y[fold_wells[k]] for k in train_index])
    y_test = np.concatenate([y[fold_wells[k]] for k in test_index])