In [1]:
###### >>> POINT 4 <<<
##  Train the model on the data collected in SigTuple Hack v1

In [4]:
import os 

import cv2

import pandas as pd
import numpy as np
np.random.seed(1337)  # for reproducibility

from skimage import filters

from sklearn import svm, metrics 
from sklearn.externals import joblib
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV

import scipy.misc
from scipy.misc import imread
import matplotlib.pyplot as plt

%matplotlib inline

In [16]:
dfTrain = pd.read_csv('TrainFilters.csv')

In [18]:
dfTrain.drop('Unnamed: 0', axis=1, inplace = True)
print dfTrain.shape
print dfTrain.columns

(6414012, 20)
Index([u'BiFi', u'Blue', u'BlueB', u'BlueHSV', u'BlueO', u'Blur', u'Green',
       u'GreenB', u'GreenHSV', u'GreenO', u'Markers', u'Mask', u'Median',
       u'NoiseK', u'OTSU', u'Red', u'RedB', u'RedHSV', u'RedO', u'Th1'],
      dtype='object')


In [41]:
dfTrain.drop('OTSU', axis=1, inplace = True)

In [42]:
dfTrain.Mask.mean()

0.08352712779458474

In [43]:
dfTrain.fillna(value = 0, inplace = True) 

In [44]:
train, test = train_test_split(dfTrain, test_size = 0.5)
#validate, test = train_test_split(test, test_size = 0.5)

print train.shape
#print validate.shape
print test.shape

(3207006, 19)
(3207006, 19)


In [45]:
lin_clf = svm.LinearSVC(verbose =5)

X = train.drop('Mask', axis=1)
y = train['Mask']

In [46]:
lin_clf.fit(X, y)

[LibLinear]

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=5)

In [47]:
# Save the model
joblib.dump(lin_clf, 'linearSVC_v3_filter.pkl')

['linearSVC_v3_filter.pkl']

In [48]:
def img3d2d(image, listCols):
    imgdf = pd.DataFrame(image.transpose(2,0,1).reshape(3,-1).transpose(1,0))
    imgdf.columns = listCols
    return imgdf

In [49]:
## Read the test files
ip_files = []
test_files = os.listdir('E:\\SigTuple_Hack\\Test_Data\\')
ip_files += [f for f in test_files if f.endswith('.jpg') and 'mask' not in f]


In [51]:
# Load the model & process the test files
lin_clf = joblib.load('linearSVC_v3_filter.pkl')
for img_file in ip_files:
    input_file = 'E:\\SigTuple_Hack\\Test_Data\\' + img_file
    mask_file = 'E:\\SigTuple_Hack\\Test_Data\\Mask\\' + img_file[:-4] + '-mask.jpg'
    print input_file
    
    # Load the image 
    imgcolor = cv2.imread(input_file)
    imgShape = imgcolor.shape
    
    imageDF = img3d2d(imgcolor, ['Red', 'Green', 'Blue'])
    
    
    # Convert BGR to HSV and gray scale image
    imghsv = cv2.cvtColor(imgcolor, cv2.COLOR_BGR2HSV)
    imggray = cv2.cvtColor(imgcolor, cv2.COLOR_BGR2GRAY)
    imageDF = pd.concat([imageDF ,img3d2d(imghsv, ['RedHSV', 'GreenHSV', 'BlueHSV'])], axis = 1) 
    
    # Create filter with threshold using OTSU binarization
    val = filters.threshold_otsu(imggray)
    imgotsu = imgcolor < val    
    imageDF = pd.concat([imageDF ,img3d2d(imgotsu, ['RedO', 'GreenO', 'BlueO'])], axis = 1) 
    
    # Create markers using watershed technique
    # Create an approximate estimation of the cells by using OTSU binarization
    ret, thresh = cv2.threshold(imggray,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
    
    # noise removal ## >>>> Only using the noise removal on the pic with an iteration of 15 gives a 0.65
                    ## This is actually less than the one with a pure ML solution
    kernel = np.ones((3,3),np.uint8)
    imgopening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 15)
    imageDF['NoiseK'] = imgopening.reshape(imgShape[0]*imgShape[1])
    
    opening = cv2.morphologyEx(thresh,cv2.MORPH_OPEN,kernel, iterations = 5)
    # sure background area
    sure_bg = cv2.dilate(opening,kernel,iterations=3)

    # Finding sure foreground area
    dist_transform = cv2.distanceTransform(opening,cv2.DIST_L2,5)
    ret, sure_fg = cv2.threshold(dist_transform,0.7*dist_transform.max(),255,0)

    # Finding unknown region
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg,sure_fg)

    # Marker labelling
    ret, markers = cv2.connectedComponents(sure_fg)
    # Add one to all labels so that sure background is not 0, but 1
    imgmarkers = markers+1
    # Now, mark the region of unknown with zero
    imgmarkers[unknown==255] = 0
    
    imageDF['Markers'] = imgmarkers.reshape(imgShape[0]*imgShape[1])
    
    # define range of blue color in HSV
    lower_wbc = np.array([120,240,150])
    upper_wbc = np.array([135,255,180])
    # Threshold the HSV image to get only blue colors
    mask = cv2.inRange(imghsv, lower_wbc, upper_wbc)
    # Bitwise-AND mask and original image
    imgBlue = cv2.bitwise_and(imghsv,imghsv, mask= mask)
    imageDF = pd.concat([imageDF,img3d2d(imghsv, ['RedB', 'GreenB', 'BlueB'])], axis = 1)
        
        
    ret,th1 = cv2.threshold(imggray,127,255,cv2.THRESH_BINARY)
    imgTh1 = cv2.bitwise_not(th1)
    imageDF['Th1'] = imgTh1.reshape(imgShape[0]*imgShape[1])
    
    imgblur = cv2.blur(imgTh1, (10,10))
    imgmedian = cv2.medianBlur(imgTh1,7)
    imgbiF = cv2.bilateralFilter(imgTh1,5,75,75)
    
    imageDF['Blur'] = imgblur.reshape(imgShape[0]*imgShape[1])
    imageDF['Median'] = imgmedian.reshape(imgShape[0]*imgShape[1])
    imageDF['BiFi'] = imgbiF.reshape(imgShape[0]*imgShape[1])
    
    # Create mask, using LinearSVC model
    mask = lin_clf.predict(imageDF)
    imgmask = mask.reshape(imgShape[0], imgShape[1])
    # Save file
    scipy.misc.imsave(mask_file, imgmask)
    #imageDF['PredMask'] = mask
    
    # Inserting the mask bit
     # Load mask and append to temp df
    #imgmask = cv2.imread(mask_file)/255.0
    #temp = rgb2gray(imgmask)
    #maskShape = temp.shape
    #imageDF['Mask'] = temp.reshape(maskShape[0]*maskShape[1])
    
    
    

E:\SigTuple_Hack\Test_Data\017532875DDF.jpg
E:\SigTuple_Hack\Test_Data\029E137BB177.jpg
E:\SigTuple_Hack\Test_Data\029E137BB179.jpg
E:\SigTuple_Hack\Test_Data\072CBBB64F88.jpg
E:\SigTuple_Hack\Test_Data\072CBBB64F89.jpg
E:\SigTuple_Hack\Test_Data\0BC2C60F3BA0.jpg
E:\SigTuple_Hack\Test_Data\1468054105.jpg
E:\SigTuple_Hack\Test_Data\1468054163.jpg
E:\SigTuple_Hack\Test_Data\1468054267.jpg
E:\SigTuple_Hack\Test_Data\1468054306.jpg
E:\SigTuple_Hack\Test_Data\1468061314.jpg
E:\SigTuple_Hack\Test_Data\1468061317.jpg
E:\SigTuple_Hack\Test_Data\1468061347.jpg
E:\SigTuple_Hack\Test_Data\1468061368.jpg
E:\SigTuple_Hack\Test_Data\1468061411.jpg
E:\SigTuple_Hack\Test_Data\16B1C9836EB0.jpg
E:\SigTuple_Hack\Test_Data\264882623008.jpg
E:\SigTuple_Hack\Test_Data\2D6D41F5B4A1.jpg
E:\SigTuple_Hack\Test_Data\308A1C309EF5.jpg
E:\SigTuple_Hack\Test_Data\31A25DC60455.jpg
E:\SigTuple_Hack\Test_Data\31A25DC60456.jpg
E:\SigTuple_Hack\Test_Data\34DBA85D4F7A.jpg
E:\SigTuple_Hack\Test_Data\34DBA85D4F7E.jpg
E:\Sig