In [96]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import numpy as np
from skimage.io import imread, imshow, imsave
from skimage.feature import blob_doh
from skimage.color import rgb2gray
from skimage.transform import resize
from glob import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

In [2]:
%matplotlib inline

In [3]:
trainlabel = pd.read_csv('../Data/train.csv')

In [4]:
whaleIDset = set(trainlabel['whaleID'])

In [5]:
print(len(whaleIDset), len(trainlabel))

(447, 4544)


### Make the training set

In [8]:
dloc = '../../BigData/kaggle-right-whale/imgs/'

In [82]:
features = []
foundlist = []
for i in range(0, 6000):
    si = str(i)
    tailloc = 'w_' + si + '_small_verysmall.jpg'
    dfile = dloc + tailloc
    if si[0] < '6':
        if 'w_' + si + '.jpg' in trainlabel['Image'].values:
            imdata = imread(dfile)
            features.append(imdata.flatten())
            trainlabelindex = trainlabel['Image'].values == 'w_' + si + '.jpg'
            trainlabelvalue = trainlabel['whaleID'].values[trainlabelindex][0]
            foundlist.append(trainlabelvalue)

In [83]:
print(len(foundlist), len(set(foundlist)))

(2662, 429)


In [84]:
features = np.array(features)

In [85]:
features.shape

(2662, 1800)

In [86]:
labels = trainlabel['whaleID']

In [87]:
vectorizer = CountVectorizer(min_df=1)

In [88]:
X = features
y = vectorizer.fit_transform(foundlist).toarray()

In [89]:
y.shape

(2662, 429)

In [90]:
clf = RandomForestClassifier(n_estimators = 10)

In [101]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)

In [104]:
clf.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [105]:
ypredict = clf.predict(Xtest)

In [112]:
def llfun(act, pred):
    epsilon = 1e-15
    toohigh = pred > 1 - epsilon
    pred[toohigh] = 1 - epsilon
    toolow = pred < epsilon
    pred[toolow] = epsilon
    ll = sum(act * np.log(pred) + (1 - act)*np.log(1 - pred))
    ll = ll * -1.0/len(act)
    return ll

In [113]:
classbyclass = []
nclasses = ypredict[0, :].size
for i in range(nclasses):
    result = llfun(ytest[:, i], ypredict[:, i])
    classbyclass.append(result)


In [116]:
sum(classbyclass)

34.538776394910919

In [95]:
scores = cross_val_score(clf, X, y, cv=3, scoring='log_loss')
print(scores, scores.mean(), scores.std())

ValueError: could not broadcast input array from shape (888,2) into shape (888)

In [11]:
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
test = pd.read_csv(dloc + 'test.csv')

In [14]:
Xtest = test

In [15]:
ypredict = clf.predict(Xtest)

In [16]:
ypredict

array([2, 0, 9, ..., 3, 9, 2])

In [17]:
dfpredict = pd.DataFrame(ypredict)

In [22]:
dfpredict.columns = ['Label']

In [26]:
dfpredict['ImageId'] = np.arange(28000) + 1

In [27]:
dfpredict.to_csv(dloc + 'predict_RFbenchmark.csv', index=False)

In [80]:
files = glob('../../BigData/kaggle-right-whale/imgs/*_small.jpg')
for file in files:
    im1 = imread(file)
    imsmall = resize(im1, (20,30,3))
    splitfile = os.path.splitext(file)
    fsmall = splitfile[0] + '_verysmall' + splitfile[1]
    imsave(fsmall, imsmall)