# Feature Extraction

In [5]:
import os
import numpy as np
import pandas as pd
import cv2
from matplotlib import pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
import scipy.stats as ss
from skimage.feature import local_binary_pattern
from multiprocessing.dummy import Pool as ThreadPool 

n_images = 4


def get_noise_features(noise, prefix):
    noise_features = {
        prefix + '_mean': np.mean(noise),
        prefix + '_std': np.std(noise),
        prefix + '_median': np.median(noise),
        prefix + '_min': np.min(noise),
        prefix + '_max': np.max(noise),
        prefix + '_sum': np.sum(noise),
        prefix + '_kurtosis': ss.kurtosis(noise.flatten()),
        prefix + '_skewness': ss.skew(noise.flatten()),
        prefix + '_entropy': ss.entropy(noise.flatten()),
        prefix + '_variation': ss.variation(noise.flatten()),
        prefix + '_noises_zero_count': len(noise[noise == 0]),
        prefix + '_non_zero_count': len(noise[noise > 0]),
        prefix + '_non_zero_mean': np.mean(noise[noise > 0]),
        prefix + '_non_zero_std': np.std(noise[noise > 0]),
        prefix + '_non_zero_median': np.median(noise[noise > 0]),
        prefix + '_nnon_zero_entropy': ss.entropy(noise[noise > 0].flatten()),
        prefix + '_non_zero_variation': ss.variation(noise[noise > 0].flatten()),
        prefix + '_low_value_count': (noise < 50).sum(),
        prefix + '_high_value_count': (noise > 200).sum()
    }
    return noise_features


def format_(name, values):
    return {'{}{}'.format(name, i): v for i, v in enumerate(values)}

def get_image_features(img, x1, y1, x2, y2):
    img = img[x1:x2, y1:y2]

    img_nl = cv2.fastNlMeansDenoisingColored(img)
    noise_nl = img - img_nl
    
#     img_dw = denoise_wavelet(img, multichannel=True)
#     noise_dw = img - (img_dw * 255).astype(np.int)

    features = {}
    features.update(get_noise_features(noise_nl[:,:,0], 'noise_nl_r'))
    features.update(get_noise_features(noise_nl[:,:,1], 'noise_nl_g'))
    features.update(get_noise_features(noise_nl[:,:,2], 'noise_nl_b'))

#     features.update(get_noise_features(noise_dw[:,:,0], 'noise_dw_r'))
#     features.update(get_noise_features(noise_dw[:,:,1], 'noise_dw_g'))
#     features.update(get_noise_features(noise_dw[:,:,2], 'noise_dw_b'))

    features.update(format_('noise_nl_hist_r', cv2.calcHist([noise_nl], [0], None, [256], [0, 256]).flatten()))
    features.update(format_('noise_nl_hist_g', cv2.calcHist([noise_nl], [1], None, [256], [0, 256]).flatten()))
    features.update(format_('noise_nl_hist_b', cv2.calcHist([noise_nl], [2], None, [256], [0, 256]).flatten()))

#     features.update(format_('noise_dw_hist_r', cv2.calcHist([noise_dw], [0], None, [256], [0, 256]).flatten()))
#     features.update(format_('noise_dw_hist_g', cv2.calcHist([noise_dw], [1], None, [256], [0, 256]).flatten()))
#     features.update(format_('noise_dw_hist_b', cv2.calcHist([noise_dw], [2], None, [256], [0, 256]).flatten()))

    features.update(format_('img_hist_r', cv2.calcHist([img], [0], None, [256], [0, 256]).flatten()))
    features.update(format_('img_hist_g', cv2.calcHist([img], [1], None, [256], [0, 256]).flatten()))
    features.update(format_('img_hist_b', cv2.calcHist([img], [2], None, [256], [0, 256]).flatten()))

    #Get LBP from image 
    lbp_r = local_binary_pattern(img[:,:,0], 8, 3)
    lbp_g = local_binary_pattern(img[:,:,1], 8, 3)
    lbp_b = local_binary_pattern(img[:,:,2], 8, 3)  
    (hist_r, _) = np.histogram(lbp_r, bins=256)
    (hist_g, _) = np.histogram(lbp_g, bins=256)
    (hist_b, _) = np.histogram(lbp_b, bins=256)
    features.update(format_('img_hist_lbp_r', hist_r))
    features.update(format_('img_hist_lbp_g', hist_g))
    features.update(format_('img_hist_lbp_b', hist_b))
    
    return features


def get_path_features(path):
    features_list = []
    img = cv2.imread(path, cv2.COLOR_BGR2RGB)
#     for angle in [0, 90, 90, 90]:
#         if angle != 0:
#             img = np.rot90(img)
#         for _ in range(0, n_images):
    (h, w) = img.shape[:2]
#             x, y = np.random.randint(h-512), np.random.randint(w-512)
    x, y = 0, 0
    features = get_image_features(img, x, y, x + 512, y + 512) #canto esquerdo sup.
    features_list.append(features)

    x, y = h-512, w-512
    features = get_image_features(img, x, y, x + 512, y + 512) #canto direito inf.
    features_list.append(features)

    x, y = h, w
    features = get_image_features(img, x-512, 0, x, 512) #canto direito sup.
    features_list.append(features)

    x, y = h, w
    features = get_image_features(img, 0, y-512, 512, y) #canto esquerdo inf.
    features_list.append(features)

    x, y = h, w
    startx = x//2-(512//2)
    starty = y//2-(512//2)

    features = get_image_features(img, startx, starty, startx + cropx, starty+cropy) #centro
    features_list.append(features)

    features.update({
        'device': os.path.basename(os.path.dirname(path)),
        'filename': os.path.basename(path)
    })
    features_list.append(features)
    return features_list

def save_all_features(paths, output, verbose=False):
    with open(output, 'w') as myfile:
        for path in tqdm_notebook(paths):
            features_list = get_path_features(path)
            for features in features_list:
                myfile.write(str(features))

paths = ['train/Samsung-Galaxy-Note3/(GalaxyN3)54.jpg']
save_all_features(paths, 'features.json', True)




NameError: name 'cropx' is not defined

In [133]:
from datetime import datetime

paths = []
base_dir = '/media/carlosbaia/HD - CARLOS HENRIQUE/Unicamp/MO444/train'
for device_dir in os.listdir(base_dir):
    for file in os.listdir(os.path.join(base_dir, device_dir)):
        paths.append(os.path.join(base_dir, device_dir, file))
      
print('Extracting features of {} images...'.format(len(paths)))
print('Started at {}'.format(datetime.now()))
save_all_features(paths, 'features.json')
print('Finished at {}'.format(datetime.now()))

Extracting features of 2750 images...
Started at 2018-04-18 00:04:12.278056


  pk = 1.0*pk / np.sum(pk, axis=0)
  return a.std(axis) / a.mean(axis)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  return a.std(axis) / a.mean(axis)



Finished at 2018-04-18 09:43:45.975598


In [127]:
dw = denoise_wavelet(img, multichannel=True, method='BayesShrink')
dw_n = (dw  * 255).astype(np.int)
noise_dw = img - dw_n

TypeError: denoise_wavelet() got an unexpected keyword argument 'method'

In [125]:
(img.min(), img.max()), (dw_n.min(), dw_n.max()), (noise_dw.min(), noise_dw.max())

((0, 255), (0, 255), (0, 1))

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 4))
ax1.imshow(img)
ax2.imshow(dw_n)
ax3.imshow(noise_dw)

# Train Model

In [11]:
import pandas as pd
import numpy as np

df_train = pd.read_json('features.csv', names)
print(len(df_train))
df_train.head()

55000


Unnamed: 0,noises_mean,noises_std,noises_median,noises_min,noises_max,noises_sum,noises_entropy,noises_variation,noises_zero_count,noises_non_zero_count,...,img_hist_254,img_hist_255,img_mean,img_std,img_laplacian,img_sobel,img_low_color,img_high_color,device,filename
0,14.691711,59.080803,0.0,0.0,255.0,3851344.0,9.661123,4.02137,231331.0,30813.0,...,2380.0,2075.0,97.465164,54.889874,2451.360827,2680003.0,64987.0,8136.0,HTC-1-M7,(HTC-1-M7)1.jpg
1,87.854954,119.837133,2.0,0.0,255.0,23030649.0,11.457742,1.364034,81204.0,180940.0,...,7.0,24.0,103.408039,37.123013,163.88311,78539.58,59.0,3586.0,HTC-1-M7,(HTC-1-M7)1.jpg
2,97.886593,122.428506,3.0,0.0,255.0,25660383.0,11.568542,1.250718,59988.0,202156.0,...,0.0,0.0,80.891674,6.371986,177.33773,43676.17,513.0,0.0,HTC-1-M7,(HTC-1-M7)1.jpg
3,53.685238,103.161844,0.0,0.0,255.0,14073263.0,10.960092,1.921605,151929.0,110215.0,...,1.0,4.0,46.158791,35.731351,829.789852,750685.4,200631.0,130.0,HTC-1-M7,(HTC-1-M7)1.jpg
4,11.435402,52.508425,0.0,0.0,255.0,2997722.0,9.408241,4.591743,238541.0,23603.0,...,1.0,4.0,76.850822,49.462525,2529.561804,2470751.0,111583.0,685.0,HTC-1-M7,(HTC-1-M7)1.jpg


In [12]:
df_train[df_train.noises_mean == 0].device.value_counts()

Motorola-Nexus-6        18
Samsung-Galaxy-Note3    12
LG-Nexus-5x             11
Samsung-Galaxy-S4       11
Sony-NEX-7               9
Motorola-X               6
Motorola-Droid-Maxx      5
iPhone-4s                3
HTC-1-M7                 1
Name: device, dtype: int64

In [13]:
df_train.fillna(0, inplace=True)

In [14]:
df_train.replace(np.inf, 0, inplace=True)
df_train.replace(-np.inf, 0, inplace=True)

# ML model

In [15]:
features = df_train.columns.drop(['device', 'filename'])
X = df_train[features].values
y = df_train['device'].values

In [16]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

classifier = RandomForestClassifier(100, random_state=1986)

cv = StratifiedKFold(5, True, 1986)
scores = cross_val_score(classifier, X, y, scoring='accuracy', cv=cv)
print(scores, scores.mean())

[0.69181818 0.69763636 0.70963636 0.70054545 0.70772727] 0.7014727272727272
CPU times: user 5min 12s, sys: 1.9 s, total: 5min 13s
Wall time: 5min 18s


# Outros

In [48]:
from skimage.feature import local_binary_pattern

local_binary_pattern(img, 10, 5)

ValueError: The parameter `image` must be a 2-dimensional array