In [29]:
# Data directory /Users/scott/p5/data/interim/n35_train_valid_test

In [1]:
import numpy as np
import pandas as pd
import keras
from keras import backend as K
from keras.preprocessing import image
from keras.applications.vgg19 import decode_predictions
from keras.models import Sequential, Model
from keras.layers import Activation
from keras.layers.core import Dense, Flatten
from keras.optimizers import Adam, SGD
from keras.metrics import categorical_crossentropy
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import *
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
from glob import glob
import os.path
import PIL
import cv2
import re
from collections import defaultdict, namedtuple

Using TensorFlow backend.


In [2]:
pd.set_option('display.max_columns', None)

In [3]:
def get_classes(path):
    directories = glob(os.path.expanduser(path) + '/*')
    return [dir.split('/')[-1] for dir in directories]
n_classes =  len(get_classes('/Users/scott/p5/data/interim/n35_train_valid_test/train'))
n_classes

3

In [4]:
def to_gray_scale(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray_image = cv2.cvtColor(image,cv2.COLOR_GRAY2BGR)
    return np.array(gray_image)

In [5]:
 vgg19 = keras.applications.vgg19.VGG19(weights='imagenet', include_top=False, input_shape=(224,224, 3))













In [6]:
# note we exclude the final dense layers and add one back below, we would retrain it ourselves
base_model = vgg19
 
# Freeze convolutional layers
for layer in base_model.layers:
    layer.trainable = False 
    
x = base_model.output
x = Flatten()(x) # flatten from convolution tensor output 
predictions = Dense(n_classes, activation='softmax')(x) # should match # of classes predicted

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

In [7]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [8]:
train_path = os.path.expanduser('/Users/scott/p5/data/interim/n35_train_valid_test/train')
test_path = os.path.expanduser('/Users/scott/p5/data/interim/n35_train_valid_test/test')
valid_path = os.path.expanduser('/Users/scott/p5/data/interim/n35_train_valid_test/valid')

In [9]:
train_batches = ImageDataGenerator(preprocessing_function=to_gray_scale).flow_from_directory(train_path,
                                                         target_size=(224,224), 
                                                         batch_size=6)
valid_batches = ImageDataGenerator(preprocessing_function=to_gray_scale).flow_from_directory(valid_path, 
                                                         target_size=(224,224), 
                                                         batch_size=5)
test_batches = ImageDataGenerator(preprocessing_function=to_gray_scale).flow_from_directory(test_path, 
                                                        target_size=(224,224),
                                                        shuffle=False,
                                                        batch_size=1)


Found 735 images belonging to 3 classes.
Found 158 images belonging to 3 classes.
Found 158 images belonging to 3 classes.


In [10]:
# sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(Adam(lr=0.00002), loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(Adam(lr=0.00002), loss='categorical_crossentropy', metrics=['accuracy'])





In [11]:
# patient early stopping
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
# mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
hist = model.fit_generator(train_batches, 
                    steps_per_epoch=4, 
                    validation_data=valid_batches, 
                    validation_steps=4,
                    epochs=100,
                    verbose=2)#, 
#                     callbacks=[es, mc])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/100
 - 18s - loss: 5.4548 - acc: 0.3333 - val_loss: 5.0242 - val_acc: 0.4500
Epoch 2/100
 - 17s - loss: 4.6299 - acc: 0.3750 - val_loss: 3.0649 - val_acc: 0.5000
Epoch 3/100
 - 17s - loss: 4.5564 - acc: 0.3333 - val_loss: 4.1576 - val_acc: 0.4500
Epoch 4/100
 - 17s - loss: 3.6904 - acc: 0.5417 - val_loss: 3.6422 - val_acc: 0.4000
Epoch 5/100
 - 17s - loss: 2.4489 - acc: 0.5417 - val_loss: 3.3507 - val_acc: 0.4500
Epoch 6/100
 - 17s - loss: 4.9762 - acc: 0.2917 - val_loss: 2.1961 - val_acc: 0.5000
Epoch 7/100
 - 17s - loss: 4.8867 - acc: 0.5000 - val_loss: 2.4890 - val_acc: 0.4500
Epoch 8/100
 - 16s - loss: 4.2516 - acc: 0.4583 - val_loss: 4.5059 - val_acc: 0.2222
Epoch 9/100
 - 17s - loss: 5.5829 - acc: 0.3333 - val_loss: 4.0110 - val_acc: 0.2000
Epoch 10/100
 - 17s - loss: 4.4581 - acc: 0.3333 - val_loss: 2.7944 - val_acc: 0.4500
Epoch 11/100
 - 18s - loss: 3.3385 - acc: 0.4167 - va

Epoch 88/100
 - 17s - loss: 0.2231 - acc: 0.9583 - val_loss: 0.4852 - val_acc: 0.8333
Epoch 89/100
 - 17s - loss: 0.4893 - acc: 0.8750 - val_loss: 0.6329 - val_acc: 0.9000
Epoch 90/100
 - 18s - loss: 0.3892 - acc: 0.8333 - val_loss: 0.3015 - val_acc: 0.9000
Epoch 91/100
 - 16s - loss: 0.5751 - acc: 0.8333 - val_loss: 0.1783 - val_acc: 0.9000
Epoch 92/100
 - 17s - loss: 0.1720 - acc: 0.9583 - val_loss: 1.1401 - val_acc: 0.8500
Epoch 93/100
 - 17s - loss: 1.1971 - acc: 0.7952 - val_loss: 0.2845 - val_acc: 0.9000
Epoch 94/100
 - 17s - loss: 0.3185 - acc: 0.9583 - val_loss: 1.3725 - val_acc: 0.8000
Epoch 95/100
 - 17s - loss: 1.1807 - acc: 0.8333 - val_loss: 0.7282 - val_acc: 0.8500
Epoch 96/100
 - 18s - loss: 0.0262 - acc: 1.0000 - val_loss: 0.3377 - val_acc: 0.8889
Epoch 97/100
 - 17s - loss: 0.6720 - acc: 0.9167 - val_loss: 0.4681 - val_acc: 0.9500
Epoch 98/100
 - 16s - loss: 0.3039 - acc: 0.9167 - val_loss: 1.0620 - val_acc: 0.9000
Epoch 99/100
 - 16s - loss: 0.0576 - acc: 0.9583 - val

In [None]:
for key in hist.history:
    print(key)

In [None]:
asdf =

In [None]:
!ls

In [12]:
model.save('hieroglyphs_vgg19_N35_only_custom_images.h5')

In [None]:
# predictions = model.predict_generator(test_batches, steps=1, verbose=1)

In [13]:
STEP_SIZE_TEST=test_batches.n//test_batches.batch_size
test_batches.reset()
pred=model.predict_generator(test_batches,
steps=STEP_SIZE_TEST,
verbose=1)
# print(pred)

[[9.99973178e-01 2.63251186e-05 5.02307557e-07]
 [7.55695999e-01 2.44300872e-01 3.11120584e-06]
 [1.00000000e+00 2.85441294e-15 4.34610042e-10]
 [1.00000000e+00 3.16065352e-09 6.28422148e-10]
 [9.99751866e-01 2.75514982e-18 2.48137832e-04]
 [1.00000000e+00 1.17585478e-14 4.33601732e-09]
 [6.47314417e-04 6.35666907e-01 3.63685757e-01]
 [9.70085204e-01 2.99147833e-02 2.13802887e-09]
 [9.99544203e-01 1.69322163e-06 4.54144349e-04]
 [9.99998569e-01 1.69218028e-09 1.43858256e-06]
 [1.00000000e+00 5.95598332e-13 5.90283271e-08]
 [9.99999285e-01 7.46490969e-07 1.81530453e-08]
 [9.99999881e-01 3.51675876e-11 1.04026007e-07]
 [5.60435047e-03 9.94394422e-01 1.22321558e-06]
 [1.00000000e+00 4.12518353e-11 3.68662184e-11]
 [1.00000000e+00 3.33161276e-12 9.99112112e-11]
 [9.97398376e-01 1.20960991e-07 2.60152901e-03]
 [9.99998569e-01 3.29491286e-07 1.06654215e-06]
 [9.99980211e-01 1.93540018e-05 4.29059781e-07]
 [9.96335387e-01 4.27486346e-04 3.23714339e-03]
 [9.99972582e-01 1.18550679e-05 1.561228

In [14]:
predicted_class_indices=np.argmax(pred,axis=1)
predicted_class_indices

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2])

In [15]:
labels = (train_batches.class_indices)
print(labels)
labels = dict((v,k) for k,v in labels.items())
print(labels)
predictions = [labels[k] for k in predicted_class_indices]

{'N35': 0, 'N35_part': 1, 'noise': 2}
{0: 'N35', 1: 'N35_part', 2: 'noise'}


In [16]:
predictions[:11]

['N35',
 'N35',
 'N35',
 'N35',
 'N35',
 'N35',
 'N35_part',
 'N35',
 'N35',
 'N35',
 'N35']

In [24]:
def get_glyph_name(series):
    file = series['filename']
    m = re.match(r'^([A-Z]{1}\d+)/', file)
    part = re.search(r'(N35_part)/', file)
    if m:
        return m.group(1)
    elif part:
        return part.group(1)
    else:
        return 'noise'

In [25]:
pred_df = pd.DataFrame(predictions, columns=['predicted'])
pred_df = pred_df.merge(pd.DataFrame(test_batches.filenames, columns=['filename']), left_index=True, right_index=True)
pred_df['glyph'] = pred_df.apply(lambda row: get_glyph_name(row), axis=1)
pred_df['match'] = pred_df['predicted'] == pred_df['glyph']
pred_df.to_csv('/Users/scott/p5/m17_predictions.csv', index=False)

In [26]:
acc = pred_df['match'].sum() / len(pred_df)
acc

0.8481012658227848

In [28]:
pred_df.sample(5)

Unnamed: 0,predicted,filename,glyph,match
14,N35,N35/img_x_1488_y_153.png,N35,True
25,N35,N35/img_x_419_y_497.png,N35,True
144,noise,noise/img_x_756_y_605.png,noise,True
121,N35_part,noise/img_x_1599_y_282.png,noise,False
136,noise,noise/img_x_437_y_597.png,noise,True


In [None]:
m17 = pred_df[pred_df['predicted'] == 'M17']

In [None]:
m17.head()

In [None]:
m17.filename[:5]

In [None]:
import shutil
for image_file in ['/Users/scott/p5/data/interim/m17/' + name for name in m17.filename.sample(100)]:
    print(image_file)
    shutil.copy2(image_file, '/Users/scott/p5/data/interim/m17/positives')
    


In [None]:
 # /Users/scott/p5/data/interim/m17

In [None]:
model = load_model(os.path.expanduser('hieroglyphs_vgg19_M17_only.h5'))

In [None]:
test_batches = ImageDataGenerator(preprocessing_function=to_gray_scale).flow_from_directory(os.path.expanduser('/Users/scott/p5/data/interim/m17'), 
                                                        target_size=(224,224),
                                                        shuffle=False,
                                                        batch_size=1)

In [None]:
# model.predict()
# cv.im

### PLAY

In [None]:
temp = np.random.randn(50,100)
# temp

In [None]:
plt.imshow(temp,cmap = 'jet')
plt.colorbar()

In [None]:
import cv2 


In [None]:
!ls

In [None]:
img = cv2.imread('SarcEastGH.jpg')  


In [None]:
img.shape

In [None]:
plt.figure(figsize=[10,10])
plt.imshow(img)

In [None]:
temp = np.random.randn(1060,1720,3)


In [None]:
plt.figure(figsize=[10,10])
plt.imshow(img+temp)