In [1]:
%matplotlib inline
import os
import numpy as np
import hashlib
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pytesseract
import pandas as pd
import cv2

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

Using TensorFlow backend.


In [2]:
VIOLATIONS = [
    # entry permit
    'Entry Permit: Invalid Id Number',
    'Entry Permit: Invalid Expiration Date',
    'Entry Permit: Invalid Name',
    'Missing Entry Permit',
    'Forged Entry Permit',

    # passport
    'Missing Passport',
    'Passport: Invalid Gender',
    'Passport: Invalid Expiration Date',
    'Passport: Non-matching photo',
    'Passport: Invalid Issuing City',
    
    # id card
    'Missing Id Card',
    'Id Card: Invalid Birth Date',
    'Id Card: Non-matching photo',
    'Id Card: Invalid Weight',
    'Id Card: Invalid Height',
    'Id Card: Invalid Name',
    'Id Card: Invalid District',

    # id supplement
    'Missing Id Supplement',
    'Id Supplement: Invalid Expiration Date',
    'Id Supplement: Invalid Weight',
    'Id Supplement: Invalid Height',
    'Id Supplement: Invalid Thumbprint',
    'Id Supplement: Invalid Description',

    # responses
    'Incorrect purpose response',
    'Incorrect visit duration response',
    
    # Diplomatic
    'No Diplomatic Access to Arstotzka',
    'Diplomatic Auth: Invalid Name',
    'Diplomatic Auth: Invalid Id Number',
    'Forged Diplomatic Auth',

    # Work pass
    'Missing Work Pass',
    'Work Pass: Invalid Name',
    'Forged Work Pass',
    'Work Pass: Invalid Work End Date',

    # Wanted criminal
    'Wanted criminal admitted',
]

In [3]:
df = pd.concat((
    pd.concat((
        pd.read_csv(os.path.join(root, 'output.csv'), names=VIOLATIONS),
        pd.DataFrame([[os.path.basename(root)]], columns=['id'])
    ), axis=1)
    for root, dirs, files in os.walk('data')
    if 'input.png' in files and 'output.csv' in files
)).set_index('id')
df

Unnamed: 0_level_0,Entry Permit: Invalid Id Number,Entry Permit: Invalid Expiration Date,Entry Permit: Invalid Name,Missing Entry Permit,Forged Entry Permit,Missing Passport,Passport: Invalid Gender,Passport: Invalid Expiration Date,Passport: Non-matching photo,Passport: Invalid Issuing City,...,Incorrect visit duration response,No Diplomatic Access to Arstotzka,Diplomatic Auth: Invalid Name,Diplomatic Auth: Invalid Id Number,Forged Diplomatic Auth,Missing Work Pass,Work Pass: Invalid Name,Forged Work Pass,Work Pass: Invalid Work End Date,Wanted criminal admitted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d7bad5ce-5c10-4491-b4e8-65c3e9b91c47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
eb677b4b-1f4e-4d49-a1a9-be9b64db0917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c0fded9c-edd4-44d4-b14a-05d088889744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
f1843889-0750-4bed-b3a7-8cc5ff15a881,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c57622ba-1183-4b0a-bc8c-cbe4c7817d22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6bb935f9-754d-4cb3-b643-a6aa2bd543d9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00a35126-7d0c-4c7b-b0a1-8ee518e0d79c,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46770a21-e41c-4739-8431-f46b7d4fc369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
f57d3e67-1b30-4535-b7c6-fe11cfc5739d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67c34a99-30dd-4361-955f-c1db66c0104e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df['Wanted criminal admitted'].sum(), df.shape[0]

(308, 7705)

In [7]:
df = pd.concat((df[df['Wanted criminal admitted'] == 1], df[df['Wanted criminal admitted'] == 0].sample(n=df['Wanted criminal admitted'].sum())))
df.shape

(616, 34)

In [22]:
X, y = [], []
for id, tag in zip(df.index, df['Wanted criminal admitted']):
    im = Image.open(os.path.join('data', id, 'input.png'))
    out = im.resize([int(0.5 * s) for s in im.size])
    X.append(np.array(out))
    y.append(tag)

In [23]:
X[0].shape

(320, 5387, 3)

In [24]:
# https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=X[0].shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
model.fit(np.array(X), y,
          epochs=50,
          batch_size=batch_size,
          validation_split=0.2,
          verbose = 1)

Train on 492 samples, validate on 124 samples
Epoch 1/50
Epoch 2/50
 48/492 [=>............................] - ETA: 20:28 - loss: 8.0590 - acc: 0.5000