In [None]:
import pandas as pd 
import numpy as np
import os
from matplotlib import pyplot as plt
from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.svm import SVC, OneClassSVM
from sklearn.ensemble import IsolationForest, RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Seed NumPy's global random number generator so that draws made through it
# later in the notebook (e.g. np.random.choice) repeat from a fresh kernel.
np.random.seed(11)

# Preprocessing
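Loading the data, extracting randomly selected 25x25 patches from each image (`max_patches`/15 per Campendonk painting and `max_patches`/2 per painting by any other artist, with `max_patches` = 10,000), flattening each patch into one row of a 2D matrix, and creating the train/test split. Note: `KFold` is imported, but no k-fold split is actually created in this cell.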

In [None]:
# Build the patch dataset. Every painting under images/<artist>/ contributes
# randomly selected n x n patches; patches from the 'campendonk' folder are
# labelled 0 and patches from any other artist folder are labelled 1.
n = 25
max_patches = 10000
patch_blocks, label_blocks = [], []
# sorted(): os.listdir returns entries in arbitrary order, and the row order
# feeds the seeded random draws below, so pin it for repeatable runs.
for artist in sorted(os.listdir('images')):
    is_campendonk = artist == 'campendonk'
    # Per-painting patch budget: max_patches/15 for campendonk, max_patches/2 otherwise.
    p = round(max_patches/15) if is_campendonk else round(max_patches/2)
    for painting in sorted(os.listdir(os.path.join('images', artist))):
        pth = os.path.join('images', artist, painting)
        x = extract_patches_2d(plt.imread(pth), patch_size=(n,n), max_patches=p)
        y_ = np.zeros(x.shape[0]) if is_campendonk else np.ones(x.shape[0])
        patch_blocks.append(x)
        label_blocks.append(y_)
# Concatenate once instead of re-copying the growing array for every painting.
X = np.concatenate(patch_blocks, axis=0)
y = np.concatenate(label_blocks)
original_shape = X.shape
# Flatten each patch into one feature row and divide by 255
# (assumes 8-bit pixel values as returned for JPEG input - TODO confirm).
X = X.reshape((X.shape[0], -1))/255
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
# Row indices of the training subsample used by the SVC cells. Drawn without
# replacement so no row is duplicated, and capped at the training-set size.
sample = np.random.choice(X_train.shape[0], min(6000, X_train.shape[0]), replace=False)

# KNeighbors

In [None]:
# k-nearest neighbours: sweep n_neighbors over ten values between 3 and 25,
# record recall / precision / F1 / MCC on the test split for each, plot them,
# then refit with the k that gave the highest precision (metric column 1).
knn = KNeighborsClassifier(n_jobs=-1)
# Build the grid once so the sweep, the plot and the final pick share it.
k_grid = [int(round(k)) for k in np.linspace(3,25,10)]
scores = []
for k in k_grid:
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))
plt.plot(k_grid, scores)
plt.title('KNN')
plt.xlabel('Neighbors')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
# Create the output folder first so the sweep is not lost to a missing directory.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/knn.png')
final_k = k_grid[int(np.argmax(np.array(scores)[:,1]))]
knn.set_params(n_neighbors=final_k)
knn.fit(X_train, y_train)
print(confusion_matrix(y_test, knn.predict(X_test)))

# Linear SVM

In [None]:
# Linear-kernel SVC: sweep C over ten values between 0.001 and 100, training on
# the `sample` subset of the training rows, plot recall / precision / F1 / MCC
# on the test split, then refit with the C that gave the highest recall.
svm = SVC(kernel='linear')
# One shared grid: the original plotted against linspace(.0001, ...) while
# sweeping linspace(.001, ...), so the x-axis did not match the fitted values.
c_grid = np.linspace(.001,100,10)
scores = []
for c in c_grid:
    svm.set_params(C=c)
    svm.fit(X_train[sample], y_train[sample])
    y_pred = svm.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))
plt.plot(c_grid, scores)
plt.legend(['Recall','Precision','F1','MCC'])
plt.title('Linear SVM')
plt.xlabel('C')
plt.ylabel('Score')
# Create the output folder if needed so savefig cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/linear svm.png')
final_c = c_grid[np.argmax(np.array(scores)[:,0])]
svm.set_params(C=final_c)
svm.fit(X_train[sample],y_train[sample])
print(confusion_matrix(y_test, svm.predict(X_test)))

# RBF SVM

In [None]:
# RBF-kernel SVC: try ten C values between 0.001 and 1 on the `sample` subset
# of the training rows, chart the four test-split metrics, and refit with the
# C whose recall (metric column 0) was highest.
c_values = np.linspace(.001,1,10)
metric_fns = (recall_score, precision_score, f1_score, matthews_corrcoef)
X_sub, y_sub = X_train[sample], y_train[sample]

svm_r = SVC(kernel='rbf')
scores = []
for c in c_values:
    svm_r.set_params(C=c)
    svm_r.fit(X_sub, y_sub)
    y_pred = svm_r.predict(X_test)
    scores.append(tuple(fn(y_test, y_pred) for fn in metric_fns))

plt.plot(c_values, scores)
plt.title('RBF SVM')
plt.xlabel('C')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
plt.savefig('outputs/rbf svm.png')

best_idx = np.asarray(scores)[:,0].argmax()
final_c = c_values[best_idx]
svm_r.set_params(C=final_c)
svm_r.fit(X_sub, y_sub)
print(confusion_matrix(y_test, svm_r.predict(X_test)))

# Random Forest

In [None]:
# Random forest: try ten forest sizes between 5 and 100 trees, chart the four
# test-split metrics, and refit with the size whose recall (metric column 0)
# was highest.
tree_counts = [round(t) for t in np.linspace(5,100,10)]
metric_fns = (recall_score, precision_score, f1_score, matthews_corrcoef)

rf = RandomForestClassifier(n_jobs=-1)
scores = []
for n_trees in tree_counts:
    rf.set_params(n_estimators=n_trees)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores.append(tuple(fn(y_test, y_pred) for fn in metric_fns))

plt.plot(tree_counts, scores)
plt.title('Random Forest')
plt.xlabel('Trees')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
plt.savefig('outputs/random forest.png')

best_idx = np.asarray(scores)[:,0].argmax()
final_trees = tree_counts[best_idx]
rf.set_params(n_estimators=final_trees)
rf.fit(X_train, y_train)
print(confusion_matrix(y_test, rf.predict(X_test)))

# AdaBoost

In [None]:
# AdaBoost: try ten ensemble sizes between 3 and 150 estimators, chart the four
# test-split metrics, and refit with the size whose recall (metric column 0)
# was highest.
stump_counts = [round(s) for s in np.linspace(3,150,10)]

ada = AdaBoostClassifier()
scores = []
for n_stumps in stump_counts:
    ada.set_params(n_estimators=n_stumps)
    ada.fit(X_train, y_train)
    y_pred = ada.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))

plt.plot(stump_counts, scores)
plt.title('AdaBoost')
plt.xlabel('Stumps')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
plt.savefig('outputs/adaboost.png')

recall_by_size = np.array(scores)[:,0]
final_s = stump_counts[np.argmax(recall_by_size)]
ada.set_params(n_estimators=final_s)
ada.fit(X_train, y_train)
print(confusion_matrix(y_test, ada.predict(X_test)))

# Naive Bayes

In [None]:
# Gaussian naive Bayes: try ten var_smoothing values between 0.0001 and 1,
# chart the four test-split metrics, and refit with the value whose recall
# (metric column 0) was highest.
smoothing_grid = np.linspace(.0001,1,10)

nb = GaussianNB()
scores = []
for smoothing in smoothing_grid:
    nb.set_params(var_smoothing=smoothing)
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))

plt.plot(smoothing_grid, scores)
plt.title('Naive Bayes')
plt.xlabel('Smoothing')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
plt.savefig('outputs/naive bayes.png')

recall_by_smoothing = np.array(scores)[:,0]
final_v = smoothing_grid[np.argmax(recall_by_smoothing)]
nb.set_params(var_smoothing=final_v)
nb.fit(X_train, y_train)
print(confusion_matrix(y_test, nb.predict(X_test)))

# One Class SVM - Linear

In [None]:
# One-class SVM (linear kernel) trained only on the class-0 rows. Sweep nu over
# ten values between 0.0001 and 1, plot recall / precision / F1 / MCC on the
# test split, then refit with the nu that gave the highest recall.
# Predictions of +1 (inlier) are mapped to label 0 and anything else to 1.
oc_svm_l = OneClassSVM(kernel='linear')
nu_grid = np.linspace(.0001,1,10)
X_inliers = X_train[y_train==0]
scores = []
for nu in nu_grid:
    oc_svm_l.set_params(nu=nu)
    oc_svm_l.fit(X_inliers)
    y_pred = np.where(oc_svm_l.predict(X_test)==1,0,1)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))
plt.plot(nu_grid, scores)
plt.legend(['Recall','Precision','F1','MCC'])
plt.title('One Class SVM - Linear')
# The swept parameter is nu, so label the axis accordingly (was 'Gamma').
plt.xlabel('Nu')
plt.ylabel('Score')
# Create the output folder if needed so savefig cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/ocsvm-l.png')
final_v = nu_grid[np.argmax(np.array(scores)[:,0])]
oc_svm_l.set_params(nu=final_v)
# Fit the final model on class 0 only, exactly as in the sweep. The original
# passed all of X_train here, which mixed both classes into the "normal" set.
oc_svm_l.fit(X_inliers)
print(confusion_matrix(y_test, np.where(oc_svm_l.predict(X_test)==1,0,1)))

# One Class SVM - RBF

In [None]:
# One-class SVM (RBF kernel) trained only on the class-0 rows. Sweep nu over
# ten values between 0.0001 and 1, plot recall / precision / F1 / MCC on the
# test split, then refit with the nu that gave the highest recall.
# Predictions of +1 (inlier) are mapped to label 0 and anything else to 1.
oc_svm_r = OneClassSVM(kernel='rbf')
nu_grid = np.linspace(.0001,1,10)
X_inliers = X_train[y_train==0]
scores = []
for nu in nu_grid:
    oc_svm_r.set_params(nu=nu)
    oc_svm_r.fit(X_inliers)
    y_pred = np.where(oc_svm_r.predict(X_test)==1,0,1)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))
plt.plot(nu_grid, scores)
plt.legend(['Recall','Precision','F1','MCC'])
plt.title('One Class SVM - RBF')
# The swept parameter is nu, so label the axis accordingly (was 'Gamma').
plt.xlabel('Nu')
plt.ylabel('Score')
# Create the output folder if needed so savefig cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/ocsvm-r.png')
final_v = nu_grid[np.argmax(np.array(scores)[:,0])]
oc_svm_r.set_params(nu=final_v)
oc_svm_r.fit(X_inliers)
print(confusion_matrix(y_test, np.where(oc_svm_r.predict(X_test)==1,0,1)))

# Isolation Forest

In [None]:
# Isolation forest fitted on the class-0 training rows only: try ten forest
# sizes between 5 and 100 trees, chart the four test-split metrics, and refit
# with the size whose precision (metric column 1) was highest.
# A prediction of +1 (inlier) becomes label 0; anything else becomes label 1.
tree_counts = [round(t) for t in np.linspace(5,100,10)]
metric_fns = (recall_score, precision_score, f1_score, matthews_corrcoef)
X_inliers = X_train[y_train==0]

iso_f = IsolationForest(n_jobs=-1)
scores = []
for n_trees in tree_counts:
    iso_f.set_params(n_estimators=n_trees)
    iso_f.fit(X_inliers)
    y_pred = np.where(iso_f.predict(X_test)==1,0,1)
    scores.append(tuple(fn(y_test, y_pred) for fn in metric_fns))

plt.plot(tree_counts, scores)
plt.title('Isolation Forest')
plt.xlabel('Trees')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
plt.savefig('outputs/iso-f.png')

best_idx = np.asarray(scores)[:,1].argmax()
final_v = tree_counts[best_idx]
iso_f.set_params(n_estimators=final_v)
iso_f.fit(X_inliers)
print(confusion_matrix(y_test, np.where(iso_f.predict(X_test)==1,0,1)))

# Can RandomForest correctly place each image?

In [None]:
# Random forest, narrower search: try ten forest sizes between 21 and 61 trees,
# chart the four test-split metrics, and refit with the size whose recall
# (metric column 0) was highest.
tree_counts = [round(t) for t in np.linspace(21,61,10)]

rf = RandomForestClassifier(n_jobs=-1)
scores = []
for n_trees in tree_counts:
    rf.set_params(n_estimators=n_trees)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))

plt.plot(tree_counts, scores)
plt.title('Random Forest')
plt.xlabel('Trees')
plt.ylabel('Score')
plt.legend(['Recall','Precision','F1','MCC'])
plt.savefig('outputs/random forest refinement.png')

recall_by_size = np.array(scores)[:,0]
final_trees = tree_counts[np.argmax(recall_by_size)]
rf.set_params(n_estimators=final_trees)
rf.fit(X_train, y_train)
print(confusion_matrix(y_test, rf.predict(X_test)))

In [None]:
# Display the number of trees selected by the refinement sweep in the previous cell.
final_trees

In [None]:
# Classify each painting as a whole. Up to max_patches random n x n patches are
# drawn from the painting, `rf` predicts a label for every patch, and the mean
# prediction is the painting's score: >= 0.5 means 'beltracchi' (counterfeit),
# otherwise 'campendonk' (real). The verdict is compared with the folder name.
counter = 0
total = 0  # paintings actually scored; replaces the hard-coded 17
max_patches = 500000
# sorted(): os.listdir order is arbitrary; pin it so the report is stable.
for artist in sorted(os.listdir('images')):
    for painting in sorted(os.listdir(os.path.join('images', artist))):
        pth = os.path.join('images',artist,painting)
        x = extract_patches_2d(plt.imread(pth), patch_size=(n,n), max_patches=max_patches)
        x = x.reshape((-1,x.shape[1] * x.shape[2] * x.shape[3]))/255
        # Separate name so the training labels in `y` are not overwritten.
        patch_pred = rf.predict(x)
        answer = np.mean(patch_pred)
        if answer >= 0.5:
            artist_answer = 'beltracchi'
            verdict = 'counterfeit'
        else:
            artist_answer = 'campendonk'
            verdict = 'real'
        total += 1
        if artist_answer == artist:
            check = 'correct'
            counter += 1
        else:
            check = 'incorrect'
        print("Painting {0} scored {1}, identifying the artist as {2} and calling it {3}. This answer is {4}".format(
            painting,
            str(round(answer,5)),
            artist_answer,
            verdict,
            check
        ))
# Scale to percent before rounding to avoid artefacts such as 88.24000000000001.
percent_correct = round(100 * counter / total, 2) if total else 0.0
print('Total correct: {0} out of {1} ({2}%)'.format(counter, total, percent_correct))

# Further experimenting: Test images

In [None]:
# Rebuild the patch dataset while holding out the paintings listed in
# `test_paintings`, so that no patch from them reaches training. Labels are 0
# for the 'campendonk' folder and 1 for any other artist folder.
test_paintings = ['Landschaft-mit-Pferden.jpg','bucolic-landscape-1913.jpg']
n = 25
max_patches = 50000
patch_blocks, label_blocks = [], []
# sorted(): os.listdir returns entries in arbitrary order, and the row order
# feeds the seeded random split below, so pin it for repeatable runs.
for artist in sorted(os.listdir('images')):
    is_campendonk = artist == 'campendonk'
    # Per-painting patch budget: max_patches/15 for campendonk, max_patches/2 otherwise.
    p = round(max_patches/15) if is_campendonk else round(max_patches/2)
    for painting in sorted(os.listdir(os.path.join('images', artist))):
        if painting in test_paintings:
            continue
        pth = os.path.join('images',artist,painting)
        x = extract_patches_2d(plt.imread(pth), patch_size=(n,n), max_patches=p)
        y_ = np.zeros(x.shape[0]) if is_campendonk else np.ones(x.shape[0])
        patch_blocks.append(x)
        label_blocks.append(y_)
# Concatenate once instead of re-copying the growing array for every painting.
X = np.concatenate(patch_blocks, axis=0)
y = np.concatenate(label_blocks)
original_shape = X.shape
# Flatten each patch into one feature row and divide by 255
# (assumes 8-bit pixel values as returned for JPEG input - TODO confirm).
X = X.reshape((X.shape[0], -1))/255
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [None]:
# Searching for generalization
# Searching for generalization
# Random forest on the hold-out dataset: sweep ten forest sizes between 5 and
# 100 trees, plot recall / precision / F1 / MCC on the test split, pick the
# size with the highest recall, report its confusion matrix, then refit on all
# of X, y.
rf = RandomForestClassifier(n_jobs=-1)
tree_grid = [int(round(t)) for t in np.linspace(5,100,10)]
scores = []
for n_trees in tree_grid:
    rf.set_params(n_estimators=n_trees)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))
plt.plot(tree_grid, scores)
plt.legend(['Recall','Precision','F1','MCC'])
plt.title('Random Forest')
plt.xlabel('Trees')
plt.ylabel('Score')
# Create the output folder if needed so savefig cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/random forest with test condition.png')
# The original used np.argmax(scores[1]), i.e. the argmax over the four metrics
# of the second grid point, which can only return 0-3. Select over the grid by
# recall (column 0), as the other random-forest sweeps in this notebook do.
final_trees = tree_grid[int(np.argmax(np.array(scores)[:,0]))]
rf.set_params(n_estimators=final_trees)
# Score the chosen size on rows the model has not seen: X contains X_test, so
# a confusion matrix taken after fitting on X would be inflated.
rf.fit(X_train, y_train)
print(confusion_matrix(y_test, rf.predict(X_test)))
# Leave `rf` fitted on every available row, as the original cell did.
rf.fit(X, y)

In [None]:
# Refining
# Refining
# Random forest on the hold-out dataset, narrower search: sweep twenty grid
# points between 21 and 41 trees, plot recall / precision / F1 / MCC on the
# test split, pick the size with the highest recall, report its confusion
# matrix, then refit on all of X, y.
rf = RandomForestClassifier(n_jobs=-1)
tree_grid = [int(round(t)) for t in np.linspace(21,41,20)]
scores = []
for n_trees in tree_grid:
    rf.set_params(n_estimators=n_trees)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    scores.append((
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ))
plt.plot(tree_grid, scores)
plt.legend(['Recall','Precision','F1','MCC'])
plt.title('Random Forest')
plt.xlabel('Trees')
plt.ylabel('Score')
# Create the output folder if needed so savefig cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/random forest refinement with test condition.png')
final_trees = tree_grid[int(np.argmax(np.array(scores)[:,0]))]
rf.set_params(n_estimators=final_trees)
# Score the chosen size on rows the model has not seen: X contains X_test, so
# a confusion matrix taken after fitting on X would be inflated.
rf.fit(X_train, y_train)
print(confusion_matrix(y_test, rf.predict(X_test)))
# Final model for the held-out paintings: train on every available row.
rf.fit(X, y)

In [None]:
# Display the number of trees selected by the refinement sweep in the previous cell.
final_trees

In [None]:
# Classify only the held-out paintings listed in `test_paintings`. Up to
# max_patches random n x n patches are drawn from each, `rf` predicts a label
# for every patch, and the mean prediction is the painting's score: >= 0.5
# means 'beltracchi' (counterfeit), otherwise 'campendonk' (real). The verdict
# is compared with the folder name.
counter = 0
total = 0  # held-out paintings actually found and scored; replaces the hard-coded 2
max_patches = 500000
# sorted(): os.listdir order is arbitrary; pin it so the report is stable.
for artist in sorted(os.listdir('images')):
    for painting in sorted(os.listdir(os.path.join('images', artist))):
        if painting not in test_paintings:
            continue
        pth = os.path.join('images',artist,painting)
        x = extract_patches_2d(plt.imread(pth), patch_size=(n,n), max_patches=max_patches)
        x = x.reshape((-1,x.shape[1] * x.shape[2] * x.shape[3]))/255
        # Separate name so the training labels in `y` are not overwritten.
        patch_pred = rf.predict(x)
        answer = np.mean(patch_pred)
        if answer >= 0.5:
            artist_answer = 'beltracchi'
            verdict = 'counterfeit'
        else:
            artist_answer = 'campendonk'
            verdict = 'real'
        total += 1
        if artist_answer == artist:
            check = 'correct'
            counter += 1
        else:
            check = 'incorrect'
        print("Painting {0} scored {1}, identifying the artist as {2} and calling it {3}. This answer is {4}".format(
            painting,
            str(round(answer,5)),
            artist_answer,
            verdict,
            check
        ))
# Scale to percent before rounding to avoid floating-point artefacts in the output.
percent_correct = round(100 * counter / total, 2) if total else 0.0
print('Total correct: {0} out of {1} ({2}%)'.format(counter, total, percent_correct))