In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [64]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [48]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [65]:
artworks_sample = artworks.sample(frac=0.1, random_state=1)

In [66]:
# Select Columns.
artworks_sample = artworks_sample[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks_sample['URL'] = artworks_sample['URL'].notnull()
artworks_sample['ThumbnailURL'] = artworks_sample['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks_sample = artworks_sample[artworks_sample['Department']!='Film']
artworks_sample = artworks_sample[artworks_sample['Department']!='Media and Performance Art']
artworks_sample = artworks_sample[artworks_sample['Department']!='Fluxus Collection']

# Drop missing data.
artworks_sample = artworks_sample.dropna()

In [67]:
artworks_sample.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
72712,Maryan S. Maryan,(American),(Male),1967,Drawings & Prints,1974-01-10,True,True,59.5,76.3
31528,Savely Sorine,(Russian),(Male),1922,Drawings & Prints,1932-02-24,True,True,28.6,34.3
122991,Raymond Pettibon,(American),(Male),1985,Drawings & Prints,2014-12-17,True,True,21.6,14.0
23083,André Derain,(French),(Male),1938,Drawings & Prints,1964-10-06,False,False,20.0,15.1
5154,Fritz Schleifer,(German),(Male),1923,Architecture & Design,1999-06-30,True,True,100.3,73.3426


In [68]:
# Get data types.
artworks_sample.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [69]:
artworks_sample['DateAcquired'] = pd.to_datetime(artworks_sample.DateAcquired)
artworks_sample['YearAcquired'] = artworks_sample.DateAcquired.dt.year
artworks_sample['YearAcquired'].dtype

dtype('int64')

In [70]:
# Remove multiple nationalities, genders, and artists.
artworks_sample.loc[artworks_sample['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks_sample.loc[artworks_sample['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks_sample.loc[artworks_sample['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks_sample['Date'] = pd.Series(artworks_sample.Date.str.extract('([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks_sample.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks_sample.Artist)
nationalities = pd.get_dummies(artworks_sample.Nationality)
dates = pd.get_dummies(artworks_sample.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks_sample.Department

## Single Hidden Layer (1000)

In [71]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [72]:
mlp.score(X, Y)

0.7065994500458295

In [73]:
Y.value_counts()/len(Y)

Drawings & Prints        0.623465
Photography              0.227589
Architecture & Design    0.110174
Painting & Sculpture     0.035014
Media and Performance    0.003758
Name: Department, dtype: float64

In [74]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([0.55469108, 0.67673993, 0.62356717, 0.68256881, 0.68715596])

Not bad, but if we know that 62% of the rows are under Drawings & Prints group. A score of 62% is easily achived if the model clasify all the rows under Drawings & paints group. In addition crossvalidation scores are way less than the mlp score for the whole sample

## Two Hidden Layers (500)

In [75]:
# Establish and fit the model, with two layers, with 500 perceptrons on each layer.
mlp = MLPClassifier(hidden_layer_sizes=(500, 500))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 500), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [76]:
mlp.score(X, Y)

0.7099908340971586

In [78]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([0.71899314, 0.67124542, 0.67079321, 0.7087156 , 0.69495413])

Eventhough, the mlp score is almost the same as the mlp score for the previous model, but the cross validation scores are better.

## Four Hidden Layers (250)

In [79]:
# Establish and fit the model, with four layers, with 250 perceptrons on each layer.
mlp = MLPClassifier(hidden_layer_sizes=(250, 250, 250, 250))
mlp.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(250, 250, 250, 250), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [80]:
mlp.score(X, Y)

0.7961503208065994

In [81]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([0.74462243, 0.7239011 , 0.70884915, 0.7206422 , 0.7233945 ])

the score improved and it is much better but its score is much higher than cross validation scores

## Single Hidden Layer (100)

In [82]:
# Establish and fit the model, with a single, 100 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [83]:
mlp.score(X, Y)

0.6752520623281393

In [84]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([0.69748284, 0.67948718, 0.69050894, 0.71422018, 0.66376147])

lower score but more consistent with the cross validation scores

## Two Hidden Layers (100)

In [85]:
# Establish and fit the model, with two layers, with 100 perceptrons on each layer.
mlp = MLPClassifier(hidden_layer_sizes=(100, 100))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [86]:
mlp.score(X, Y)

0.7377635197066911

In [87]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)

array([0.65217391, 0.69230769, 0.57313159, 0.5706422 , 0.70412844])

Good score but again it is much higher than cross validation scores

## Four Hidden Layers (100)

In [88]:
# Establish and fit the model, with four layers, with 100 perceptrons on each layer.
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [89]:
mlp.score(X, Y)

0.7358386801099909

In [90]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)



array([0.70160183, 0.65796703, 0.72673086, 0.72201835, 0.72477064])

Good score with more consistent cross validation score. This score and cross validation is the best among the previous options