In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [3]:
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [4]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [8]:
artworks.head(n=100)

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6000,168.9000,1996
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451,1995
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3000,31.8000,1997
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8000,50.8000,1995
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4000,19.1000,1997
5,Bernard Tschumi,(),(Male),1976-77,Architecture & Design,1995-01-17,True,True,35.6000,45.7000,1995
6,Bernard Tschumi,(),(Male),1976-77,Architecture & Design,1995-01-17,True,True,35.6000,45.7000,1995
7,Bernard Tschumi,(),(Male),1976-77,Architecture & Design,1995-01-17,True,True,35.6000,45.7000,1995
8,Bernard Tschumi,(),(Male),1976-77,Architecture & Design,1995-01-17,True,True,35.6000,45.7000,1995
9,Bernard Tschumi,(),(Male),1976-77,Architecture & Design,1995-01-17,True,True,35.6000,45.7000,1995


In [6]:
# Get data types.
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [7]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [10]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat artists with other variables
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [11]:
# Alright! We've done our prep, let's build the model.
# Neural networks are hugely computationally intensive.
# This may take several minutes to run.

# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [12]:
mlp.score(X, Y)

0.70527009478029967

In [13]:
Y.value_counts()/len(Y)

Prints & Illustrated Books    0.523811
Photography                   0.225079
Architecture & Design         0.112399
Drawings                      0.103997
Painting & Sculpture          0.034714
Name: Department, dtype: float64

In [14]:
from sklearn.model_selection import cross_val_score


array([ 0.60579485,  0.69938948,  0.4706595 ,  0.53358209,  0.51613685])

In [15]:
# play around with different layers
# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,100,10))
mlp.fit(X, Y)
mlp.score(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([ 0.52376569,  0.52379107,  0.57915395,  0.52384183,  0.52384183])

In [16]:
# adjust the alpha values
mlp = MLPClassifier(hidden_layer_sizes=(1000,),alpha =.01)
mlp.fit(X, Y)
mlp.score(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([ 0.57517322,  0.66014149,  0.58269128,  0.48996899,  0.43356271])

In [17]:
# try changing the activation function for the hidden layer
# adjust the alpha values
mlp = MLPClassifier(hidden_layer_sizes=(1000,),activation ='logistic')
mlp.fit(X, Y)
mlp.score(X, Y)
cross_val_score(mlp, X, Y, cv=5)

array([ 0.60317845,  0.70064929,  0.57658574,  0.56784261,  0.52200039])

### Conclusions:
Only a few possible values were tested. Using a logistic activation function seemed to improve the original model.  Changing the alpha values and # and size of layers had a negative impact.