In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score

%matplotlib inline

In [5]:
pd.set_option('display.max_columns', None)

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

In [6]:
artworks.head()

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,Dimensions,CreditLine,AccessionNumber,Classification,Department,DateAcquired,Cataloged,ObjectID,URL,ThumbnailURL,Circumference (cm),Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,"19 1/8 x 66 1/2"" (48.6 x 168.9 cm)",Fractional and promised gift of Jo Carole and ...,885.1996,Architecture,Architecture & Design,1996-04-09,Y,2,http://www.moma.org/collection/works/2,http://www.moma.org/media/W1siZiIsIjU5NDA1Il0s...,,,,48.6,,,168.9,,
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,"16 x 11 3/4"" (40.6 x 29.8 cm)",Gift of the architect in honor of Lily Auchinc...,1.1995,Architecture,Architecture & Design,1995-01-17,Y,3,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjk3Il0sWyJw...,,,,40.6401,,,29.8451,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, pen, color pencil, ink, and gouache ...","13 1/2 x 12 1/2"" (34.3 x 31.8 cm)",Gift of Jo Carole and Ronald S. Lauder,1.1997,Architecture,Architecture & Design,1997-01-15,Y,4,http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjk4Il0sWyJw...,,,,34.3,,,31.8,,
3,"The Manhattan Transcripts Project, New York, N...",Bernard Tschumi,7056,"(French and Swiss, born Switzerland 1944)",(),(1944),(0),(Male),1980,Photographic reproduction with colored synthet...,"20 x 20"" (50.8 x 50.8 cm)",Purchase and partial gift of the architect in ...,2.1995,Architecture,Architecture & Design,1995-01-17,Y,5,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjEyNCJdLFsi...,,,,50.8,,,50.8,,
4,"Villa, project, outside Vienna, Austria, Exter...",Emil Hoppe,7605,"(Austrian, 1876–1957)",(Austrian),(1876),(1957),(Male),1903,"Graphite, color pencil, ink, and gouache on tr...","15 1/8 x 7 1/2"" (38.4 x 19.1 cm)",Gift of Jo Carole and Ronald S. Lauder,2.1997,Architecture,Architecture & Design,1997-01-15,Y,6,http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjEyNiJdLFsi...,,,,38.4,,,19.1,,


In [7]:
artworks['URL'] = artworks['URL'].notnull()


In [9]:
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

In [10]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]


In [11]:
# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

In [12]:
# Drop missing data.
artworks = artworks.dropna()

In [13]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [15]:
artworks.Department.value_counts()

Prints & Illustrated Books    54759
Photography                   23981
Architecture & Design         11690
Drawings                      10781
Painting & Sculpture           3557
Name: Department, dtype: int64

In [16]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [26]:
artworks.Date.value_counts()

1971                                                          1628
1967                                                          1599
1968                                                          1377
1966                                                          1354
1965                                                          1346
1973                                                          1225
1969                                                          1162
1970                                                          1155
2003                                                          1095
1963                                                          1082
1964                                                          1080
1930                                                           987
1962                                                           972
1972                                                           944
2001                                                          

In [24]:
artworks.loc[artworks['Gender'].str.contains("\) \("), 'Gender'] = '\(multiple_persons\)'

In [25]:
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

In [27]:
# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

In [28]:
artworks.Date.value_counts()

1967    2183
1969    2041
1968    2027
1965    2009
1966    1987
1971    1887
1970    1731
1964    1640
1930    1589
1962    1556
1963    1526
1973    1510
2003    1355
1972    1321
1948    1257
1928    1232
1938    1204
1931    1195
2001    1135
1926    1135
1980    1111
1947    1102
1920    1086
2002    1067
1927    1064
1961    1063
1976    1049
1950    1043
1975    1034
1960    1007
        ... 
1883      23
1840      22
1879      20
1886      20
1816      18
1825      18
1878      15
1851      14
1844      12
1845       9
1841       9
1882       9
1837       9
1847       2
1768       2
1832       2
1839       2
1846       2
1786       1
1828       1
1800       1
1501       1
1848       1
1838       1
1808       1
1805       1
1799       1
1809       1
1811       1
1842       1
Name: Date, Length: 197, dtype: int64

In [42]:
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [43]:
X.head()

Unnamed: 0,URL,ThumbnailURL,Height (cm),Width (cm),YearAcquired,Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\),(),(Albanian),(Algerian),(American),(Argentine),(Australian),(Austrian),(Azerbaijani),(Bahamian),(Belgian),(Bolivian),(Bosnian),(Brazilian),(British),(Bulgarian),(Cambodian),(Cameroonian),(Canadian Inuit),(Canadian),(Chilean),(Chinese),(Colombian),(Congolese),(Costa Rican),(Croatian),(Cuban),(Czech),(Czechoslovakian),(Danish),(Dutch),(Ecuadorian),(Egyptian),(Estonian),(Ethiopian),(Finnish),(French),(Georgian),(German),(Ghanaian),(Greek),(Guatemalan),(Guyanese),(Haitian),(Hungarian),(Icelandic),(Indian),(Iranian),(Irish),(Israeli),(Italian),(Ivorian),(Japanese),(Kenyan),(Korean),(Kuwaiti),(Latvian),(Lebanese),(Lithuanian),(Luxembourgish),(Malaysian),(Malian),(Mexican),(Moroccan),(Namibian),(Nationality Unknown),(Nationality unknown),(New Zealander),(Nicaraguan),(Nigerian),(Norwegian),(Pakistani),(Palestinian),(Panamanian),(Paraguayan),(Peruvian),(Polish),(Portuguese),(Puerto Rican),(Romanian),(Russian),(Scottish),(Senegalese),(Serbian),(Singaporean),(Slovak),(Slovenian),(South African),(Spanish),(Sudanese),(Swedish),(Swiss),(Taiwanese),(Tanzanian),(Thai),(Tunisian),(Turkish),(Ugandan),(Ukrainian),(Uruguayan),(Various),(Venezuelan),(Yugoslav),(Zimbabwean),(nationality unknown),\(multiple_nationalities\),1501,1768,1786,1797,1799,1800,1805,1808,1809,1810,1811,1816,1818,1825,1828,1832,1837,1838,1839,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,True,True,48.6,168.9,1996,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,True,True,40.6401,29.8451,1995,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,True,True,34.3,31.8,1997,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,True,True,50.8,50.8,1995,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,True,True,38.4,19.1,1997,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)
mlp.score(X, Y)

0.7208498778252902

Subset of Data
Let's split the data using 10% of the data to build our models for decreased run time. 

In [50]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.1, random_state=42)
print(len(X))
print(len(X_train))

104768
10476




In [51]:
# Establish and fit the model, with default settings and training set.
mlp = MLPClassifier()
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [52]:
# 5-fold Cross Validation
cross_val_score(mlp, X_train, Y_train, cv=5)

array([ 0.53695756,  0.52003817,  0.60763723,  0.59045346,  0.5848065 ])

With all the default parameters, the average score was 57%

In [54]:
# Establish and fit the model, with logistic activation.
mlp_logit = MLPClassifier(activation='logistic')
mlp_logit.fit(X_train, Y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [55]:
# 5-fold Cross Validation
cross_val_score(mlp_logit, X_train, Y_train, cv=5)

array([ 0.63423939,  0.63931298,  0.6548926 ,  0.63579952,  0.62971811])

Changing the activation function to "logistic" from "relu" increased the accuracy score for the model, and was less overfit. The average accuracy with one hidden layer of 100 nodes is 64%.

In [56]:
# Establish and fit the model, with logistic activation with different layer sizes.
mlp_logit = MLPClassifier(activation='logistic', hidden_layer_sizes=[10])
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
cross_val_score(mlp_logit, X_train, Y_train, cv=5)

array([ 0.54697186,  0.55057252,  0.61050119,  0.60047733,  0.61681796])

The average score with one hidden layer of 10 nodes is 59%. So when the number of nodes are decreased, the accuracy also decreased from 64% to 59%.

In [57]:
# Establish and fit the model, with logistic activation with different layer sizes.
mlp_logit = MLPClassifier(activation='logistic', hidden_layer_sizes=[50])
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
cross_val_score(mlp_logit, X_train, Y_train, cv=5)

array([ 0.63710062,  0.62881679,  0.63150358,  0.624821  ,  0.6383182 ])

The average score with one hidden layer of 50 nodes is 63%. Though the number of nodes are decreased to a half of the default value, the accuracy decreased but just a little, from 64% to 63%.

In [59]:
# Establish and fit the model, with logistic activation with different layer sizes.
mlp_logit = MLPClassifier(activation='logistic', hidden_layer_sizes=[200])
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
cross_val_score(mlp_logit, X_train, Y_train, cv=5)

array([ 0.64043872,  0.6240458 ,  0.64677804,  0.64868735,  0.64882943])

When the number of nodes are doubled of the default value, the accuracy remained same i.e 64% which was the score with the default number of nodes.
So we can go ahead with the default value of nodes.

Let's see the scores with change in the alpha regularization parameter.

In [65]:
# Establish and fit the model, with logistic activation
mlp_logit = MLPClassifier(activation='logistic')
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train, Y_train, cv=5)
print(score)
print(score.mean())

[ 0.63948498  0.63979008  0.63054893  0.64439141  0.64500717]
0.63984451115


With default alpha value of 0.0001, the score is 64%

In [67]:
# Establish and fit the model, with logistic activation with different layer sizes.
mlp_logit = MLPClassifier(activation='logistic', alpha=1e-6)
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train, Y_train, cv=5)
print(score)
print(score.mean())

[ 0.62756319  0.62977099  0.64200477  0.63866348  0.63975155]
0.635550797684


When the alpha value is decreased, the score remained same 64%

In [68]:
# Establish and fit the model, with logistic activation with different layer sizes.
mlp_logit = MLPClassifier(activation='logistic', alpha=0.1)
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train, Y_train, cv=5)
print(score)
print(score.mean())

[ 0.5884597   0.59923664  0.6052506   0.58806683  0.60009556]
0.596221864923


The score decreased when the alpha regularization parameter is increased.

We can go ahead with default values of alpha.

Let's try with 2 hidden layers with each of 100 nodes

In [70]:
# More Layers, logistic activation and default alpha
mlp_logit = MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 100))
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train, Y_train, cv=5)
print(score)
print(score.mean())


[ 0.65331426  0.63979008  0.64105012  0.62768496  0.63497372]
0.639362628053


The average accuracy with one hidden layer of 100 nodes was 64%. The score remained same even if we took 2 hidden layers of 100 nodes each.

In [71]:
# one big Layers, logistic activation and default alpha
mlp_logit = MLPClassifier(activation='logistic', hidden_layer_sizes=[1000])
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train, Y_train, cv=5)
print(score)
print(score.mean())


[ 0.61373391  0.64933206  0.6353222   0.64343675  0.59006211]
0.626377405666


When we tried with one hidden layer with 1000 nodes instead of 2 hidden layers of 100 each, the score decreased from 64% to 63%

Let's try changing the number of iterations.

In [73]:

mlp_logit = MLPClassifier(activation='logistic', max_iter=1000)
mlp_logit.fit(X_train, Y_train)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train, Y_train, cv=5)
print(score)
print(score.mean())


[ 0.63185503  0.62833969  0.63770883  0.62863962  0.64070712]
0.633450058662


for the default max iterations of 200, the score was 64%. When the max_iter parameter is changed to a high value, the score decreased but a little bit.

Let's do some feature engineering and run this for a subset of the data to improve runtime

In [119]:
X2 = artworks.drop(['Department', 'DateAcquired','YearAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X2 = pd.get_dummies(X2, sparse=True)
X2 = pd.concat([X2, nationalities, dates], axis=1)

Y = artworks.Department

In [121]:
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y, train_size=0.1, random_state=42)
print(len(X2))
print(len(X_train2))

104768
10476




In [122]:
X_train2['Height_norm'] = (X_train2['Height (cm)'] - X_train2['Height (cm)'].mean() ) / X_train2['Height (cm)'].std()
X_train2['Width_norm'] = (X_train2['Width (cm)'] - X_train2['Width (cm)'].mean() ) / X_train2['Width (cm)'].std()
X_train2 = X_train2.drop(['Height (cm)','Width (cm)'], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [123]:
X_train2.head()

Unnamed: 0,URL,ThumbnailURL,Gender_(),Gender_(Female),Gender_(Male),Gender_(male),Gender_\(multiple_persons\),(),(Albanian),(Algerian),(American),(Argentine),(Australian),(Austrian),(Azerbaijani),(Bahamian),(Belgian),(Bolivian),(Bosnian),(Brazilian),(British),(Bulgarian),(Cambodian),(Cameroonian),(Canadian Inuit),(Canadian),(Chilean),(Chinese),(Colombian),(Congolese),(Costa Rican),(Croatian),(Cuban),(Czech),(Czechoslovakian),(Danish),(Dutch),(Ecuadorian),(Egyptian),(Estonian),(Ethiopian),(Finnish),(French),(Georgian),(German),(Ghanaian),(Greek),(Guatemalan),(Guyanese),(Haitian),(Hungarian),(Icelandic),(Indian),(Iranian),(Irish),(Israeli),(Italian),(Ivorian),(Japanese),(Kenyan),(Korean),(Kuwaiti),(Latvian),(Lebanese),(Lithuanian),(Luxembourgish),(Malaysian),(Malian),(Mexican),(Moroccan),(Namibian),(Nationality Unknown),(Nationality unknown),(New Zealander),(Nicaraguan),(Nigerian),(Norwegian),(Pakistani),(Palestinian),(Panamanian),(Paraguayan),(Peruvian),(Polish),(Portuguese),(Puerto Rican),(Romanian),(Russian),(Scottish),(Senegalese),(Serbian),(Singaporean),(Slovak),(Slovenian),(South African),(Spanish),(Sudanese),(Swedish),(Swiss),(Taiwanese),(Tanzanian),(Thai),(Tunisian),(Turkish),(Ugandan),(Ukrainian),(Uruguayan),(Various),(Venezuelan),(Yugoslav),(Zimbabwean),(nationality unknown),\(multiple_nationalities\),1501,1768,1786,1797,1799,1800,1805,1808,1809,1810,1811,1816,1818,1825,1828,1832,1837,1838,1839,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,Height_norm,Width_norm
78975,True,True,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.473441,0.008575
85426,True,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.383623,-0.23203
53236,False,False,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.242508,-0.2937
115042,True,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-0.301261,-0.290662
111846,False,False,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.585404,-0.193447


In [124]:

mlp_logit = MLPClassifier(activation='logistic')
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())




[ 0.6948021   0.6851145   0.67303103  0.67923628  0.69278548]
0.68499387611




After scaling the height and width column, we see improvement in the score. To avoid the convergence warning, we need to increase the maximum iterations.

In [126]:
mlp_logit = MLPClassifier(activation='logistic' , max_iter=1000)
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())

[ 0.72722938  0.73139313  0.72458234  0.72410501  0.72288581]
0.726039133149


After scaling the height and width column and increasing the max iter parameter, we see improvement in the score. The score improved from 68% to 73% with activation function as "Logistic"

In [127]:
mlp_logit = MLPClassifier(activation='logistic' , max_iter=1000, hidden_layer_sizes=(50))
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())

[ 0.72961373  0.73425573  0.7150358   0.71217184  0.72623029]
0.723461477555


With the default hidden layer size of 100, the score was 73%. When the hidden layer size is 50, the score decreased but very little, from 72.6% to 72.3%

In [128]:
mlp_logit = MLPClassifier(activation='logistic' , max_iter=1000, hidden_layer_sizes=(200))
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())

[ 0.73295184  0.65696565  0.71264916  0.66205251  0.67367415]
0.687658661478


When we doubled the hidden layer size, the score reduced a lot, i.e from 73% to 69%

In [129]:
mlp_logit = MLPClassifier(activation='logistic' , max_iter=1000, hidden_layer_sizes=(100,100))
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())

[ 0.65712923  0.72471374  0.65298329  0.68878282  0.7032967 ]
0.685381157155


In [None]:
The default number of hidden layer size works best.

In [130]:
mlp_logit = MLPClassifier(activation='logistic' , max_iter=1000, alpha = 0.01)
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())

[ 0.71292322  0.71135496  0.67112172  0.70023866  0.70043   ]
0.699213714425


With the default aplha value , the score was 73% which is now reduced to 70% when we increased the alpha value.

In [131]:
mlp_logit = MLPClassifier(activation='logistic' , max_iter=1000, alpha = 1)
mlp_logit.fit(X_train2, Y_train2)
# 5-fold Cross Validation
score = cross_val_score(mlp_logit, X_train2, Y_train2, cv=5)
print(score)
print(score.mean())

[ 0.57033858  0.56917939  0.56372315  0.5699284   0.56521739]
0.56767738217


With further increase in alpha value, the score reduced to 57%.